# Binning

## Import Libraries

In [11]:
import pandas as pd

## Import Data

In [12]:
# Load the training data from a CSV file given by a relative path into a DataFrame.
train = pd.read_csv('train_regression.csv')

In [13]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Partition Data into X and y

In [14]:
# Predictors: every column except the target 'SalePrice' and the 'Id' column.
X = train.drop(columns=['SalePrice', 'Id'])
# Target: the 'SalePrice' column on its own.
y = train.loc[:, 'SalePrice']

## Get Numerical and Categorical Column Names

In [15]:
# Names of the predictor columns stored as int64 or float64, in frame column order.
numerical_features = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)
numerical_features

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [16]:
# Names of every predictor column that is NOT int64/float64; these are treated as categorical.
categorical_features = list(
    X.select_dtypes(exclude=['int64', 'float64']).columns
)
categorical_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

## Treat Missing Values

In [17]:
from sklearn.impute import SimpleImputer

# Numerical columns: learn each column's median and use it to fill missing entries.
# The imputer's output is written back over the same columns of X; in the previews
# further down these columns display as floats (e.g. 8450.0).
num_impute = SimpleImputer(strategy='median')
X[numerical_features] = num_impute.fit_transform(X[numerical_features])

# Categorical columns: fill missing entries with the constant label 'missing'.
cat_impute = SimpleImputer(strategy='constant', fill_value='missing')
X[categorical_features] = cat_impute.fit_transform(X[categorical_features])

# Binning Strategy

## 1/ Check Data

In [18]:
# Preview the first five rows of four numerical columns (positions 2-5 of numerical_features).
X.loc[:, numerical_features[2:6]].head()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt
0,8450.0,7.0,5.0,2003.0
1,9600.0,6.0,8.0,1976.0
2,11250.0,7.0,5.0,2001.0
3,9550.0,7.0,5.0,1915.0
4,14260.0,8.0,5.0,2000.0


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for positions 2-5 of numerical_features.
X.loc[:, numerical_features[2:6]].describe()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt
count,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,6.099315,5.575342,1971.267808
std,9981.264932,1.382997,1.112799,30.202904
min,1300.0,1.0,1.0,1872.0
25%,7553.5,5.0,5.0,1954.0
50%,9478.5,6.0,5.0,1973.0
75%,11601.5,7.0,6.0,2000.0
max,215245.0,10.0,9.0,2010.0


## 2/ Binning (Equal-Width with `pd.cut`, Quantile with `pd.qcut`)

In [24]:
# Equal-width binning: split the LotArea value range into 10 intervals of equal width.
# Interval labels are stored as plain strings rather than as a categorical.
X['LotArea_bins'] = (
    pd.cut(x=X['LotArea'], bins=10)
    .astype(str)
)

In [25]:
# Quantile binning: split LotArea into 10 quantile-based intervals (deciles).
# duplicates='drop' discards repeated bin edges instead of raising an error.
# Interval labels are stored as plain strings rather than as a categorical.
X['LotArea_qcut'] = (
    pd.qcut(x=X['LotArea'], q=10, duplicates='drop')
    .astype(str)
)

## 3/ Check the Binned Columns

In [26]:
# Show the raw LotArea value next to its equal-width and quantile bin labels.
X.loc[:, ['LotArea', 'LotArea_bins', 'LotArea_qcut']].head()

Unnamed: 0,LotArea,LotArea_bins,LotArea_qcut
0,8450.0,"(1086.055, 22694.5]","(8063.7, 8793.4]"
1,9600.0,"(1086.055, 22694.5]","(9478.5, 10198.2]"
2,11250.0,"(1086.055, 22694.5]","(11066.5, 12205.8]"
3,9550.0,"(1086.055, 22694.5]","(9478.5, 10198.2]"
4,14260.0,"(1086.055, 22694.5]","(12205.8, 14381.7]"


In [27]:
# Count how many rows fall into each equal-width bin label.
X.loc[:, 'LotArea_bins'].value_counts()

(1086.055, 22694.5]     1423
(22694.5, 44089.0]        24
(44089.0, 65483.5]         8
(151061.5, 172456.0]       2
(193850.5, 215245.0]       1
(108272.5, 129667.0]       1
(65483.5, 86878.0]         1
Name: LotArea_bins, dtype: int64

In [28]:
# Count how many rows fall into each quantile bin label.
X.loc[:, 'LotArea_qcut'].value_counts()

(1299.999, 5000.0]     147
(10198.2, 11066.5]     146
(11066.5, 12205.8]     146
(8793.4, 9478.5]       146
(8063.7, 8793.4]       146
(14381.7, 215245.0]    146
(7078.4, 8063.7]       146
(9478.5, 10198.2]      146
(12205.8, 14381.7]     146
(5000.0, 7078.4]       145
Name: LotArea_qcut, dtype: int64