# Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


# Data Preprocessing

#checking data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

#Dummy encoding on MSZoning

In [5]:
df1 = pd.get_dummies(df['MSZoning'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('MSZoning', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,C (all),FV,RH,RL,RM
0,1,60,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,2,2008,WD,Normal,208500,0,0,0,1,0
1,2,20,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,5,2007,WD,Normal,181500,0,0,0,1,0
2,3,60,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,9,2008,WD,Normal,223500,0,0,0,1,0
3,4,70,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,2,2006,WD,Abnorml,140000,0,0,0,1,0
4,5,60,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,12,2008,WD,Normal,250000,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,8,2007,WD,Normal,175000,0,0,0,1,0
1456,1457,20,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,2,2010,WD,Normal,210000,0,0,0,1,0
1457,1458,70,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,5,2010,WD,Normal,266500,0,0,0,1,0
1458,1459,20,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,4,2010,WD,Normal,142125,0,0,0,1,0


In [6]:
df['Street'].unique()

array(['Pave', 'Grvl'], dtype=object)

#Label encoding on Street

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = le.fit_transform(df['Street'])
df.drop("Street", axis=1, inplace=True)
df["Street"] = label
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,YrSold,SaleType,SaleCondition,SalePrice,C (all),FV,RH,RL,RM,Street
0,1,60,65.0,8450,,Reg,Lvl,AllPub,Inside,Gtl,...,2008,WD,Normal,208500,0,0,0,1,0,1
1,2,20,80.0,9600,,Reg,Lvl,AllPub,FR2,Gtl,...,2007,WD,Normal,181500,0,0,0,1,0,1
2,3,60,68.0,11250,,IR1,Lvl,AllPub,Inside,Gtl,...,2008,WD,Normal,223500,0,0,0,1,0,1
3,4,70,60.0,9550,,IR1,Lvl,AllPub,Corner,Gtl,...,2006,WD,Abnorml,140000,0,0,0,1,0,1
4,5,60,84.0,14260,,IR1,Lvl,AllPub,FR2,Gtl,...,2008,WD,Normal,250000,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Reg,Lvl,AllPub,Inside,Gtl,...,2007,WD,Normal,175000,0,0,0,1,0,1
1456,1457,20,85.0,13175,,Reg,Lvl,AllPub,Inside,Gtl,...,2010,WD,Normal,210000,0,0,0,1,0,1
1457,1458,70,66.0,9042,,Reg,Lvl,AllPub,Inside,Gtl,...,2010,WD,Normal,266500,0,0,0,1,0,1
1458,1459,20,68.0,9717,,Reg,Lvl,AllPub,Inside,Gtl,...,2010,WD,Normal,142125,0,0,0,1,0,1


In [8]:
df['LotShape'].unique()

array(['Reg', 'IR1', 'IR2', 'IR3'], dtype=object)

#Dummy encoding on LotShape

In [9]:
df1 = pd.get_dummies(df['LotShape'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('LotShape', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,C (all),FV,RH,RL,RM,Street,IR1,IR2,IR3,Reg
0,1,60,65.0,8450,,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,1,0,1,0,0,0,1
1,2,20,80.0,9600,,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,1,0,1,0,0,0,1
2,3,60,68.0,11250,,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,1,0,1,1,0,0,0
3,4,70,60.0,9550,,Lvl,AllPub,Corner,Gtl,Crawfor,...,0,0,0,1,0,1,1,0,0,0
4,5,60,84.0,14260,,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Lvl,AllPub,Inside,Gtl,Gilbert,...,0,0,0,1,0,1,0,0,0,1
1456,1457,20,85.0,13175,,Lvl,AllPub,Inside,Gtl,NWAmes,...,0,0,0,1,0,1,0,0,0,1
1457,1458,70,66.0,9042,,Lvl,AllPub,Inside,Gtl,Crawfor,...,0,0,0,1,0,1,0,0,0,1
1458,1459,20,68.0,9717,,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,1,0,1,0,0,0,1


In [10]:
df["LandContour"].unique()

array(['Lvl', 'Bnk', 'Low', 'HLS'], dtype=object)

#Dummy encoding on LandContour

In [11]:
df1 = pd.get_dummies(df['LandContour'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('LandContour', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,RM,Street,IR1,IR2,IR3,Reg,Bnk,HLS,Low,Lvl
0,1,60,65.0,8450,,AllPub,Inside,Gtl,CollgCr,Norm,...,0,1,0,0,0,1,0,0,0,1
1,2,20,80.0,9600,,AllPub,FR2,Gtl,Veenker,Feedr,...,0,1,0,0,0,1,0,0,0,1
2,3,60,68.0,11250,,AllPub,Inside,Gtl,CollgCr,Norm,...,0,1,1,0,0,0,0,0,0,1
3,4,70,60.0,9550,,AllPub,Corner,Gtl,Crawfor,Norm,...,0,1,1,0,0,0,0,0,0,1
4,5,60,84.0,14260,,AllPub,FR2,Gtl,NoRidge,Norm,...,0,1,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,AllPub,Inside,Gtl,Gilbert,Norm,...,0,1,0,0,0,1,0,0,0,1
1456,1457,20,85.0,13175,,AllPub,Inside,Gtl,NWAmes,Norm,...,0,1,0,0,0,1,0,0,0,1
1457,1458,70,66.0,9042,,AllPub,Inside,Gtl,Crawfor,Norm,...,0,1,0,0,0,1,0,0,0,1
1458,1459,20,68.0,9717,,AllPub,Inside,Gtl,NAmes,Norm,...,0,1,0,0,0,1,0,0,0,1


In [12]:
df['Utilities'].unique()

array(['AllPub', 'NoSeWa'], dtype=object)

#Label encoding on Utilities

In [13]:
label = le.fit_transform(df['Utilities'])
df.drop("Utilities", axis=1, inplace=True)
df["Utilities"] = label
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Street,IR1,IR2,IR3,Reg,Bnk,HLS,Low,Lvl,Utilities
0,1,60,65.0,8450,,Inside,Gtl,CollgCr,Norm,Norm,...,1,0,0,0,1,0,0,0,1,0
1,2,20,80.0,9600,,FR2,Gtl,Veenker,Feedr,Norm,...,1,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,,Inside,Gtl,CollgCr,Norm,Norm,...,1,1,0,0,0,0,0,0,1,0
3,4,70,60.0,9550,,Corner,Gtl,Crawfor,Norm,Norm,...,1,1,0,0,0,0,0,0,1,0
4,5,60,84.0,14260,,FR2,Gtl,NoRidge,Norm,Norm,...,1,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Inside,Gtl,Gilbert,Norm,Norm,...,1,0,0,0,1,0,0,0,1,0
1456,1457,20,85.0,13175,,Inside,Gtl,NWAmes,Norm,Norm,...,1,0,0,0,1,0,0,0,1,0
1457,1458,70,66.0,9042,,Inside,Gtl,Crawfor,Norm,Norm,...,1,0,0,0,1,0,0,0,1,0
1458,1459,20,68.0,9717,,Inside,Gtl,NAmes,Norm,Norm,...,1,0,0,0,1,0,0,0,1,0


In [14]:
df['LotConfig'].unique()

array(['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'], dtype=object)

#Dummy encoding on LotConfig

In [15]:
df1 = pd.get_dummies(df['LotConfig'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('LotConfig', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,LandSlope,Neighborhood,Condition1,Condition2,BldgType,...,Bnk,HLS,Low,Lvl,Utilities,Corner,CulDSac,FR2,FR3,Inside
0,1,60,65.0,8450,,Gtl,CollgCr,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1
1,2,20,80.0,9600,,Gtl,Veenker,Feedr,Norm,1Fam,...,0,0,0,1,0,0,0,1,0,0
2,3,60,68.0,11250,,Gtl,CollgCr,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1
3,4,70,60.0,9550,,Gtl,Crawfor,Norm,Norm,1Fam,...,0,0,0,1,0,1,0,0,0,0
4,5,60,84.0,14260,,Gtl,NoRidge,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Gtl,Gilbert,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1
1456,1457,20,85.0,13175,,Gtl,NWAmes,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1
1457,1458,70,66.0,9042,,Gtl,Crawfor,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1
1458,1459,20,68.0,9717,,Gtl,NAmes,Norm,Norm,1Fam,...,0,0,0,1,0,0,0,0,0,1


In [16]:
df['LandSlope'].unique()

array(['Gtl', 'Mod', 'Sev'], dtype=object)

#Dummy encoding on LandSlope

In [17]:
df1 = pd.get_dummies(df['LandSlope'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('LandSlope', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,...,Lvl,Utilities,Corner,CulDSac,FR2,FR3,Inside,Gtl,Mod,Sev
0,1,60,65.0,8450,,CollgCr,Norm,Norm,1Fam,2Story,...,1,0,0,0,0,0,1,1,0,0
1,2,20,80.0,9600,,Veenker,Feedr,Norm,1Fam,1Story,...,1,0,0,0,1,0,0,1,0,0
2,3,60,68.0,11250,,CollgCr,Norm,Norm,1Fam,2Story,...,1,0,0,0,0,0,1,1,0,0
3,4,70,60.0,9550,,Crawfor,Norm,Norm,1Fam,2Story,...,1,0,1,0,0,0,0,1,0,0
4,5,60,84.0,14260,,NoRidge,Norm,Norm,1Fam,2Story,...,1,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Gilbert,Norm,Norm,1Fam,2Story,...,1,0,0,0,0,0,1,1,0,0
1456,1457,20,85.0,13175,,NWAmes,Norm,Norm,1Fam,1Story,...,1,0,0,0,0,0,1,1,0,0
1457,1458,70,66.0,9042,,Crawfor,Norm,Norm,1Fam,2Story,...,1,0,0,0,0,0,1,1,0,0
1458,1459,20,68.0,9717,,NAmes,Norm,Norm,1Fam,1Story,...,1,0,0,0,0,0,1,1,0,0


In [18]:
df['Neighborhood'].unique()

array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
       'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
       'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
       'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
       'Blueste'], dtype=object)

#Binary Encoding

In [19]:
import category_encoders as ce
encoder = ce.BinaryEncoder(df['Neighborhood'])
df1 = encoder.fit_transform(df['Neighborhood'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('Neighborhood', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Alley,Condition1,Condition2,BldgType,HouseStyle,OverallQual,...,FR3,Inside,Gtl,Mod,Sev,Neighborhood_0,Neighborhood_1,Neighborhood_2,Neighborhood_3,Neighborhood_4
0,1,60,65.0,8450,,Norm,Norm,1Fam,2Story,7,...,0,1,1,0,0,0,0,0,0,1
1,2,20,80.0,9600,,Feedr,Norm,1Fam,1Story,6,...,0,0,1,0,0,0,0,0,1,0
2,3,60,68.0,11250,,Norm,Norm,1Fam,2Story,7,...,0,1,1,0,0,0,0,0,0,1
3,4,70,60.0,9550,,Norm,Norm,1Fam,2Story,7,...,0,0,1,0,0,0,0,0,1,1
4,5,60,84.0,14260,,Norm,Norm,1Fam,2Story,8,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,,Norm,Norm,1Fam,2Story,6,...,0,1,1,0,0,1,0,0,1,0
1456,1457,20,85.0,13175,,Norm,Norm,1Fam,1Story,6,...,0,1,1,0,0,0,0,1,1,1
1457,1458,70,66.0,9042,,Norm,Norm,1Fam,2Story,7,...,0,1,1,0,0,0,0,0,1,1
1458,1459,20,68.0,9717,,Norm,Norm,1Fam,1Story,5,...,0,1,1,0,0,0,1,1,0,0


In [21]:
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,...,FR3,Inside,Gtl,Mod,Sev,Neighborhood_0,Neighborhood_1,Neighborhood_2,Neighborhood_3,Neighborhood_4
0,1,60,65.0,8450,Norm,Norm,1Fam,2Story,7,5,...,0,1,1,0,0,0,0,0,0,1
1,2,20,80.0,9600,Feedr,Norm,1Fam,1Story,6,8,...,0,0,1,0,0,0,0,0,1,0
2,3,60,68.0,11250,Norm,Norm,1Fam,2Story,7,5,...,0,1,1,0,0,0,0,0,0,1
3,4,70,60.0,9550,Norm,Norm,1Fam,2Story,7,5,...,0,0,1,0,0,0,0,0,1,1
4,5,60,84.0,14260,Norm,Norm,1Fam,2Story,8,5,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,Norm,Norm,1Fam,2Story,6,5,...,0,1,1,0,0,1,0,0,1,0
1456,1457,20,85.0,13175,Norm,Norm,1Fam,1Story,6,6,...,0,1,1,0,0,0,0,1,1,1
1457,1458,70,66.0,9042,Norm,Norm,1Fam,2Story,7,9,...,0,1,1,0,0,0,0,0,1,1
1458,1459,20,68.0,9717,Norm,Norm,1Fam,1Story,5,6,...,0,1,1,0,0,0,1,1,0,0


In [22]:
df['Condition1'].unique()

array(['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA',
       'RRNe'], dtype=object)

#Binary encoding on Condition1

In [23]:
encoder = ce.BinaryEncoder(df['Condition1'])
df1 = encoder.fit_transform(df['Condition1'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('Condition1', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,...,Sev,Neighborhood_0,Neighborhood_1,Neighborhood_2,Neighborhood_3,Neighborhood_4,Condition1_0,Condition1_1,Condition1_2,Condition1_3
0,1,60,65.0,8450,Norm,1Fam,2Story,7,5,2003,...,0,0,0,0,0,1,0,0,0,1
1,2,20,80.0,9600,Norm,1Fam,1Story,6,8,1976,...,0,0,0,0,1,0,0,0,1,0
2,3,60,68.0,11250,Norm,1Fam,2Story,7,5,2001,...,0,0,0,0,0,1,0,0,0,1
3,4,70,60.0,9550,Norm,1Fam,2Story,7,5,1915,...,0,0,0,0,1,1,0,0,0,1
4,5,60,84.0,14260,Norm,1Fam,2Story,8,5,2000,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,Norm,1Fam,2Story,6,5,1999,...,0,1,0,0,1,0,0,0,0,1
1456,1457,20,85.0,13175,Norm,1Fam,1Story,6,6,1978,...,0,0,0,1,1,1,0,0,0,1
1457,1458,70,66.0,9042,Norm,1Fam,2Story,7,9,1941,...,0,0,0,0,1,1,0,0,0,1
1458,1459,20,68.0,9717,Norm,1Fam,1Story,5,6,1950,...,0,0,1,1,0,0,0,0,0,1


In [24]:
df['Condition2'].unique()

array(['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'],
      dtype=object)

#Binary encoding on Condition2

In [25]:
encoder = ce.BinaryEncoder(df['Condition2'])
df1 = encoder.fit_transform(df['Condition2'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('Condition2', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,Neighborhood_3,Neighborhood_4,Condition1_0,Condition1_1,Condition1_2,Condition1_3,Condition2_0,Condition2_1,Condition2_2,Condition2_3
0,1,60,65.0,8450,1Fam,2Story,7,5,2003,2003,...,0,1,0,0,0,1,0,0,0,1
1,2,20,80.0,9600,1Fam,1Story,6,8,1976,1976,...,1,0,0,0,1,0,0,0,0,1
2,3,60,68.0,11250,1Fam,2Story,7,5,2001,2002,...,0,1,0,0,0,1,0,0,0,1
3,4,70,60.0,9550,1Fam,2Story,7,5,1915,1970,...,1,1,0,0,0,1,0,0,0,1
4,5,60,84.0,14260,1Fam,2Story,8,5,2000,2000,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,1Fam,2Story,6,5,1999,2000,...,1,0,0,0,0,1,0,0,0,1
1456,1457,20,85.0,13175,1Fam,1Story,6,6,1978,1988,...,1,1,0,0,0,1,0,0,0,1
1457,1458,70,66.0,9042,1Fam,2Story,7,9,1941,2006,...,1,1,0,0,0,1,0,0,0,1
1458,1459,20,68.0,9717,1Fam,1Story,5,6,1950,1996,...,0,0,0,0,0,1,0,0,0,1


In [26]:
df['BldgType'].unique()

array(['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'], dtype=object)

#Dummy encoding

In [27]:
df1 = pd.get_dummies(df['BldgType'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('BldgType', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,...,Condition1_3,Condition2_0,Condition2_1,Condition2_2,Condition2_3,1Fam,2fmCon,Duplex,Twnhs,TwnhsE
0,1,60,65.0,8450,2Story,7,5,2003,2003,Gable,...,1,0,0,0,1,1,0,0,0,0
1,2,20,80.0,9600,1Story,6,8,1976,1976,Gable,...,0,0,0,0,1,1,0,0,0,0
2,3,60,68.0,11250,2Story,7,5,2001,2002,Gable,...,1,0,0,0,1,1,0,0,0,0
3,4,70,60.0,9550,2Story,7,5,1915,1970,Gable,...,1,0,0,0,1,1,0,0,0,0
4,5,60,84.0,14260,2Story,8,5,2000,2000,Gable,...,1,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,2Story,6,5,1999,2000,Gable,...,1,0,0,0,1,1,0,0,0,0
1456,1457,20,85.0,13175,1Story,6,6,1978,1988,Gable,...,1,0,0,0,1,1,0,0,0,0
1457,1458,70,66.0,9042,2Story,7,9,1941,2006,Gable,...,1,0,0,0,1,1,0,0,0,0
1458,1459,20,68.0,9717,1Story,5,6,1950,1996,Hip,...,1,0,0,0,1,1,0,0,0,0


#Checking null values

In [28]:
df['HouseStyle'].unique()

array(['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf',
       '2.5Fin'], dtype=object)

#binary encoding

In [29]:
encoder = ce.BinaryEncoder(df['HouseStyle'])
df1 = encoder.fit_transform(df['HouseStyle'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('HouseStyle', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,...,Condition2_3,1Fam,2fmCon,Duplex,Twnhs,TwnhsE,HouseStyle_0,HouseStyle_1,HouseStyle_2,HouseStyle_3
0,1,60,65.0,8450,7,5,2003,2003,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,Gable,CompShg,...,1,1,0,0,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
4,5,60,84.0,14260,8,5,2000,2000,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
1456,1457,20,85.0,13175,6,6,1978,1988,Gable,CompShg,...,1,1,0,0,0,0,0,0,1,0
1457,1458,70,66.0,9042,7,9,1941,2006,Gable,CompShg,...,1,1,0,0,0,0,0,0,0,1
1458,1459,20,68.0,9717,5,6,1950,1996,Hip,CompShg,...,1,1,0,0,0,0,0,0,1,0


In [30]:
df['RoofStyle'].unique()

array(['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'], dtype=object)

In [31]:
encoder = ce.BinaryEncoder(df['RoofStyle'])
df1 = encoder.fit_transform(df['RoofStyle'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('RoofStyle', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofMatl,Exterior1st,...,Duplex,Twnhs,TwnhsE,HouseStyle_0,HouseStyle_1,HouseStyle_2,HouseStyle_3,RoofStyle_0,RoofStyle_1,RoofStyle_2
0,1,60,65.0,8450,7,5,2003,2003,CompShg,VinylSd,...,0,0,0,0,0,0,1,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,CompShg,MetalSd,...,0,0,0,0,0,1,0,0,0,1
2,3,60,68.0,11250,7,5,2001,2002,CompShg,VinylSd,...,0,0,0,0,0,0,1,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,CompShg,Wd Sdng,...,0,0,0,0,0,0,1,0,0,1
4,5,60,84.0,14260,8,5,2000,2000,CompShg,VinylSd,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,CompShg,VinylSd,...,0,0,0,0,0,0,1,0,0,1
1456,1457,20,85.0,13175,6,6,1978,1988,CompShg,Plywood,...,0,0,0,0,0,1,0,0,0,1
1457,1458,70,66.0,9042,7,9,1941,2006,CompShg,CemntBd,...,0,0,0,0,0,0,1,0,0,1
1458,1459,20,68.0,9717,5,6,1950,1996,CompShg,MetalSd,...,0,0,0,0,0,1,0,0,1,0


In [32]:
df['RoofMatl'].unique()

array(['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv',
       'Roll', 'ClyTile'], dtype=object)

In [33]:
encoder = ce.BinaryEncoder(df['RoofMatl'])
df1 = encoder.fit_transform(df['RoofMatl'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('RoofMatl', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,...,HouseStyle_1,HouseStyle_2,HouseStyle_3,RoofStyle_0,RoofStyle_1,RoofStyle_2,RoofMatl_0,RoofMatl_1,RoofMatl_2,RoofMatl_3
0,1,60,65.0,8450,7,5,2003,2003,VinylSd,VinylSd,...,0,0,1,0,0,1,0,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,MetalSd,MetalSd,...,0,1,0,0,0,1,0,0,0,1
2,3,60,68.0,11250,7,5,2001,2002,VinylSd,VinylSd,...,0,0,1,0,0,1,0,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,Wd Sdng,Wd Shng,...,0,0,1,0,0,1,0,0,0,1
4,5,60,84.0,14260,8,5,2000,2000,VinylSd,VinylSd,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,VinylSd,VinylSd,...,0,0,1,0,0,1,0,0,0,1
1456,1457,20,85.0,13175,6,6,1978,1988,Plywood,Plywood,...,0,1,0,0,0,1,0,0,0,1
1457,1458,70,66.0,9042,7,9,1941,2006,CemntBd,CmentBd,...,0,0,1,0,0,1,0,0,0,1
1458,1459,20,68.0,9717,5,6,1950,1996,MetalSd,MetalSd,...,0,1,0,0,1,0,0,0,0,1


In [34]:
df['Exterior1st'].unique()

array(['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
       'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'BrkComm', 'AsphShn',
       'Stone', 'ImStucc', 'CBlock'], dtype=object)

In [35]:
encoder = ce.BinaryEncoder(df['Exterior1st'])
df1 = encoder.fit_transform(df['Exterior1st'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('Exterior1st', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior2nd,MasVnrType,...,RoofStyle_1,RoofStyle_2,RoofMatl_0,RoofMatl_1,RoofMatl_2,RoofMatl_3,Exterior1st_0,Exterior1st_1,Exterior1st_2,Exterior1st_3
0,1,60,65.0,8450,7,5,2003,2003,VinylSd,BrkFace,...,0,1,0,0,0,1,0,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,MetalSd,,...,0,1,0,0,0,1,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,VinylSd,BrkFace,...,0,1,0,0,0,1,0,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,Wd Shng,,...,0,1,0,0,0,1,0,0,1,1
4,5,60,84.0,14260,8,5,2000,2000,VinylSd,BrkFace,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,VinylSd,,...,0,1,0,0,0,1,0,0,0,1
1456,1457,20,85.0,13175,6,6,1978,1988,Plywood,Stone,...,0,1,0,0,0,1,1,0,0,0
1457,1458,70,66.0,9042,7,9,1941,2006,CmentBd,,...,0,1,0,0,0,1,0,1,1,1
1458,1459,20,68.0,9717,5,6,1950,1996,MetalSd,,...,1,0,0,0,0,1,0,0,1,0


In [36]:
df['Exterior2nd'].unique()

array(['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng',
       'CmentBd', 'BrkFace', 'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc',
       'AsphShn', 'Stone', 'Other', 'CBlock'], dtype=object)

In [37]:
encoder = ce.BinaryEncoder(df['Exterior2nd'])
df1 = encoder.fit_transform(df['Exterior2nd'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('Exterior2nd', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrType,MasVnrArea,...,RoofMatl_3,Exterior1st_0,Exterior1st_1,Exterior1st_2,Exterior1st_3,Exterior2nd_0,Exterior2nd_1,Exterior2nd_2,Exterior2nd_3,Exterior2nd_4
0,1,60,65.0,8450,7,5,2003,2003,BrkFace,196.0,...,1,0,0,0,1,0,0,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,,0.0,...,1,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,BrkFace,162.0,...,1,0,0,0,1,0,0,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,,0.0,...,1,0,0,1,1,0,0,0,1,1
4,5,60,84.0,14260,8,5,2000,2000,BrkFace,350.0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,,0.0,...,1,0,0,0,1,0,0,0,0,1
1456,1457,20,85.0,13175,6,6,1978,1988,Stone,119.0,...,1,1,0,0,0,0,0,1,0,1
1457,1458,70,66.0,9042,7,9,1941,2006,,0.0,...,1,0,1,1,1,0,0,1,1,1
1458,1459,20,68.0,9717,5,6,1950,1996,,0.0,...,1,0,0,1,0,0,0,0,1,0


In [46]:
df['MasVnrType'].unique()

array(['BrkFace', 'None', 'Stone', 'BrkCmn'], dtype=object)

In [47]:
encoder = ce.BinaryEncoder(df['MasVnrType'])
df1 = encoder.fit_transform(df['MasVnrType'])
df = pd.concat([df, df1], axis=1).reindex(df.index)
df.drop('MasVnrType', axis=1, inplace=True)
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,Exterior1st_2,Exterior1st_3,Exterior2nd_0,Exterior2nd_1,Exterior2nd_2,Exterior2nd_3,Exterior2nd_4,MasVnrType_0,MasVnrType_1,MasVnrType_2
0,1,60,65.0,8450,7,5,2003,2003,196.0,Gd,...,0,1,0,0,0,0,1,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,0.0,TA,...,1,0,0,0,0,1,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,Gd,...,0,1,0,0,0,0,1,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,0.0,TA,...,1,1,0,0,0,1,1,0,1,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,Gd,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,TA,...,0,1,0,0,0,0,1,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,TA,...,0,0,0,0,1,0,1,0,1,1
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,Ex,...,1,1,0,0,1,1,1,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,TA,...,1,0,0,0,0,1,0,0,1,0


In [40]:
df.isnull().sum()

Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
                ... 
Exterior2nd_0      0
Exterior2nd_1      0
Exterior2nd_2      0
Exterior2nd_3      0
Exterior2nd_4      0
Length: 125, dtype: int64

#Fill null values with mean and drop those columns which had more than 75% null values.

#filling with most common class

In [48]:
df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,Exterior1st_2,Exterior1st_3,Exterior2nd_0,Exterior2nd_1,Exterior2nd_2,Exterior2nd_3,Exterior2nd_4,MasVnrType_0,MasVnrType_1,MasVnrType_2
0,1,60,65.0,8450,7,5,2003,2003,196.0,Gd,...,0,1,0,0,0,0,1,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,0.0,TA,...,1,0,0,0,0,1,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,Gd,...,0,1,0,0,0,0,1,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,0.0,TA,...,1,1,0,0,0,1,1,0,1,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,Gd,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,TA,...,0,1,0,0,0,0,1,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,TA,...,0,0,0,0,1,0,1,0,1,1
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,Ex,...,1,1,0,0,1,1,1,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,TA,...,1,0,0,0,0,1,0,0,1,0


In [49]:
df['LotFrontage'].fillna(int(df['LotFrontage'].mean()), inplace=True)
df.isnull().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
                ..
Exterior2nd_3    0
Exterior2nd_4    0
MasVnrType_0     0
MasVnrType_1     0
MasVnrType_2     0
Length: 127, dtype: int64

In [20]:
df.drop(['Alley'], axis=1, inplace = True)

In [42]:
df['MasVnrArea'].fillna(int(df['MasVnrArea'].mean()), inplace=True)
df.isnull().sum()

TypeError: can only concatenate str (not "int") to str

In [12]:
corr_matrix = df.corr()
corr_matrix['SalePrice'].sort_values(ascending=False)

  corr_matrix = df.corr()


SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr