In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

pd.pandas.set_option("display.max_columns", None)

In [2]:
dataset_train = pd.read_csv("train.csv")
print(dataset_train.head())


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
0         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
1         Lvl    AllPub       FR2       Gtl      Veenker      Feedr   
2         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
3         Lvl    AllPub    Corner       Gtl      Crawfor       Norm   
4         Lvl    AllPub       FR2       Gtl      NoRidge       Norm   

  Condition2 BldgType HouseStyle  OverallQual  OverallCond  YearBuilt  \
0       Norm     1Fam     2Story            7          

In [3]:
dataset_test = pd.read_csv("test.csv")
dataset_test.shape

(1459, 81)

In [4]:
# storing the categorical feature containing nan values in dataset_train

categorical_feature_for_train = [feature for feature in dataset_train.columns if dataset_train[feature].isnull().sum()>1 and dataset_train[feature].dtypes == "O"]
len(categorical_feature_for_train)
categorical_feature_for_train


['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [5]:
dataset_train[categorical_feature_for_train].isnull().sum()

Alley           1369
MasVnrType         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
# storing the categorical features containing nan values in dataset_test

categorical_feature_for_test = [feature for feature in dataset_test.columns if dataset_test[feature].isnull().sum()>1 and dataset_test[feature].dtypes == "O"]
len(categorical_feature_for_test)
categorical_feature_for_test

['MSZoning',
 'Alley',
 'Utilities',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [7]:
dataset_test[categorical_feature_for_test].isnull().sum()

MSZoning           4
Alley           1352
Utilities          2
MasVnrType        16
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
Functional         2
FireplaceQu      730
GarageType        76
GarageFinish      78
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
dtype: int64

In [8]:
dataset_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal,208500
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal,181500
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal,223500
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal,140000
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal,250000


In [9]:
for feature in categorical_feature_for_train:
    data = dataset_train.copy()
    
    print("In {} missing values are ----> {} %".format(feature, np.round(dataset_train[feature].isnull().mean()*100, 4)))

In Alley missing values are ----> 93.7671 %
In MasVnrType missing values are ----> 0.5479 %
In BsmtQual missing values are ----> 2.5342 %
In BsmtCond missing values are ----> 2.5342 %
In BsmtExposure missing values are ----> 2.6027 %
In BsmtFinType1 missing values are ----> 2.5342 %
In BsmtFinType2 missing values are ----> 2.6027 %
In FireplaceQu missing values are ----> 47.2603 %
In GarageType missing values are ----> 5.5479 %
In GarageFinish missing values are ----> 5.5479 %
In GarageQual missing values are ----> 5.5479 %
In GarageCond missing values are ----> 5.5479 %
In PoolQC missing values are ----> 99.5205 %
In Fence missing values are ----> 80.7534 %
In MiscFeature missing values are ----> 96.3014 %


In [10]:
for feature in categorical_feature_for_test:    
    print("In {} missing values are ----> {} %".format(feature, np.round(dataset_test[feature].isnull().mean(), 4)))

In MSZoning missing values are ----> 0.0027 %
In Alley missing values are ----> 0.9267 %
In Utilities missing values are ----> 0.0014 %
In MasVnrType missing values are ----> 0.011 %
In BsmtQual missing values are ----> 0.0302 %
In BsmtCond missing values are ----> 0.0308 %
In BsmtExposure missing values are ----> 0.0302 %
In BsmtFinType1 missing values are ----> 0.0288 %
In BsmtFinType2 missing values are ----> 0.0288 %
In Functional missing values are ----> 0.0014 %
In FireplaceQu missing values are ----> 0.5003 %
In GarageType missing values are ----> 0.0521 %
In GarageFinish missing values are ----> 0.0535 %
In GarageQual missing values are ----> 0.0535 %
In GarageCond missing values are ----> 0.0535 %
In PoolQC missing values are ----> 0.9979 %
In Fence missing values are ----> 0.8012 %
In MiscFeature missing values are ----> 0.965 %


In [11]:
for feature in categorical_feature_for_train:
    # data = dataset_train.copy()
    dataset_train[feature] = dataset_train[feature].fillna("Missing")
    

dataset_train[categorical_feature_for_train].isnull().sum() # now we will get 0 as output for each categorical feature since we have removed all the missing values 


Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [12]:
for feature in categorical_feature_for_test:
    dataset_test[feature] = dataset_test[feature].fillna("Missing")
    

dataset_test[categorical_feature_for_test].isnull().sum() # now we will get 0 as output for each categorical feature since we have removed all the missing values 


MSZoning        0
Alley           0
Utilities       0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Functional      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [13]:
dataset_train[categorical_feature_for_train].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,Missing,BrkFace,Gd,TA,No,GLQ,Unf,Missing,Attchd,RFn,TA,TA,Missing,Missing,Missing
1,Missing,,Gd,TA,Gd,ALQ,Unf,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
2,Missing,BrkFace,Gd,TA,Mn,GLQ,Unf,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing
3,Missing,,TA,Gd,No,ALQ,Unf,Gd,Detchd,Unf,TA,TA,Missing,Missing,Missing
4,Missing,BrkFace,Gd,TA,Av,GLQ,Unf,TA,Attchd,RFn,TA,TA,Missing,Missing,Missing


In [14]:
dataset_test[categorical_feature_for_test].head()

Unnamed: 0,MSZoning,Alley,Utilities,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,RH,Missing,AllPub,,TA,TA,No,Rec,LwQ,Typ,Missing,Attchd,Unf,TA,TA,Missing,MnPrv,Missing
1,RL,Missing,AllPub,BrkFace,TA,TA,No,ALQ,Unf,Typ,Missing,Attchd,Unf,TA,TA,Missing,Missing,Gar2
2,RL,Missing,AllPub,,Gd,TA,No,GLQ,Unf,Typ,TA,Attchd,Fin,TA,TA,Missing,MnPrv,Missing
3,RL,Missing,AllPub,BrkFace,TA,TA,No,GLQ,Unf,Typ,Gd,Attchd,Fin,TA,TA,Missing,Missing,Missing
4,RL,Missing,AllPub,,Gd,TA,No,ALQ,Unf,Typ,Missing,Attchd,RFn,TA,TA,Missing,Missing,Missing


In [15]:
# handling the missing values for numerical features in dataset_train

numerical_feature_for_train = [feature for feature in dataset_train.columns if dataset_train[feature].isnull().sum()>1 and dataset_train[feature].dtypes != "O"]
numerical_feature_for_train

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [16]:
# handling the missing values for numerical features in dataset_test

numerical_feature_for_test = [feature for feature in dataset_test.columns if dataset_test[feature].isnull().sum()>1 and dataset_test[feature].dtypes != "O"]
numerical_feature_for_test


['LotFrontage', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt']

In [17]:
for feature in numerical_feature_for_train:
    print("In {} missing values are ----> {} %".format(feature, np.round(dataset_train[feature].isnull().mean(), 4)))

In LotFrontage missing values are ----> 0.1774 %
In MasVnrArea missing values are ----> 0.0055 %
In GarageYrBlt missing values are ----> 0.0555 %


In [18]:
for feature in numerical_feature_for_test:
    print("In {} missing values are ----> {} %".format(feature, np.round(dataset_test[feature].isnull().mean(), 4)))

In LotFrontage missing values are ----> 0.1556 %
In MasVnrArea missing values are ----> 0.0103 %
In BsmtFullBath missing values are ----> 0.0014 %
In BsmtHalfBath missing values are ----> 0.0014 %
In GarageYrBlt missing values are ----> 0.0535 %


In [19]:
# for replacing the missing values with median in dataset_train

def replace_nan_value(dataset_train, numerical_feature_for_train):
    data = dataset_test.copy()
    for feature in numerical_feature_for_train:
        median_value = data[feature].median()
        data[feature] = data[feature].fillna(median_value)
    return data

dataset_train = replace_nan_value(dataset_train, numerical_feature_for_train)
# dataset_train[numerical_feature_for_train].head()
dataset_train[numerical_feature_for_train].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [20]:
# for replacing the missing values with median in dataset_test

def replace_nan_value(dataset_test, numerical_feature_for_test):
    data = dataset_test.copy()
    for feature in numerical_feature_for_test:
        median_value = data[feature].median()
        data[feature] = data[feature].fillna(median_value)
    return data

dataset_test = replace_nan_value(dataset_train, numerical_feature_for_test)
# dataset_test[numerical_feature_for_test].head()
dataset_test[numerical_feature_for_test].isnull().sum()

LotFrontage     0
MasVnrArea      0
BsmtFullBath    0
BsmtHalfBath    0
GarageYrBlt     0
dtype: int64

In [21]:
temporal_feature = [feature for feature in dataset_train.columns if "Yr" in feature or "Year" in feature]
temporal_feature

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [22]:
for feature in temporal_feature:
    if feature != "YrSold":
        dataset_train[feature] = dataset_train["YrSold"] - dataset_train[feature]


In [23]:
dataset_train[temporal_feature].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold
0,49,49,49.0,2010
1,52,52,52.0,2010
2,13,12,13.0,2010
3,12,12,12.0,2010
4,18,18,18.0,2010


In [24]:
# below is to find the feature in dataset_train which are numerical and doesn't contain 0 inside it

feature_for_log_trans_train = [feature for feature in dataset_train.columns if 0 not in dataset_train[feature].unique() and dataset_train[feature].dtypes != "O"]
feature_for_log_trans_train

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 '1stFlrSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'MoSold',
 'YrSold',
 'SalePrice']

In [25]:
len(feature_for_log_trans_train)

12

In [26]:
# below is to store the features from dataset_test which are numerical and do not contain 0 inside it 

feature_for_log_trans_test = [feature for feature in dataset_test.columns if 0 not in dataset_test[feature].unique() and dataset_test[feature].dtypes != "O"]
feature_for_log_trans_test

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 '1stFlrSF',
 'GrLivArea',
 'TotRmsAbvGrd',
 'GarageYrBlt',
 'MoSold',
 'YrSold',
 'SalePrice']

In [27]:
len(feature_for_log_trans_test)

15

In [28]:
# below list is to contain some features for logrithmic transformation

log_transform = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
log_transform


['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

In [29]:
dataset_test.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,Missing,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal,208500
1,1462,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Missing,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal,181500
2,1463,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal,223500
3,1464,60,RL,78.0,9978,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,Missing,Missing,Missing,0,6,2010,WD,Normal,140000
4,1465,120,RL,43.0,5005,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Missing,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,Missing,Missing,Missing,0,1,2010,WD,Normal,250000


In [30]:
dataset_train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,49,49,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,Missing,Attchd,49.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal,208500
1,1462,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,52,52,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Missing,Attchd,52.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal,181500
2,1463,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,13,12,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,13.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal,223500
3,1464,60,RL,78.0,9978,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,12,12,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,Missing,Missing,Missing,0,6,2010,WD,Normal,140000
4,1465,120,RL,43.0,5005,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,18,18,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Missing,Attchd,18.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,Missing,Missing,Missing,0,1,2010,WD,Normal,250000


In [31]:
dataset_train[log_transform].head()

Unnamed: 0,LotFrontage,LotArea,1stFlrSF,GrLivArea,SalePrice
0,80.0,11622,896,896,208500
1,81.0,14267,1329,1329,181500
2,74.0,13830,928,1629,223500
3,78.0,9978,926,1604,140000
4,43.0,5005,1280,1280,250000


In [32]:
dataset_test[log_transform].head()

Unnamed: 0,LotFrontage,LotArea,1stFlrSF,GrLivArea,SalePrice
0,80.0,11622,896,896,208500
1,81.0,14267,1329,1329,181500
2,74.0,13830,928,1629,223500
3,78.0,9978,926,1604,140000
4,43.0,5005,1280,1280,250000


In [33]:
for feature in log_transform:
    dataset_train[feature] = np.log(dataset_train[feature])
    dataset_test[feature] = np.log(dataset_test[feature])
    

In [34]:
dataset_train[log_transform]

Unnamed: 0,LotFrontage,LotArea,1stFlrSF,GrLivArea,SalePrice
0,4.382027,9.360655,6.797940,6.797940,12.247694
1,4.394449,9.565704,7.192182,7.192182,12.109011
2,4.304065,9.534595,6.833032,7.395722,12.317167
3,4.356709,9.208138,6.830874,7.380256,11.849398
4,3.761200,8.518193,7.154615,7.154615,12.429216
...,...,...,...,...,...
1454,3.044522,7.568379,6.302619,6.995766,12.128111
1455,3.044522,7.546446,6.302619,6.995766,12.072541
1456,5.075174,9.903488,7.109879,7.109879,12.254863
1457,4.127134,9.253496,6.877296,6.877296,12.493130


In [35]:
dataset_test[log_transform]


Unnamed: 0,LotFrontage,LotArea,1stFlrSF,GrLivArea,SalePrice
0,4.382027,9.360655,6.797940,6.797940,12.247694
1,4.394449,9.565704,7.192182,7.192182,12.109011
2,4.304065,9.534595,6.833032,7.395722,12.317167
3,4.356709,9.208138,6.830874,7.380256,11.849398
4,3.761200,8.518193,7.154615,7.154615,12.429216
...,...,...,...,...,...
1454,3.044522,7.568379,6.302619,6.995766,12.128111
1455,3.044522,7.546446,6.302619,6.995766,12.072541
1456,5.075174,9.903488,7.109879,7.109879,12.254863
1457,4.127134,9.253496,6.877296,6.877296,12.493130


In [36]:
# handling Rare categorical feature from dataset_train

categorical_var_for_train = [feature for feature in dataset_train.columns if dataset_train[feature].dtypes == "O"]
len(categorical_var_for_train)

43

In [37]:
for feature in categorical_var_for_train:
    temp = dataset_train.groupby(feature)["SalePrice"].count()/len(dataset_train)
    temp_list = temp[temp>0.01].index
    dataset_train[feature] = np.where(dataset_train[feature].isin(temp_list), dataset_train[feature], "Rare Variable")

In [38]:
dataset_train[categorical_var_for_train].head(50)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,Rare Variable,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,CBlock,TA,TA,No,Rec,LwQ,GasA,TA,Y,SBrkr,TA,Typ,Missing,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal
1,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,TA,Y,SBrkr,Gd,Typ,Missing,Attchd,Unf,TA,TA,Y,Missing,Missing,Rare Variable,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,Fin,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal
3,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,Missing,Missing,Missing,WD,Normal
4,RL,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,Gable,CompShg,HdBoard,HdBoard,,Gd,TA,PConc,Gd,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Missing,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal
5,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,HdBoard,HdBoard,,TA,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,Fin,TA,TA,Y,Missing,Missing,Missing,WD,Normal
6,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,Gable,CompShg,HdBoard,HdBoard,,TA,Gd,PConc,Gd,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,Missing,Attchd,Fin,TA,TA,Y,Missing,GdPrv,Shed,WD,Normal
7,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Gd,Y,SBrkr,TA,Typ,Gd,Attchd,Fin,TA,TA,Y,Missing,Missing,Missing,WD,Normal
8,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,Gable,CompShg,HdBoard,HdBoard,,TA,TA,PConc,Gd,TA,Gd,GLQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Po,Attchd,Unf,TA,TA,Y,Missing,Missing,Missing,WD,Normal
9,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,Plywood,Plywood,,TA,TA,CBlock,TA,TA,No,ALQ,Rec,GasA,TA,Y,SBrkr,TA,Typ,Missing,Attchd,Fin,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal


In [39]:
# handling Rare categorical feature from dataset_test

categorical_var_for_test = [feature for feature in dataset_test.columns if dataset_test[feature].dtypes == "O"]
len(categorical_var_for_test)

43

In [40]:
for feature in categorical_var_for_test:
    temp_var = dataset_test.groupby(feature)["SalePrice"].count()/len(dataset_test)
    temp_var_list = temp_var[temp_var>0.01].index
    dataset_test[feature] = np.where(dataset_test[feature].isin(temp_var_list), dataset_test[feature], "Rare Variable")

In [41]:
dataset_test[categorical_feature_for_test].head(20)

Unnamed: 0,MSZoning,Alley,Utilities,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,Rare Variable,Missing,AllPub,,TA,TA,No,Rec,LwQ,Typ,Missing,Attchd,Unf,TA,TA,Missing,MnPrv,Missing
1,RL,Missing,AllPub,BrkFace,TA,TA,No,ALQ,Unf,Typ,Missing,Attchd,Unf,TA,TA,Missing,Missing,Rare Variable
2,RL,Missing,AllPub,,Gd,TA,No,GLQ,Unf,Typ,TA,Attchd,Fin,TA,TA,Missing,MnPrv,Missing
3,RL,Missing,AllPub,BrkFace,TA,TA,No,GLQ,Unf,Typ,Gd,Attchd,Fin,TA,TA,Missing,Missing,Missing
4,RL,Missing,AllPub,,Gd,TA,No,ALQ,Unf,Typ,Missing,Attchd,RFn,TA,TA,Missing,Missing,Missing
5,RL,Missing,AllPub,,Gd,TA,No,Unf,Unf,Typ,TA,Attchd,Fin,TA,TA,Missing,Missing,Missing
6,RL,Missing,AllPub,,Gd,TA,No,ALQ,Unf,Typ,Missing,Attchd,Fin,TA,TA,Missing,GdPrv,Shed
7,RL,Missing,AllPub,,Gd,TA,No,Unf,Unf,Typ,Gd,Attchd,Fin,TA,TA,Missing,Missing,Missing
8,RL,Missing,AllPub,,Gd,TA,Gd,GLQ,Unf,Typ,Po,Attchd,Unf,TA,TA,Missing,Missing,Missing
9,RL,Missing,AllPub,,TA,TA,No,ALQ,Rec,Typ,Missing,Attchd,Fin,TA,TA,Missing,MnPrv,Missing


In [42]:
categorical_var_for_train

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [43]:
# converting the categorical features of dataset_train into numerical features

for feature in categorical_var_for_train:
    labels_ordered = dataset_train.groupby(feature)["SalePrice"].mean().sort_values().index
    labels_ordered = {k:i for i, k in enumerate(labels_ordered, 0)}
    dataset_train[feature] = dataset_train[feature].map(labels_ordered)


In [44]:
# converting the categorical features of dataset_test into numerical features

for feature in categorical_var_for_test:
    labels_ordered = dataset_test.groupby(feature)["SalePrice"].mean().sort_values().index
    labels_ordered = {k:i for i, k in enumerate(labels_ordered, 0)}
    dataset_test[feature] = dataset_test[feature].map(labels_ordered)

In [45]:
dataset_train.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,4,4.382027,9.360655,0,1,0,3,1,1,1,16,3,0,0,3,5,6,49,49,0,0,5,4,2,0.0,3,2,2,4,2,0,4,468.0,5,144.0,270.0,882.0,1,1,1,2,6.79794,0,0,6.79794,0.0,0.0,1,0,2,1,1,5,3,0,1,3,49.0,2,1.0,730.0,2,3,2,140,0,0,0,120,0,1,2,1,0,6,2010,2,2,12.247694
1,1462,20,3,4.394449,9.565704,0,1,1,3,1,2,1,16,2,0,0,3,6,6,52,52,1,0,6,5,1,108.0,3,2,2,4,2,0,5,923.0,4,0.0,406.0,1329.0,1,1,1,2,7.192182,0,0,7.192182,0.0,0.0,1,1,3,1,2,6,3,0,1,3,52.0,2,1.0,312.0,2,3,2,393,36,0,0,0,0,1,1,0,12500,6,2010,2,2,12.109011
2,1463,60,3,4.304065,9.534595,0,1,1,3,1,1,1,19,2,0,0,2,5,5,13,12,0,0,5,4,2,0.0,3,2,3,2,2,0,2,791.0,4,0.0,137.0,928.0,1,4,1,2,6.833032,701,0,7.395722,0.0,0.0,2,1,3,1,1,6,3,1,5,3,13.0,1,2.0,482.0,2,3,2,212,34,0,0,0,0,1,2,1,0,3,2010,2,2,12.317167
3,1464,60,3,4.356709,9.208138,0,1,1,3,1,1,1,19,2,0,0,2,6,6,12,12,0,0,5,4,1,20.0,3,2,3,4,2,0,2,602.0,4,0.0,324.0,926.0,1,3,1,2,6.830874,678,0,7.380256,0.0,0.0,2,1,3,1,2,7,3,1,2,3,12.0,1,2.0,470.0,2,3,2,360,36,0,0,0,0,1,1,1,0,6,2010,2,2,11.849398
4,1465,120,3,3.7612,8.518193,0,1,1,2,1,1,1,20,2,0,2,3,8,5,18,18,0,0,9,8,2,0.0,0,2,3,2,2,0,5,263.0,4,0.0,1017.0,1280.0,1,3,1,2,7.154615,0,0,7.154615,0.0,0.0,2,0,2,1,2,5,3,0,1,3,18.0,3,2.0,506.0,2,3,2,0,82,0,0,144,0,1,1,1,0,1,2010,2,2,12.429216
5,1466,60,3,4.317488,9.21034,0,1,1,3,1,2,1,19,2,0,0,2,6,5,17,16,0,0,9,8,2,0.0,3,2,3,2,2,0,3,0.0,4,0.0,763.0,763.0,1,4,1,2,6.637258,892,0,7.411556,0.0,0.0,2,1,3,1,1,7,3,1,5,3,17.0,1,2.0,440.0,2,3,2,157,84,0,0,0,0,1,1,1,0,4,2010,2,2,11.8706
6,1467,20,3,4.204693,8.984694,0,1,1,3,1,1,1,19,2,0,0,3,6,7,18,3,0,0,9,8,2,0.0,3,3,3,2,2,0,5,935.0,4,0.0,233.0,1168.0,1,3,1,2,7.079184,0,0,7.079184,1.0,0.0,2,0,3,1,1,6,3,0,1,3,18.0,1,2.0,420.0,2,3,2,483,21,0,0,0,0,1,3,2,500,3,2010,2,2,12.634603
7,1468,60,3,4.143135,9.036225,0,1,1,3,1,1,1,19,2,0,0,2,6,5,12,12,0,0,5,4,2,0.0,3,2,3,2,2,0,3,0.0,4,0.0,789.0,789.0,1,4,1,2,6.670766,676,0,7.289611,0.0,0.0,2,1,3,1,1,7,3,1,2,3,12.0,1,2.0,393.0,2,3,2,0,75,0,0,0,0,1,1,1,0,5,2010,2,2,12.206073
8,1469,20,3,4.442651,9.227787,0,1,0,3,1,1,1,19,2,0,0,3,7,5,20,20,0,0,9,8,2,0.0,3,2,3,2,2,3,2,637.0,4,0.0,663.0,1300.0,1,4,1,2,7.201171,0,0,7.201171,1.0,0.0,1,1,2,1,2,5,3,1,3,3,20.0,2,2.0,506.0,2,3,2,192,0,0,0,0,0,1,1,1,0,2,2010,2,2,11.77452
9,1470,20,3,4.248495,9.035987,0,1,0,3,1,2,1,16,2,0,0,3,4,5,40,40,0,0,7,9,2,0.0,3,2,2,4,2,0,5,804.0,1,78.0,0.0,882.0,1,1,1,2,6.782192,0,0,6.782192,1.0,0.0,1,0,2,1,1,4,3,0,1,3,40.0,1,2.0,525.0,2,3,2,240,0,0,0,0,0,1,2,1,0,4,2010,2,2,11.67844


In [46]:
dataset_test.head(15)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,4,4.382027,9.360655,0,1,0,3,1,1,1,16,3,0,0,3,5,6,1961,1961,0,0,5,4,2,0.0,3,2,2,4,2,0,4,468.0,5,144.0,270.0,882.0,1,1,1,2,6.79794,0,0,6.79794,0.0,0.0,1,0,2,1,1,5,3,0,1,3,1961.0,2,1.0,730.0,2,3,2,140,0,0,0,120,0,1,2,1,0,6,2010,2,2,12.247694
1,1462,20,3,4.394449,9.565704,0,1,1,3,1,2,1,16,2,0,0,3,6,6,1958,1958,1,0,6,5,1,108.0,3,2,2,4,2,0,5,923.0,4,0.0,406.0,1329.0,1,1,1,2,7.192182,0,0,7.192182,0.0,0.0,1,1,3,1,2,6,3,0,1,3,1958.0,2,1.0,312.0,2,3,2,393,36,0,0,0,0,1,1,0,12500,6,2010,2,2,12.109011
2,1463,60,3,4.304065,9.534595,0,1,1,3,1,1,1,19,2,0,0,2,5,5,1997,1998,0,0,5,4,2,0.0,3,2,3,2,2,0,2,791.0,4,0.0,137.0,928.0,1,4,1,2,6.833032,701,0,7.395722,0.0,0.0,2,1,3,1,1,6,3,1,5,3,1997.0,1,2.0,482.0,2,3,2,212,34,0,0,0,0,1,2,1,0,3,2010,2,2,12.317167
3,1464,60,3,4.356709,9.208138,0,1,1,3,1,1,1,19,2,0,0,2,6,6,1998,1998,0,0,5,4,1,20.0,3,2,3,4,2,0,2,602.0,4,0.0,324.0,926.0,1,3,1,2,6.830874,678,0,7.380256,0.0,0.0,2,1,3,1,2,7,3,1,2,3,1998.0,1,2.0,470.0,2,3,2,360,36,0,0,0,0,1,1,1,0,6,2010,2,2,11.849398
4,1465,120,3,3.7612,8.518193,0,1,1,2,1,1,1,20,2,0,2,3,8,5,1992,1992,0,0,9,8,2,0.0,0,2,3,2,2,0,5,263.0,4,0.0,1017.0,1280.0,1,3,1,2,7.154615,0,0,7.154615,0.0,0.0,2,0,2,1,2,5,3,0,1,3,1992.0,3,2.0,506.0,2,3,2,0,82,0,0,144,0,1,1,1,0,1,2010,2,2,12.429216
5,1466,60,3,4.317488,9.21034,0,1,1,3,1,2,1,19,2,0,0,2,6,5,1993,1994,0,0,9,8,2,0.0,3,2,3,2,2,0,3,0.0,4,0.0,763.0,763.0,1,4,1,2,6.637258,892,0,7.411556,0.0,0.0,2,1,3,1,1,7,3,1,5,3,1993.0,1,2.0,440.0,2,3,2,157,84,0,0,0,0,1,1,1,0,4,2010,2,2,11.8706
6,1467,20,3,4.204693,8.984694,0,1,1,3,1,1,1,19,2,0,0,3,6,7,1992,2007,0,0,9,8,2,0.0,3,3,3,2,2,0,5,935.0,4,0.0,233.0,1168.0,1,3,1,2,7.079184,0,0,7.079184,1.0,0.0,2,0,3,1,1,6,3,0,1,3,1992.0,1,2.0,420.0,2,3,2,483,21,0,0,0,0,1,3,2,500,3,2010,2,2,12.634603
7,1468,60,3,4.143135,9.036225,0,1,1,3,1,1,1,19,2,0,0,2,6,5,1998,1998,0,0,5,4,2,0.0,3,2,3,2,2,0,3,0.0,4,0.0,789.0,789.0,1,4,1,2,6.670766,676,0,7.289611,0.0,0.0,2,1,3,1,1,7,3,1,2,3,1998.0,1,2.0,393.0,2,3,2,0,75,0,0,0,0,1,1,1,0,5,2010,2,2,12.206073
8,1469,20,3,4.442651,9.227787,0,1,0,3,1,1,1,19,2,0,0,3,7,5,1990,1990,0,0,9,8,2,0.0,3,2,3,2,2,3,2,637.0,4,0.0,663.0,1300.0,1,4,1,2,7.201171,0,0,7.201171,1.0,0.0,1,1,2,1,2,5,3,1,3,3,1990.0,2,2.0,506.0,2,3,2,192,0,0,0,0,0,1,1,1,0,2,2010,2,2,11.77452
9,1470,20,3,4.248495,9.035987,0,1,0,3,1,2,1,16,2,0,0,3,4,5,1970,1970,0,0,7,9,2,0.0,3,2,2,4,2,0,5,804.0,1,78.0,0.0,882.0,1,1,1,2,6.782192,0,0,6.782192,1.0,0.0,1,0,2,1,1,4,3,0,1,3,1970.0,1,2.0,525.0,2,3,2,240,0,0,0,0,0,1,2,1,0,4,2010,2,2,11.67844


In [47]:
# storing the features from dataset_train for feature scaling 

scaling_features_for_train = [feature for feature in dataset_train.columns if feature not in ["Id", "SalePrice"]]
len(scaling_features_for_train)

79

In [48]:
from sklearn.preprocessing import MinMaxScaler
scalar_for_train = MinMaxScaler()
scalar_for_train.fit(dataset_train[scaling_features_for_train])

In [49]:
scalar_for_train.transform(dataset_train[scaling_features_for_train])

array([[0.        , 1.        , 0.59344538, ..., 1.        , 0.5       ,
        0.5       ],
       [0.        , 0.75      , 0.59895721, ..., 1.        , 0.5       ,
        0.5       ],
       [0.23529412, 0.75      , 0.55885415, ..., 1.        , 0.5       ,
        0.5       ],
       ...,
       [0.        , 0.75      , 0.90099208, ..., 0.        , 0.5       ,
        0.        ],
       [0.38235294, 0.75      , 0.48035069, ..., 0.        , 0.5       ,
        0.5       ],
       [0.23529412, 0.75      , 0.55885415, ..., 0.        , 0.5       ,
        0.5       ]])

In [50]:
data_for_train = pd.concat([dataset_train[['Id', 'SalePrice']].reset_index(drop = True), pd.DataFrame(scalar_for_train.transform(dataset_train[scaling_features_for_train]), columns = scaling_features_for_train)], axis = 1)

In [51]:
data_for_train.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,12.247694,0.0,1.0,0.593445,0.56636,0.0,0.5,0.0,1.0,1.0,0.25,0.5,0.8,0.5,0.0,0.0,0.6,0.444444,0.625,0.384615,0.822581,0.0,0.0,0.5,0.363636,0.5,0.0,1.0,0.666667,0.5,1.0,0.5,0.0,0.666667,0.116708,0.833333,0.094364,0.126168,0.173111,1.0,0.25,1.0,0.666667,0.312253,0.0,0.0,0.312253,0.0,0.0,0.25,0.0,0.333333,0.5,0.25,0.166667,0.75,0.0,0.2,0.5,0.792994,0.666667,0.2,0.490591,0.666667,1.0,1.0,0.098315,0.0,0.0,0.0,0.208333,0.0,1.0,0.5,0.5,0.0,0.454545,1.0,0.5,0.5
1,1462,12.109011,0.0,0.75,0.598957,0.622527,0.0,0.5,0.333333,1.0,1.0,0.5,0.5,0.8,0.333333,0.0,0.0,0.6,0.555556,0.625,0.407692,0.870968,0.5,0.0,0.6,0.454545,0.25,0.083721,1.0,0.666667,0.5,1.0,0.5,0.0,0.833333,0.230175,0.666667,0.0,0.18972,0.260844,1.0,0.25,1.0,0.666667,0.468253,0.0,0.0,0.468253,0.0,0.0,0.25,0.5,0.5,0.5,0.5,0.25,0.75,0.0,0.2,0.5,0.802548,0.666667,0.2,0.209677,0.666667,1.0,1.0,0.275983,0.048518,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.735294,0.454545,1.0,0.5,0.5
2,1463,12.317167,0.235294,0.75,0.558854,0.614005,0.0,0.5,0.333333,1.0,1.0,0.25,0.5,0.95,0.333333,0.0,0.0,0.4,0.444444,0.5,0.107692,0.225806,0.0,0.0,0.5,0.363636,0.5,0.0,1.0,0.666667,0.75,0.5,0.5,0.0,0.333333,0.197257,0.666667,0.0,0.064019,0.182139,1.0,1.0,1.0,0.666667,0.326139,0.376477,0.0,0.548792,0.0,0.0,0.5,0.5,0.5,0.5,0.25,0.25,0.75,0.25,1.0,0.5,0.678344,0.333333,0.4,0.323925,0.666667,1.0,1.0,0.148876,0.045822,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.0,0.181818,1.0,0.5,0.5
3,1464,11.849398,0.235294,0.75,0.582212,0.524583,0.0,0.5,0.333333,1.0,1.0,0.25,0.5,0.95,0.333333,0.0,0.0,0.4,0.555556,0.625,0.1,0.225806,0.0,0.0,0.5,0.363636,0.25,0.015504,1.0,0.666667,0.75,1.0,0.5,0.0,0.333333,0.150125,0.666667,0.0,0.151402,0.181747,1.0,0.75,1.0,0.666667,0.325285,0.364125,0.0,0.542672,0.0,0.0,0.5,0.5,0.5,0.5,0.5,0.333333,0.75,0.25,0.4,0.5,0.675159,0.333333,0.4,0.31586,0.666667,1.0,1.0,0.252809,0.048518,0.0,0.0,0.0,0.0,1.0,0.25,0.5,0.0,0.454545,1.0,0.5,0.5
4,1465,12.429216,0.588235,0.75,0.317987,0.335596,0.0,0.5,0.333333,0.666667,1.0,0.25,0.5,1.0,0.333333,0.0,0.5,0.6,0.777778,0.5,0.146154,0.322581,0.0,0.0,0.9,0.727273,0.5,0.0,0.0,0.666667,0.75,0.5,0.5,0.0,0.833333,0.065586,0.666667,0.0,0.475234,0.251227,1.0,0.75,1.0,0.666667,0.453388,0.0,0.0,0.453388,0.0,0.0,0.5,0.0,0.333333,0.5,0.5,0.166667,0.75,0.0,0.2,0.5,0.694268,1.0,0.4,0.340054,0.666667,1.0,1.0,0.0,0.110512,0.0,0.0,0.25,0.0,1.0,0.25,0.5,0.0,0.0,1.0,0.5,0.5


In [52]:
data_for_train.to_csv("X_train.csv", index = False)

In [53]:
# storing the features from dataset_test for feature scaling

scaling_feature_for_test = [feature for feature in dataset_test.columns if feature not in ["Id", "SalePrice"]]
len(scaling_feature_for_test)

79

In [54]:
scalar_for_test = MinMaxScaler()
scalar_for_test.fit(dataset_test[scaling_feature_for_test])

In [55]:
scalar_for_test.transform(dataset_test[scaling_feature_for_test])

array([[0.        , 1.        , 0.59344538, ..., 1.        , 0.5       ,
        0.5       ],
       [0.        , 0.75      , 0.59895721, ..., 1.        , 0.5       ,
        0.5       ],
       [0.23529412, 0.75      , 0.55885415, ..., 1.        , 0.5       ,
        0.5       ],
       ...,
       [0.        , 0.75      , 0.90099208, ..., 0.        , 0.5       ,
        0.        ],
       [0.38235294, 0.75      , 0.48035069, ..., 0.        , 0.5       ,
        0.5       ],
       [0.23529412, 0.75      , 0.55885415, ..., 0.        , 0.5       ,
        0.5       ]])

In [56]:
data_for_test = pd.concat([dataset_test[["Id", "SalePrice"]].reset_index(drop = False), pd.DataFrame(scalar_for_test.transform(dataset_test[scaling_feature_for_test]), columns = scaling_feature_for_test)], axis = 1)

In [57]:
data_for_test.head()

Unnamed: 0,index,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,1461,12.247694,0.0,1.0,0.593445,0.56636,0.0,0.5,0.0,1.0,1.0,0.25,0.5,0.8,0.5,0.0,0.0,0.6,0.444444,0.625,0.625954,0.183333,0.0,0.0,0.5,0.363636,0.5,0.0,1.0,0.666667,0.5,1.0,0.5,0.0,0.666667,0.116708,0.833333,0.094364,0.126168,0.173111,1.0,0.25,1.0,0.666667,0.312253,0.0,0.0,0.312253,0.0,0.0,0.25,0.0,0.333333,0.5,0.25,0.166667,0.75,0.0,0.2,0.5,0.211538,0.666667,0.2,0.490591,0.666667,1.0,1.0,0.098315,0.0,0.0,0.0,0.208333,0.0,1.0,0.5,0.5,0.0,0.454545,1.0,0.5,0.5
1,1,1462,12.109011,0.0,0.75,0.598957,0.622527,0.0,0.5,0.333333,1.0,1.0,0.5,0.5,0.8,0.333333,0.0,0.0,0.6,0.555556,0.625,0.603053,0.133333,0.5,0.0,0.6,0.454545,0.25,0.083721,1.0,0.666667,0.5,1.0,0.5,0.0,0.833333,0.230175,0.666667,0.0,0.18972,0.260844,1.0,0.25,1.0,0.666667,0.468253,0.0,0.0,0.468253,0.0,0.0,0.25,0.5,0.5,0.5,0.5,0.25,0.75,0.0,0.2,0.5,0.201923,0.666667,0.2,0.209677,0.666667,1.0,1.0,0.275983,0.048518,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.735294,0.454545,1.0,0.5,0.5
2,2,1463,12.317167,0.235294,0.75,0.558854,0.614005,0.0,0.5,0.333333,1.0,1.0,0.25,0.5,0.95,0.333333,0.0,0.0,0.4,0.444444,0.5,0.900763,0.8,0.0,0.0,0.5,0.363636,0.5,0.0,1.0,0.666667,0.75,0.5,0.5,0.0,0.333333,0.197257,0.666667,0.0,0.064019,0.182139,1.0,1.0,1.0,0.666667,0.326139,0.376477,0.0,0.548792,0.0,0.0,0.5,0.5,0.5,0.5,0.25,0.25,0.75,0.25,1.0,0.5,0.326923,0.333333,0.4,0.323925,0.666667,1.0,1.0,0.148876,0.045822,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.0,0.181818,1.0,0.5,0.5
3,3,1464,11.849398,0.235294,0.75,0.582212,0.524583,0.0,0.5,0.333333,1.0,1.0,0.25,0.5,0.95,0.333333,0.0,0.0,0.4,0.555556,0.625,0.908397,0.8,0.0,0.0,0.5,0.363636,0.25,0.015504,1.0,0.666667,0.75,1.0,0.5,0.0,0.333333,0.150125,0.666667,0.0,0.151402,0.181747,1.0,0.75,1.0,0.666667,0.325285,0.364125,0.0,0.542672,0.0,0.0,0.5,0.5,0.5,0.5,0.5,0.333333,0.75,0.25,0.4,0.5,0.330128,0.333333,0.4,0.31586,0.666667,1.0,1.0,0.252809,0.048518,0.0,0.0,0.0,0.0,1.0,0.25,0.5,0.0,0.454545,1.0,0.5,0.5
4,4,1465,12.429216,0.588235,0.75,0.317987,0.335596,0.0,0.5,0.333333,0.666667,1.0,0.25,0.5,1.0,0.333333,0.0,0.5,0.6,0.777778,0.5,0.862595,0.7,0.0,0.0,0.9,0.727273,0.5,0.0,0.0,0.666667,0.75,0.5,0.5,0.0,0.833333,0.065586,0.666667,0.0,0.475234,0.251227,1.0,0.75,1.0,0.666667,0.453388,0.0,0.0,0.453388,0.0,0.0,0.5,0.0,0.333333,0.5,0.5,0.166667,0.75,0.0,0.2,0.5,0.310897,1.0,0.4,0.340054,0.666667,1.0,1.0,0.0,0.110512,0.0,0.0,0.25,0.0,1.0,0.25,0.5,0.0,0.0,1.0,0.5,0.5


In [58]:
data_for_test.to_csv("X_test.csv", index = False)