In [222]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn 
import seaborn as sns

# Model training

###  Load the dataset

In [223]:
data = pd.read_csv("data/train.csv")
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [224]:
data.shape

(1460, 81)

#### check correlation 

In [225]:
data.corr()['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [226]:
# data=data.drop(['BsmtFinSF2', 'BsmtHalfBath','MiscVal','Id','LowQualFinSF','YrSold','OverallCond','MSSubClass','EnclosedPorch','KitchenAbvGr'], axis = 1)
# data

In [227]:
data=data[['SalePrice','GrLivArea','1stFlrSF','MasVnrArea','LotFrontage', 'GarageArea','TotalBsmtSF','OverallQual','Street','MSZoning','HouseStyle']]
data

Unnamed: 0,SalePrice,GrLivArea,1stFlrSF,MasVnrArea,LotFrontage,GarageArea,TotalBsmtSF,OverallQual,Street,MSZoning,HouseStyle
0,208500,1710,856,196.0,65.0,548,856,7,Pave,RL,2Story
1,181500,1262,1262,0.0,80.0,460,1262,6,Pave,RL,1Story
2,223500,1786,920,162.0,68.0,608,920,7,Pave,RL,2Story
3,140000,1717,961,0.0,60.0,642,756,7,Pave,RL,2Story
4,250000,2198,1145,350.0,84.0,836,1145,8,Pave,RL,2Story
5,143000,1362,796,0.0,85.0,480,796,5,Pave,RL,1.5Fin
6,307000,1694,1694,186.0,75.0,636,1686,8,Pave,RL,1Story
7,200000,2090,1107,240.0,,484,1107,7,Pave,RL,2Story
8,129900,1774,1022,0.0,51.0,468,952,7,Pave,RM,1.5Fin
9,118000,1077,1077,0.0,50.0,205,991,5,Pave,RL,1.5Unf


#### Prepare features

In [228]:
X = data.drop('SalePrice', axis=1)

y = data['SalePrice'].values.reshape(-1,1)

### Data set split

In [229]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [230]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 10)
(365, 10)
(1095, 1)
(365, 1)


In [231]:
len(X_train),len(X_test)

(1095, 365)

In [232]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1095 entries, 106 to 893
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   GrLivArea    1095 non-null   int64  
 1   1stFlrSF     1095 non-null   int64  
 2   MasVnrArea   1088 non-null   float64
 3   LotFrontage  901 non-null    float64
 4   GarageArea   1095 non-null   int64  
 5   TotalBsmtSF  1095 non-null   int64  
 6   OverallQual  1095 non-null   int64  
 7   Street       1095 non-null   object 
 8   MSZoning     1095 non-null   object 
 9   HouseStyle   1095 non-null   object 
dtypes: float64(2), int64(5), object(3)
memory usage: 94.1+ KB


### Data Preprocessing

#### Check missing value

In [233]:

pd.set_option('display.max_rows', None)
null = pd.DataFrame(X_train.isnull().sum().sort_values(ascending = False), columns = ['null_sum'])
null['null_percentage'] = null['null_sum']/X_train.shape[0]*100
null = null[null['null_sum']>0]
null

Unnamed: 0,null_sum,null_percentage
LotFrontage,194,17.716895
MasVnrArea,7,0.639269


#### Let's drop some features that are missing a lot of values

In [234]:

X_train=X_train.drop('LotFrontage', axis = 1)
X_train

Unnamed: 0,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,Street,MSZoning,HouseStyle
106,1047,1047,0.0,273,641,4,Pave,RM,1Story
538,1159,1159,188.0,336,1051,5,Pave,RL,1Story
1153,816,816,0.0,432,816,6,Pave,RM,1Story
1299,1246,1246,0.0,305,1246,5,Pave,RL,1Story
1226,1933,894,74.0,668,894,6,Pave,RL,2Story
476,1493,1493,215.0,508,1478,6,Pave,RL,1Story
1047,990,990,0.0,672,990,5,Pave,RL,1Story
997,1442,1442,571.0,615,1442,6,Pave,RL,1Story
72,1718,885,40.0,427,832,7,Pave,RL,2Story
889,1505,1505,90.0,505,1505,6,Pave,RL,1Story


#### Filter missing data

In [235]:

def replace_missing_value(data):
    for col in data.columns:
        if data[col].dtypes=='object':
            data[col].fillna(data[col].mode()[0],inplace=True)
        elif data[col].dtypes=='int64' or data[col].dtypes=='float64':
            data[col].fillna(data[col].mean(),inplace=True) 
    return data



In [236]:
X_train= replace_missing_value(X_train)

In [237]:
X_train.isnull().values.any()

False

#### Filter the number of numerical features and the number of categorical features

In [238]:
def classify_category(dataset):
    cat_features = dataset.select_dtypes(include = ['object']).columns.tolist()
    print(len(cat_features))
    num_features = dataset.select_dtypes(exclude = ['object']).columns.tolist()
    print(len(num_features))
    return cat_features, num_features

In [239]:
categorical_col_train,numerical_col_train=classify_category(X_train)

3
6


In [240]:
print(numerical_col_train)

['GrLivArea', '1stFlrSF', 'MasVnrArea', 'GarageArea', 'TotalBsmtSF', 'OverallQual']


In [241]:
print(categorical_col_train)

['Street', 'MSZoning', 'HouseStyle']


### Encoding categorical_features

In [242]:


from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse = False)
housing_caterogy_onehot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(X_train[categorical_col_train]))
housing_caterogy_onehot_encoded.columns = one_hot_encoder.get_feature_names_out(categorical_col_train)
housing_caterogy_onehot_encoded.index = np.arange(1, len(X_train)+1)                
housing_caterogy_onehot_encoded

X_train.drop(categorical_col_train, axis=1, inplace=True)

X_train= pd.concat([X_train.reset_index(), housing_caterogy_onehot_encoded.reset_index()], axis=1)
X_train.head(6)


Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,106,1047,1047,0.0,273,641,4,1,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,538,1159,1159,188.0,336,1051,5,2,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1153,816,816,0.0,432,816,6,3,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1299,1246,1246,0.0,305,1246,5,4,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1226,1933,894,74.0,668,894,6,5,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,476,1493,1493,215.0,508,1478,6,6,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [243]:
X_train

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,106,1047,1047,0.0,273,641,4,1,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,538,1159,1159,188.0,336,1051,5,2,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1153,816,816,0.0,432,816,6,3,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1299,1246,1246,0.0,305,1246,5,4,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1226,1933,894,74.0,668,894,6,5,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,476,1493,1493,215.0,508,1478,6,6,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,1047,990,990,0.0,672,990,5,7,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,997,1442,1442,571.0,615,1442,6,8,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,72,1718,885,40.0,427,832,7,9,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,889,1505,1505,90.0,505,1505,6,10,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [244]:
X_train.isnull().values.any()

False

In [245]:
# to sure that we didn't have any null
# X_train.fillna(0, inplace=True)

Scaling
We will scale the features that have a spread distribution:

In [246]:
X_train[numerical_col_train].head(6)

Unnamed: 0,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual
0,1047,1047,0.0,273,641,4
1,1159,1159,188.0,336,1051,5
2,816,816,0.0,432,816,6
3,1246,1246,0.0,305,1246,5
4,1933,894,74.0,668,894,6
5,1493,1493,215.0,508,1478,6


### Scaling numeric features

In [247]:
from sklearn.preprocessing import MinMaxScaler

scalar=MinMaxScaler(copy=True)
df = X_train.loc[ : , X_train.columns]
columns=df.columns
X_train[columns]=scalar.fit_transform(X_train[columns])
X_train

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0.072653,0.134326,0.163607,0.0,0.192525,0.10491,0.333333,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.368746,0.155426,0.189307,0.13643,0.236953,0.172013,0.444444,0.000914,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.790267,0.090806,0.110601,0.0,0.304654,0.133552,0.555556,0.001828,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.890336,0.171816,0.20927,0.0,0.215092,0.203928,0.444444,0.002742,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.840302,0.301243,0.128499,0.053701,0.471086,0.146318,0.555556,0.003656,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.326251,0.21835,0.265948,0.156023,0.358251,0.241899,0.555556,0.00457,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.717615,0.123587,0.150528,0.0,0.473907,0.162029,0.444444,0.005484,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,0.683345,0.208742,0.254245,0.414369,0.433709,0.236007,0.555556,0.006399,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.049349,0.260739,0.126434,0.029028,0.301128,0.13617,0.666667,0.007313,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.609321,0.22061,0.268701,0.065312,0.356135,0.246318,0.555556,0.008227,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [248]:
X_train.head(6)

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0.072653,0.134326,0.163607,0.0,0.192525,0.10491,0.333333,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.368746,0.155426,0.189307,0.13643,0.236953,0.172013,0.444444,0.000914,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.790267,0.090806,0.110601,0.0,0.304654,0.133552,0.555556,0.001828,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.890336,0.171816,0.20927,0.0,0.215092,0.203928,0.444444,0.002742,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.840302,0.301243,0.128499,0.053701,0.471086,0.146318,0.555556,0.003656,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.326251,0.21835,0.265948,0.156023,0.358251,0.241899,0.555556,0.00457,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Training the Algorithm

In [249]:
len(X_train),len(y_train)

(1095, 1095)

In [250]:

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

print ('Coefficients: ', regressor.intercept_)

print ('Coefficients: ', regressor.coef_)

Coefficients:  [-67314.51957968]
Coefficients:  [[  -5182.40129978  371571.16797005 -105146.06727582   43100.29886823
    67199.10433637   73534.29984092  235057.53185978   -5255.30086357
     -576.62139902     576.62139902   -4353.30519133    9170.10224997
    -8278.58315198    9889.96457977   -6428.17848642   -3305.28383175
    14692.3320902    26567.280977    -41395.96273799  -25628.81869612
    -8439.0977118    28414.72241394    9094.82749652]]


In [251]:
len(X_train),len(y_train)

(1095, 1095)

# Model evaluation

In [252]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 1126 to 365
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   GrLivArea    365 non-null    int64  
 1   1stFlrSF     365 non-null    int64  
 2   MasVnrArea   364 non-null    float64
 3   LotFrontage  300 non-null    float64
 4   GarageArea   365 non-null    int64  
 5   TotalBsmtSF  365 non-null    int64  
 6   OverallQual  365 non-null    int64  
 7   Street       365 non-null    object 
 8   MSZoning     365 non-null    object 
 9   HouseStyle   365 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 31.4+ KB


In [253]:
X_test.columns

Index(['GrLivArea', '1stFlrSF', 'MasVnrArea', 'LotFrontage', 'GarageArea',
       'TotalBsmtSF', 'OverallQual', 'Street', 'MSZoning', 'HouseStyle'],
      dtype='object')

### Data Preprocessing

#### Check missing value

In [254]:
pd.set_option('display.max_rows', None)
null = pd.DataFrame(X_test.isnull().sum().sort_values(ascending = False), columns = ['null_sum'])
null['null_percentage'] = null['null_sum']/X_test.shape[0]*100
null = null[null['null_sum']>0]
null

Unnamed: 0,null_sum,null_percentage
LotFrontage,65,17.808219
MasVnrArea,1,0.273973


#### Let's drop some features that are missing a lot of values

In [255]:
X_test=X_test.drop('LotFrontage', axis = 1)
X_test

Unnamed: 0,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,Street,MSZoning,HouseStyle
1126,1555,1555,130.0,660,1373,7,Pave,RL,1Story
236,1414,1414,98.0,494,1414,7,Pave,RL,1Story
120,988,988,0.0,540,938,6,Pave,RL,SLvl
824,1489,1489,0.0,776,1489,8,Pave,FV,1Story
998,1077,1077,0.0,210,1007,3,Pave,RM,1Story
1149,1482,832,0.0,324,650,7,Pave,RM,2Story
1159,1876,943,270.0,540,901,6,Pave,RL,2Story
1354,1865,992,0.0,839,992,7,Pave,RL,2Story
592,816,816,0.0,816,816,5,Pave,RL,1Story
114,2320,1436,0.0,180,1028,6,Pave,RL,2Story


#### Replace missing data

In [256]:
X_test= replace_missing_value(X_test)

In [257]:
X_test.isnull().values.any()

False

#### Filter the number of numerical features and the number of categorical features

In [258]:
categorical_col_test,numerical_col_test=classify_category(X_test)

3
6


In [259]:
print(numerical_col_test)

['GrLivArea', '1stFlrSF', 'MasVnrArea', 'GarageArea', 'TotalBsmtSF', 'OverallQual']


In [260]:
print(categorical_col_test)

['Street', 'MSZoning', 'HouseStyle']


### Encoding categorical_features

In [261]:
one_hot_encoder = OneHotEncoder(sparse = False)
housing_caterogy_onehot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(X_test[categorical_col_test]))
housing_caterogy_onehot_encoded.columns = one_hot_encoder.get_feature_names_out(categorical_col_test)
housing_caterogy_onehot_encoded.index = np.arange(1, len(X_test)+1)                
housing_caterogy_onehot_encoded

X_test.drop(categorical_col_test, axis=1, inplace=True)

X_test= pd.concat([X_test.reset_index(), housing_caterogy_onehot_encoded.reset_index()], axis=1)
X_test.head(6)


Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,1126,1555,1555,130.0,660,1373,7,1,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,236,1414,1414,98.0,494,1414,7,2,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,120,988,988,0.0,540,938,6,3,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,824,1489,1489,0.0,776,1489,8,4,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,998,1077,1077,0.0,210,1007,3,5,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,1149,1482,832,0.0,324,650,7,6,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [262]:
X_test

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,1126,1555,1555,130.0,660,1373,7,1,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,236,1414,1414,98.0,494,1414,7,2,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,120,988,988,0.0,540,938,6,3,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,824,1489,1489,0.0,776,1489,8,4,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,998,1077,1077,0.0,210,1007,3,5,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,1149,1482,832,0.0,324,650,7,6,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,1159,1876,943,270.0,540,901,6,7,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,1354,1865,992,0.0,839,992,7,8,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,592,816,816,0.0,816,816,5,9,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,114,2320,1436,0.0,180,1028,6,10,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [263]:
X_test.isnull().values.any()

False

### Scaling numeric feature

In [264]:
from sklearn.preprocessing import MinMaxScaler

scalar=MinMaxScaler(copy=True)
df = X_test.loc[ : , X_test.columns]
columns=df.columns
X_test[columns]=scalar.fit_transform(X_test[columns])
X_test

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0.774483,0.272655,0.390528,0.08125,0.627376,0.42826,0.666667,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.16069,0.235511,0.339162,0.06125,0.469582,0.441048,0.666667,0.002747,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.08069,0.123288,0.183971,0.0,0.513308,0.292576,0.555556,0.005495,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.566207,0.255269,0.366485,0.0,0.737643,0.464442,0.777778,0.008242,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.686207,0.146733,0.216393,0.0,0.19962,0.314099,0.222222,0.010989,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.790345,0.253425,0.12714,0.0,0.307985,0.202745,0.666667,0.013736,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.797241,0.357218,0.167577,0.16875,0.513308,0.281036,0.555556,0.016484,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.931724,0.35432,0.185428,0.0,0.797529,0.30942,0.666667,0.019231,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.406207,0.077977,0.121311,0.0,0.775665,0.254523,0.444444,0.021978,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.076552,0.474183,0.347177,0.0,0.171103,0.320649,0.555556,0.024725,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [265]:
# from sklearn import preprocessing
# min_max_scaler = preprocessing.MinMaxScaler()
# X_test.loc[:,numerical_col_test] = min_max_scaler.fit_transform(X_test[numerical_col_test])
# X_test[numerical_col_test].head(6)

In [266]:
# pd.options.mode.chained_assignment = None  # default='warn'
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_test.loc[:,numerical_col_test] = scaler.fit_transform(X_test[numerical_col_test])
# X_test[numerical_col_test].head(6)

In [267]:
X_test.head(6)

Unnamed: 0,index,GrLivArea,1stFlrSF,MasVnrArea,GarageArea,TotalBsmtSF,OverallQual,index.1,Street_Grvl,Street_Pave,...,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
0,0.774483,0.272655,0.390528,0.08125,0.627376,0.42826,0.666667,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.16069,0.235511,0.339162,0.06125,0.469582,0.441048,0.666667,0.002747,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.08069,0.123288,0.183971,0.0,0.513308,0.292576,0.555556,0.005495,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.566207,0.255269,0.366485,0.0,0.737643,0.464442,0.777778,0.008242,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.686207,0.146733,0.216393,0.0,0.19962,0.314099,0.222222,0.010989,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.790345,0.253425,0.12714,0.0,0.307985,0.202745,0.666667,0.013736,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Make prediction AND and evaluation

In [268]:
y_pred = regressor.predict(X_test)
y_pred[:6]

array([[259811.87231272],
       [244052.25627791],
       [164861.94108952],
       [288881.78451914],
       [ 70302.53194102],
       [187333.77572339]])

In [269]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score, explained_variance_score

def Performance_metrics(y_pred, y_val):
    y_pred = y_pred.squeeze()
#     y_pred cua test ( validation) data
    y_val = y_val.squeeze()

    print("Mean square error (MSE): %.2f" % np.mean((y_pred - y_val) ** 2))
    print("Root mean square error (RMSE): %.2f" % np.sqrt(np.mean((y_pred - y_val) ** 2)))
    print("Mean absolute error (MAE): %.2f" % np.mean(abs(y_pred - y_val)))
    print("Coefficient of determination (R^2): %.2f" % r2_score(y_val, y_pred))

In [270]:
Performance_metrics(y_pred,y_test)

Mean square error (MSE): 1998440624.80
Root mean square error (RMSE): 44703.92
Mean absolute error (MAE): 35726.51
Coefficient of determination (R^2): 0.69
