In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.shape , test.shape

((1460, 81), (1459, 80))

###### LotFrontage: Linear feet of street connected to property
###### Alley: Type of alley access
###### FireplaceQu: Fireplace quality
###### PoolQC: Pool quality
###### Fence: Fence quality
###### MiscFeature: Miscellaneous feature not covered in other categories

In [4]:
train.isnull().sum()[train.isnull().sum()>1000]

Alley          1369
PoolQC         1453
Fence          1179
MiscFeature    1406
dtype: int64

In [5]:
test.isnull().sum()[test.isnull().sum()>1000]

Alley          1352
PoolQC         1456
Fence          1169
MiscFeature    1408
dtype: int64

# IMPUTATION

In [6]:
train = train.drop(['Alley','PoolQC','Fence','MiscFeature'] , axis = 1)
test = test.drop(['Alley','PoolQC','Fence','MiscFeature'] , axis = 1)

In [7]:
# len(train.isnull().sum().index)
train.isnull().sum()[train.isnull().sum()>0].index

Index(['LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')

In [8]:
# len(test.isna().sum().index)
test.isnull().sum()[test.isnull().sum()>0].index

Index(['MSZoning', 'LotFrontage', 'Utilities', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'SaleType'],
      dtype='object')

In [9]:
num_imp_median = SimpleImputer(strategy='median')
# train[['LotFrontage','MasVnrArea']] = num_imp_median.fit_transform(train[['LotFrontage','MasVnrArea']])
# test[['LotFrontage','MasVnrArea']] = num_imp_median.transform(test[['LotFrontage','MasVnrArea']])

train[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']] = num_imp_median.fit_transform(train[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']])
test[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']] = num_imp_median.transform(test[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']])

# num_imp_mean = SimpleImputer(strategy='mean')
# train[['GarageYrBlt']] = num_imp_mean.fit_transform(train[['GarageYrBlt']])
# test[['GarageYrBlt']] = num_imp_mean.transform(test[['GarageYrBlt']])

In [10]:
cat_col = SimpleImputer(strategy='most_frequent')
# train[['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
#        'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType',
#        'GarageFinish', 'GarageQual', 'GarageCond']] = cat_col.fit_transform(train[['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
#        'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType',
#        'GarageFinish', 'GarageQual', 'GarageCond']])

# test[['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
#        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
#        'GarageQual', 'GarageCond','Utilities']] = cat_col.transform(test[['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
#        'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
#        'GarageQual', 'GarageCond','Utilities']])

train[['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType','Electrical']] = cat_col.fit_transform(train[['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType','Electrical']])
test[['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType','Electrical']] = cat_col.fit_transform(test[['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType','Electrical']])

In [11]:
low_cardinality_cols_train = [cname for cname in train.columns if train[cname].nunique() < 5 and train[cname].dtype == "object"]
low_cardinality_cols_test = [cname for cname in test.columns if train[cname].nunique() < 5 and test[cname].dtype == "object"]
num_col_train = train.select_dtypes(include = 'number').columns
num_col_test = test.select_dtypes(include = 'number').columns
len(low_cardinality_cols_train),len(low_cardinality_cols_test) , len(num_col_train) , len(num_col_test)

(14, 14, 38, 37)

In [12]:
total_train_col = np.concatenate((low_cardinality_cols_train,num_col_train))
total_test_col = np.concatenate((low_cardinality_cols_test,num_col_test))

In [13]:
ftrain = train[total_train_col]
ftest = test[total_test_col]
ftrain.shape , ftest.shape 

((1460, 52), (1459, 51))

# ENCODING

In [14]:
ftrain.shape , ftest.shape

((1460, 52), (1459, 51))

In [15]:
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
ftrain_ohe = pd.DataFrame(ohe.fit_transform(ftrain[['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
       'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive']]))
ftest_ohe = pd.DataFrame(ohe.transform(ftest[['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
       'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive']]))
ftrain_ohe.index = ftrain.index
ftest_ohe.index = ftest.index

In [16]:
ftrain = pd.concat([ftrain,ftrain_ohe],axis = 1)
ftest = pd.concat([ftest,ftest_ohe],axis = 1)
ftrain = ftrain.iloc[:1459]

In [17]:
ftrain.shape , ftest.shape

((1459, 99), (1459, 98))

In [18]:
# ftrain.drop(['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
#        'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
#        'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive'],axis = 1,inplace = True)
# ftest.drop(['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
#        'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
#        'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive'],axis = 1,inplace = True)
ftrain.drop(['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
       'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive',46],axis = 1,inplace = True)
ftest.drop(['Street', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
       'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'CentralAir', 'KitchenQual', 'GarageFinish', 'PavedDrive',46],axis = 1,inplace = True)

In [19]:
ftrain

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,36,37,38,39,40,41,42,43,44,45
0,1,60,65.0,8450,7,5,2003,2003,196.0,706.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1455,20,62.0,7500,7,5,2004,2005,0.0,410.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
ftrain.shape , ftest.shape

((1459, 84), (1459, 83))

In [25]:
x = ftrain.drop('SalePrice',axis = 1)
y = ftrain.SalePrice
x.shape , y.shape

((1459, 83), (1459,))

In [26]:
xtrain , xtest , ytrain , ytest = train_test_split(x,y,train_size=0.7)

In [27]:
xtrain.shape , xtest.shape , ytrain.shape , ytest.shape

((1021, 83), (438, 83), (1021,), (438,))

# MODEL ACCURACY FUNCTION

In [54]:
def model_acc(estimator,xtrain,ytrain,xtest,ytest):
    estimator = estimator
    estimator.fit(xtrain,ytrain)
    ypred = estimator.predict(xtest)
    print('R-Score: ',np.round(estimator.score(xtest,ytest),5))
    print('MEAN ABSOLUTE ERROR: ', np.round(mean_absolute_error(ytest,ypred)))
    print('MEAN SQUARED ERROR: ', np.round(mean_squared_error(ytest,ypred)))
    print('R2_Score: ', np.round(r2_score(ytest,ypred),5))    

# LinearRegression

In [55]:
model_acc(LinearRegression(),xtrain,ytrain,xtest,ytest)

R-Score:  0.77005
MEAN ABSOLUTE ERROR:  19421.0
MEAN SQUARED ERROR:  1758167152.0
R2_Score:  0.77005


# DecisionTreeRegressor

In [56]:
model_acc(DecisionTreeRegressor(),xtrain,ytrain,xtest,ytest)

R-Score:  0.78812
MEAN ABSOLUTE ERROR:  27096.0
MEAN SQUARED ERROR:  1620057112.0
R2_Score:  0.78812


# RandomForestRegressor

In [57]:
model_acc(RandomForestRegressor(),xtrain,ytrain,xtest,ytest)

R-Score:  0.867
MEAN ABSOLUTE ERROR:  18623.0
MEAN SQUARED ERROR:  1016931815.0
R2_Score:  0.867


# SupportVectorMachine

In [58]:
model_acc(SVR(kernel='linear'),xtrain,ytrain,xtest,ytest)

R-Score:  0.65668
MEAN ABSOLUTE ERROR:  26733.0
MEAN SQUARED ERROR:  2624987718.0
R2_Score:  0.65668
