### Importing required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Loading the dataset

In [2]:
df = pd.read_csv("HousePrices.csv")
df

Unnamed: 0,Id,Dwell_Type,Zone_Class,LotFrontage,LotArea,Road_Type,Alley,Property_Shape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Property_Sale_Price
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068,942,60,RL,,8755,Pave,,IR1,Lvl,AllPub,...,0,,GdPrv,,0,6,2009,WD,Normal,214000
2069,943,90,RL,63.0,7711,Pave,,IR1,Lvl,AllPub,...,0,,,,0,8,2007,Oth,Abnorml,150000
2070,944,90,RL,313.0,25000,Pave,,Reg,Low,AllPub,...,0,,,,0,6,2007,WD,Normal,143000
2071,945,20,RL,52.0,14375,Pave,,IR1,Lvl,NoSeWa,...,0,,,,0,1,2009,COD,Abnorml,137500


### EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2073 entries, 0 to 2072
Data columns (total 81 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   2073 non-null   int64  
 1   Dwell_Type           2073 non-null   int64  
 2   Zone_Class           2073 non-null   object 
 3   LotFrontage          1753 non-null   float64
 4   LotArea              2073 non-null   int64  
 5   Road_Type            2073 non-null   object 
 6   Alley                129 non-null    object 
 7   Property_Shape       2073 non-null   object 
 8   LandContour          2073 non-null   object 
 9   Utilities            2073 non-null   object 
 10  LotConfig            2073 non-null   object 
 11  LandSlope            2073 non-null   object 
 12  Neighborhood         2073 non-null   object 
 13  Condition1           2073 non-null   object 
 14  Condition2           2073 non-null   object 
 15  Dwelling_Type        2073 non-null   o

In [4]:
# finding column names that have null values
[(i,df[i].isnull().sum()) for i in df.columns if df[i].isnull().sum()>0]

[('LotFrontage', 320),
 ('Alley', 1944),
 ('MasVnrType', 14),
 ('MasVnrArea', 14),
 ('BsmtQual', 59),
 ('BsmtCond', 59),
 ('BsmtExposure', 61),
 ('BsmtFinType1', 59),
 ('BsmtFinType2', 60),
 ('Electrical', 1),
 ('FireplaceQu', 988),
 ('GarageType', 113),
 ('GarageYrBlt', 113),
 ('GarageFinish', 113),
 ('GarageQual', 113),
 ('GarageCond', 113),
 ('PoolQC', 2065),
 ('Fence', 1669),
 ('MiscFeature', 1993)]

In [5]:
# Creating new age and remodage columns 
df["Age"] = df["YearBuilt"] - df["YrSold"]
df["RemodAge"] = df["YearRemodAdd"] - df["YrSold"]

In [6]:
df.corr()

Unnamed: 0,Id,Dwell_Type,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Property_Sale_Price,Age,RemodAge
Id,1.0,0.043264,0.066902,0.004201,0.083367,0.195635,-0.080204,0.087397,-0.027147,-0.016424,...,0.001317,-0.045446,-0.00469,0.031303,-0.007318,0.000234,0.043219,-0.002491,-0.082004,0.08473
Dwell_Type,0.043264,1.0,-0.075636,-0.033609,0.046278,0.033023,0.027194,0.042864,-0.004832,-0.034072,...,-0.006529,-0.011966,-0.012021,0.001625,-0.003942,-0.020589,0.018891,-0.017399,0.02633,0.041706
LotFrontage,0.066902,-0.075636,1.0,0.311156,0.108095,-0.011213,0.09268,0.101034,0.12962,0.139424,...,-0.002309,0.042993,0.0215,0.124087,-0.005094,0.02511,0.025584,0.279836,0.091479,0.099587
LotArea,0.004201,-0.033609,0.311156,1.0,0.078232,0.014209,-0.010551,0.017766,0.06437,0.164989,...,-0.023199,0.010594,0.03537,0.062458,0.030949,-0.001707,-0.006478,0.211572,-0.010253,0.018219
OverallQual,0.083367,0.046278,0.108095,0.078232,1.0,-0.010092,0.335326,0.379579,0.283981,0.163501,...,-0.080219,0.000451,0.021561,0.045138,-0.034206,0.033685,-0.02113,0.523553,0.335839,0.38161
OverallCond,0.195635,0.033023,-0.011213,0.014209,-0.010092,1.0,-0.245303,0.084102,-0.083282,-0.035849,...,0.044501,0.017358,0.012021,-0.016993,0.041833,-0.025281,0.03505,-0.049926,-0.24654,0.081962
YearBuilt,-0.080204,0.027194,0.09268,-0.010551,0.335326,-0.245303,1.0,0.536129,0.263968,0.207941,...,-0.322195,0.034056,-0.049156,0.005788,-0.03862,0.027765,-0.006211,0.437662,0.999034,0.537457
YearRemodAdd,0.087397,0.042864,0.101034,0.017766,0.379579,0.084102,0.536129,1.0,0.166549,0.101401,...,-0.177739,0.039945,-0.036832,-0.009607,-0.011749,0.033514,0.058932,0.475565,0.532875,0.99788
MasVnrArea,-0.027147,-0.004832,0.12962,0.06437,0.283981,-0.083282,0.263968,0.166549,1.0,0.238535,...,-0.097651,0.029774,0.063607,0.002936,-0.035352,-0.017798,0.020028,0.482038,0.262774,0.165527
BsmtFinSF1,-0.016424,-0.034072,0.139424,0.164989,0.163501,-0.035849,0.207941,0.101401,0.238535,1.0,...,-0.089326,0.045092,0.046826,0.099504,-0.00152,-0.019242,0.030208,0.373276,0.206356,0.099607


In [7]:
# dropping unwanted columns
df.drop(["Id", "YearBuilt", "YearRemodAdd", "MoSold", "GarageYrBlt"], axis = 1, inplace = True)

In [8]:
columns = []
for i in df.columns:
    columns.append(i)
print(columns)

['Dwell_Type', 'Zone_Class', 'LotFrontage', 'LotArea', 'Road_Type', 'Alley', 'Property_Shape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'Dwelling_Type', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal

In [9]:
# filling missing values
df["MasVnrType"].fillna(value = df["MasVnrType"].mode()[0], inplace = True)
df["LotFrontage"].fillna(value = round(df["LotFrontage"].mean(), 1), inplace = True)
df["MasVnrArea"].fillna(value = round(df["MasVnrArea"].mean(), 1), inplace = True)
df["Electrical"].fillna(value = df["Electrical"].mode()[0], inplace = True)
df.Alley.fillna("None",inplace=True)
df.BsmtQual.fillna("None",inplace=True)
df.BsmtCond.fillna("None",inplace=True)
df.BsmtExposure.fillna('None',inplace=True)
df.BsmtFinType1.fillna("None",inplace=True)
df.BsmtFinType2.fillna("None",inplace=True)
df.FireplaceQu.fillna("None",inplace=True)
df.GarageType.fillna("None",inplace=True)
df.GarageFinish.fillna("None",inplace=True)
df.GarageQual.fillna("None",inplace=True)
df.GarageCond.fillna("None",inplace=True)
df.PoolQC.fillna("None",inplace=True)
df.MiscVal.fillna("None",inplace=True)
df.Fence.fillna("None",inplace=True)

In [10]:
df.drop(["Dwell_Type", "OverallCond", "BsmtFinSF2", "LowQualFinSF", "BsmtHalfBath", "BedroomAbvGr", "KitchenAbvGr", "EnclosedPorch",
"ScreenPorch", "PoolArea", "MiscVal", "YrSold", ], axis = 1, inplace = True)

In [11]:
obj_col = df.select_dtypes(include=['object']).columns
obj_col

Index(['Zone_Class', 'Road_Type', 'Alley', 'Property_Shape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'Dwelling_Type', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
       'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')

In [12]:
float_col = df.select_dtypes(include=['float64']).columns
float_col

Index(['LotFrontage', 'MasVnrArea'], dtype='object')

In [13]:
int_col = df.select_dtypes(include=['int64']).columns
int_col

Index(['LotArea', 'OverallQual', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', '3SsnPorch', 'Property_Sale_Price', 'Age',
       'RemodAge'],
      dtype='object')

In [14]:
# encoding ordinal data
LE = OrdinalEncoder()
df[['OverallQual', 'ExterQual', 'ExterCond',  'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC']] = LE.fit_transform(df[['OverallQual', 
                                                                       'ExterQual', 'ExterCond',  'BsmtQual',
                                                        'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
                                            'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC']])

In [15]:
# dropping columns after encoding
df.drop(['OverallQual', 'ExterQual', 'ExterCond',  'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC'], axis = 1, inplace = True)

In [16]:
# one hot encoding 
df = pd.get_dummies(df, columns=['Alley', 'Zone_Class', 'Road_Type',  'Property_Shape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'Dwelling_Type', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 
       'Foundation', 'Heating', 'CentralAir', 'Electrical',
         'GarageType',
       'GarageFinish', 'PavedDrive',  "MiscFeature", 
         'SaleType', 'SaleCondition', 'Fence'], drop_first = True)
df.sample(10)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,...,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None
1167,58.0,10852,0.0,786,173,959,959,712,1671,1,...,1,0,0,0,1,0,0,0,0,1
760,70.0,9100,0.0,612,252,864,864,0,864,0,...,1,0,0,0,1,0,0,0,0,1
853,72.1,12095,115.0,564,563,1127,1445,0,1445,0,...,1,0,0,0,1,0,0,1,0,0
724,86.0,13286,340.0,1234,464,1698,1698,0,1698,1,...,1,0,0,0,1,0,0,0,0,1
595,69.0,11302,238.0,1422,392,1814,1826,0,1826,1,...,0,0,0,0,0,1,0,0,0,1
125,60.0,6780,0.0,490,30,520,520,0,754,1,...,1,0,0,0,1,0,0,0,0,1
797,57.0,7677,0.0,570,203,773,773,0,773,0,...,1,0,0,0,0,0,0,0,0,1
1282,61.0,8800,0.0,532,364,1040,1040,0,1040,0,...,1,0,0,0,1,0,0,0,0,1
457,72.1,53227,0.0,1116,248,1364,1663,0,1663,1,...,1,0,0,0,1,0,0,0,0,1
1836,44.0,13758,117.0,902,254,1156,1187,530,1717,0,...,1,0,0,0,1,0,0,0,0,1


In [17]:
X = df.drop(labels= ["Property_Sale_Price"] , axis = 1)
Y = df["Property_Sale_Price"]

###  train and test data split

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 2)

### Using robust scaler

In [19]:
from sklearn.preprocessing import RobustScaler
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20,random_state=0)
st_x= RobustScaler()    
X_train= st_x.fit_transform(X_train)    
X_test= st_x.transform(X_test)

### Linear Regression

In [20]:
model = LinearRegression()

In [21]:
model.fit(X_train, Y_train)

LinearRegression()

In [22]:
model.score(X_train, Y_train)

0.9017555701370908

In [23]:
model.score(X_test, Y_test)

0.7784901938843787