In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns


# Selecting the predictors and the target variable

In [3]:
houses_df = pd.read_csv('/workspaces/House_Sale/Houses_Cleaned_data.csv')

In [4]:
houses_df.columns

Index(['Unnamed: 0', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

In [5]:
y = houses_df[['SalePrice']]
y.columns

Index(['SalePrice'], dtype='object')

In [6]:
x = houses_df.iloc[:, :-1]
x.columns


Index(['Unnamed: 0', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

# Testing Assumptions
**Linearity: The relationship between the predictors and the target should be linear.**

**Independence: Observations should be independent of each other.**

**Homoscedasticity: The variance of errors should be constant across all levels of the independent variables.**

**Normality of residuals: Residuals (the difference between predicted and actual values) should be normally distributed.**

In [7]:
# Creating Dummy variables
categorical_columns = x.select_dtypes(include=['object', 'category', 'bool']).columns

print("Categorical Features:", categorical_columns)

Categorical Features: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [8]:

categorical_columns = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
                       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
                       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
                       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
                       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                       'SaleType', 'SaleCondition']


df_encoded = pd.get_dummies(houses_df, columns=categorical_columns, drop_first=True)


In [9]:
correlation_with_target = df_encoded.corr()['SalePrice']
correlation_with_target


Unnamed: 0              -0.021917
Id                      -0.021917
MSSubClass              -0.084284
LotFrontage              0.320006
LotArea                  0.263843
                           ...   
SaleCondition_AdjLand   -0.050686
SaleCondition_Alloca    -0.015525
SaleCondition_Family    -0.046480
SaleCondition_Normal    -0.153990
SaleCondition_Partial    0.352060
Name: SalePrice, Length: 232, dtype: float64

In [10]:
filtered_correlation = correlation_with_target[(correlation_with_target > 0.4) | (correlation_with_target < -0.4)]
filtered_correlation

OverallQual             0.790982
YearBuilt               0.522897
YearRemodAdd            0.507101
MasVnrArea              0.475890
TotalBsmtSF             0.613581
1stFlrSF                0.605852
GrLivArea               0.708624
FullBath                0.560664
TotRmsAbvGrd            0.533723
Fireplaces              0.466929
GarageCars              0.640409
GarageArea              0.623431
SalePrice               1.000000
Neighborhood_NridgHt    0.402149
ExterQual_Gd            0.452466
ExterQual_TA           -0.589044
Foundation_PConc        0.497734
BsmtQual_TA            -0.494737
BsmtFinType1_GLQ        0.434622
KitchenQual_TA         -0.519298
GarageType_Detchd      -0.404563
GarageFinish_Unf       -0.506542
Name: SalePrice, dtype: float64

In [11]:
x_features = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', 
    '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 
    'GarageCars', 'GarageArea', 'Neighborhood_NridgHt', 'ExterQual_Gd', 
    'ExterQual_TA', 'Foundation_PConc', 'BsmtQual_TA', 'BsmtFinType1_GLQ', 
    'KitchenQual_TA', 'GarageType_Detchd', 'GarageFinish_Unf'
]

x_features_df = df_encoded[x_features]

x_features_df


Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,GarageType_Detchd,GarageFinish_Unf
0,7,2003,2003,196.0,856,856,1710,2,8,0,...,548,False,True,False,True,False,True,False,False,False
1,6,1976,1976,0.0,1262,1262,1262,2,6,1,...,460,False,False,True,False,False,False,True,False,False
2,7,2001,2002,162.0,920,920,1786,2,6,1,...,608,False,True,False,True,False,True,False,False,False
3,7,1915,1970,0.0,756,961,1717,1,7,1,...,642,False,False,True,False,True,False,False,True,True
4,8,2000,2000,350.0,1145,1145,2198,2,9,1,...,836,False,True,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1999,2000,0.0,953,953,1647,2,7,1,...,460,False,False,True,True,False,False,True,False,False
1456,6,1978,1988,119.0,1542,2073,2073,2,7,2,...,500,False,False,True,False,False,False,True,False,True
1457,7,1941,2006,0.0,1152,1188,2340,2,9,2,...,252,False,False,False,False,True,True,False,False,False
1458,5,1950,1996,0.0,1078,1078,1078,1,5,0,...,240,False,False,True,False,True,True,False,False,True


In [12]:
y_target = df_encoded['SalePrice']

**Independence**

In [None]:
highly_correlated_variables = df_encoded[['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', 
    '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 
    'GarageCars', 'GarageArea', 'Neighborhood_NridgHt', 'ExterQual_Gd', 
    'ExterQual_TA', 'Foundation_PConc', 'BsmtQual_TA', 'BsmtFinType1_GLQ', 
    'KitchenQual_TA', 'GarageType_Detchd', 'GarageFinish_Unf']]
correlation_matrix = highly_correlated_variables.corr()


correlation_matrix

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,GarageType_Detchd,GarageFinish_Unf
OverallQual,1.0,0.572323,0.550684,0.411074,0.537808,0.476224,0.593007,0.5506,0.427452,0.396765,...,0.562022,0.368724,0.547731,-0.646247,0.568211,-0.556755,0.434125,-0.553891,-0.399385,-0.527719
YearBuilt,0.572323,1.0,0.592855,0.315726,0.391452,0.281986,0.19901,0.468271,0.095589,0.147716,...,0.478954,0.268898,0.55791,-0.589126,0.651199,-0.649605,0.480674,-0.459792,-0.543072,-0.616193
YearRemodAdd,0.550684,0.592855,1.0,0.18006,0.291066,0.240379,0.287389,0.439046,0.19174,0.112581,...,0.3716,0.243558,0.531482,-0.565424,0.569728,-0.533899,0.402723,-0.576964,-0.331601,-0.462122
MasVnrArea,0.411074,0.315726,0.18006,1.0,0.362811,0.342337,0.390104,0.276193,0.279875,0.247818,...,0.372971,0.297337,0.145728,-0.261284,0.193938,-0.206403,0.227309,-0.196706,-0.209341,-0.271764
TotalBsmtSF,0.537808,0.391452,0.291066,0.362811,1.0,0.81953,0.454868,0.323722,0.285573,0.339519,...,0.486665,0.293445,0.283254,-0.390398,0.307701,-0.313182,0.313518,-0.31119,-0.344318,-0.368231
1stFlrSF,0.476224,0.281986,0.240379,0.342337,0.81953,1.0,0.566024,0.380637,0.409516,0.410531,...,0.489782,0.254693,0.208732,-0.312844,0.197188,-0.223524,0.243613,-0.273566,-0.344948,-0.316488
GrLivArea,0.593007,0.19901,0.287389,0.390104,0.454868,0.566024,1.0,0.630012,0.825489,0.461679,...,0.468997,0.179892,0.302516,-0.386146,0.292764,-0.302291,0.194555,-0.350179,-0.261229,-0.319531
FullBath,0.5506,0.468271,0.439046,0.276193,0.323722,0.380637,0.630012,1.0,0.554784,0.243671,...,0.405656,0.203038,0.425119,-0.466339,0.461667,-0.495636,0.259089,-0.418698,-0.334431,-0.438946
TotRmsAbvGrd,0.427452,0.095589,0.19174,0.279875,0.285573,0.409516,0.825489,0.554784,1.0,0.326114,...,0.337822,0.168094,0.166488,-0.242485,0.192609,-0.162767,0.064335,-0.210428,-0.175447,-0.208845
Fireplaces,0.396765,0.147716,0.112581,0.247818,0.339519,0.410531,0.461679,0.243671,0.326114,1.0,...,0.269141,0.151202,0.152868,-0.200168,0.10351,-0.16716,0.115473,-0.186137,-0.284338,-0.267282


In [19]:
dependent_variables = correlation_matrix[(correlation_matrix < 0.4) | (correlation_matrix > -0.4)]
dependent_variables

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,GarageType_Detchd,GarageFinish_Unf
OverallQual,1.0,0.572323,0.550684,0.411074,0.537808,0.476224,0.593007,0.5506,0.427452,0.396765,...,0.562022,0.368724,0.547731,-0.646247,0.568211,-0.556755,0.434125,-0.553891,-0.399385,-0.527719
YearBuilt,0.572323,1.0,0.592855,0.315726,0.391452,0.281986,0.19901,0.468271,0.095589,0.147716,...,0.478954,0.268898,0.55791,-0.589126,0.651199,-0.649605,0.480674,-0.459792,-0.543072,-0.616193
YearRemodAdd,0.550684,0.592855,1.0,0.18006,0.291066,0.240379,0.287389,0.439046,0.19174,0.112581,...,0.3716,0.243558,0.531482,-0.565424,0.569728,-0.533899,0.402723,-0.576964,-0.331601,-0.462122
MasVnrArea,0.411074,0.315726,0.18006,1.0,0.362811,0.342337,0.390104,0.276193,0.279875,0.247818,...,0.372971,0.297337,0.145728,-0.261284,0.193938,-0.206403,0.227309,-0.196706,-0.209341,-0.271764
TotalBsmtSF,0.537808,0.391452,0.291066,0.362811,1.0,0.81953,0.454868,0.323722,0.285573,0.339519,...,0.486665,0.293445,0.283254,-0.390398,0.307701,-0.313182,0.313518,-0.31119,-0.344318,-0.368231
1stFlrSF,0.476224,0.281986,0.240379,0.342337,0.81953,1.0,0.566024,0.380637,0.409516,0.410531,...,0.489782,0.254693,0.208732,-0.312844,0.197188,-0.223524,0.243613,-0.273566,-0.344948,-0.316488
GrLivArea,0.593007,0.19901,0.287389,0.390104,0.454868,0.566024,1.0,0.630012,0.825489,0.461679,...,0.468997,0.179892,0.302516,-0.386146,0.292764,-0.302291,0.194555,-0.350179,-0.261229,-0.319531
FullBath,0.5506,0.468271,0.439046,0.276193,0.323722,0.380637,0.630012,1.0,0.554784,0.243671,...,0.405656,0.203038,0.425119,-0.466339,0.461667,-0.495636,0.259089,-0.418698,-0.334431,-0.438946
TotRmsAbvGrd,0.427452,0.095589,0.19174,0.279875,0.285573,0.409516,0.825489,0.554784,1.0,0.326114,...,0.337822,0.168094,0.166488,-0.242485,0.192609,-0.162767,0.064335,-0.210428,-0.175447,-0.208845
Fireplaces,0.396765,0.147716,0.112581,0.247818,0.339519,0.410531,0.461679,0.243671,0.326114,1.0,...,0.269141,0.151202,0.152868,-0.200168,0.10351,-0.16716,0.115473,-0.186137,-0.284338,-0.267282


In [20]:
dependent_variables.columns

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
       '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'Neighborhood_NridgHt', 'ExterQual_Gd',
       'ExterQual_TA', 'Foundation_PConc', 'BsmtQual_TA', 'BsmtFinType1_GLQ',
       'KitchenQual_TA', 'GarageType_Detchd', 'GarageFinish_Unf'],
      dtype='object')

In [23]:
final_x_features = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
       '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'Neighborhood_NridgHt', 'ExterQual_Gd',
       'ExterQual_TA', 'Foundation_PConc', 'BsmtQual_TA', 'BsmtFinType1_GLQ',
       'KitchenQual_TA', 'GarageType_Detchd', 'GarageFinish_Unf']

predictor_variables = df_encoded[final_x_features]
predictor_variables

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,GarageType_Detchd,GarageFinish_Unf
0,7,2003,2003,196.0,856,856,1710,2,8,0,...,548,False,True,False,True,False,True,False,False,False
1,6,1976,1976,0.0,1262,1262,1262,2,6,1,...,460,False,False,True,False,False,False,True,False,False
2,7,2001,2002,162.0,920,920,1786,2,6,1,...,608,False,True,False,True,False,True,False,False,False
3,7,1915,1970,0.0,756,961,1717,1,7,1,...,642,False,False,True,False,True,False,False,True,True
4,8,2000,2000,350.0,1145,1145,2198,2,9,1,...,836,False,True,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1999,2000,0.0,953,953,1647,2,7,1,...,460,False,False,True,True,False,False,True,False,False
1456,6,1978,1988,119.0,1542,2073,2073,2,7,2,...,500,False,False,True,False,False,False,True,False,True
1457,7,1941,2006,0.0,1152,1188,2340,2,9,2,...,252,False,False,False,False,True,True,False,False,False
1458,5,1950,1996,0.0,1078,1078,1078,1,5,0,...,240,False,False,True,False,True,True,False,False,True


In [24]:
X_train, X_test, y_train, y_test = train_test_split(predictor_variables, y_target, test_size=0.2, random_state=42)


In [25]:
model = LinearRegression()

model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
y_pred

array([150257.65127594, 304318.51634342, 118960.51547911, 169215.29341873,
       281780.34132415,  53046.67608718, 227876.69518909, 162229.85065257,
        55485.65408036, 120564.88426007, 144819.06237638, 102970.27305976,
       129698.24025186, 249151.26238927, 187211.64349475, 121638.14665174,
       210421.27462282, 125501.38482017, 119939.12597127, 218580.77909371,
       180584.54440431, 210790.6218439 , 188389.04970636, 112104.56041769,
       213064.98810487, 179858.61281489, 207103.0944254 ,  87703.66573761,
       182646.29431462, 204898.29450396, 126672.78755676, 288096.92199893,
       207196.26982268,  82041.43503154, 270588.08371896, 143886.63237428,
       151164.7189717 , 219957.62462546, 295870.8748346 ,  83195.88383179,
       140804.7563445 , 254078.27991918, 102296.15568277, 345913.12753596,
       116689.52533746, 157086.26083686, 100622.88617485, 106055.31449936,
       387590.85252315, 127378.50905613,  99570.67448875, 203606.83889745,
       127567.59071816, 2

In [27]:
regr = LinearRegression() 
regr.fit(X_train, y_train) 
print(regr.score(X_test, y_test))

0.823540979827996


# Testing for Homoskedasticity