In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns


# Selecting the predictors and the target variable

In [5]:
# Load the cleaned housing data into a DataFrame.
# NOTE(review): this is an absolute, environment-specific path — consider
# resolving it from a configurable data directory so the notebook runs elsewhere.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column; presumably a
# saved index from an earlier to_csv call — confirm whether it should be kept.
houses_csv_path = '/workspaces/House_Sale/Houses_Cleaned_data.csv'
houses_df = pd.read_csv(houses_csv_path)

In [6]:
# Display the column labels of the loaded frame to see which fields are available.
houses_df.columns

Index(['Unnamed: 0', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

In [11]:
# Target variable. Selecting with a list keeps y as a one-column DataFrame
# (not a Series), which is why y.columns is available below.
target_columns = ['SalePrice']
y = houses_df[target_columns]
y.columns

Index(['SalePrice'], dtype='object')

In [8]:
# Predictors: every column except the target.
# The target is dropped by name rather than by slicing off the last column by
# position, so 'SalePrice' stays out of the predictors (and no real predictor
# is lost) even if the column order of the CSV changes.
# NOTE(review): 'Unnamed: 0' and 'Id' remain in x; they look like row
# identifiers rather than features — confirm whether they should be kept.
x = houses_df.drop(columns=['SalePrice'])
x.columns


Index(['Unnamed: 0', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPo

# Testing Assumptions
**Linearity: The relationship between the predictors and the target should be linear.**

**Independence: Observations should be independent of each other.**

**Homoscedasticity: The variance of errors should be constant across all levels of the independent variables.**

**Normality of residuals: Residuals (the differences between actual and predicted values) should be normally distributed.**

In [None]:
# Creating Dummy variables
# Step 1: find the predictors that are not numeric (string/object, pandas
# categorical, or boolean dtype); these are the ones that need encoding.
categorical_dtypes = ['object', 'category', 'bool']
categorical_columns = x.select_dtypes(include=categorical_dtypes).columns

print("Categorical Features:", categorical_columns)

Categorical Features: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [None]:

# One-hot encode the categorical features of the full frame.
#
# The list of columns to encode is derived from the data (every non-target
# column with an object, categorical or boolean dtype) instead of being a
# hand-typed copy of a previously printed result, so it cannot drift out of
# sync with the CSV. It is kept as a plain list of column names.
categorical_columns = (
    houses_df
    .drop(columns=['SalePrice'])
    .select_dtypes(include=['object', 'category', 'bool'])
    .columns
    .tolist()
)

# drop_first=True removes the first level of every encoded feature, so a
# feature with k categories becomes k-1 indicator columns (avoids perfectly
# collinear dummies in the regression).
df_encoded = pd.get_dummies(houses_df, columns=categorical_columns, drop_first=True)



   Unnamed: 0  Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
0           0   1          60         65.0     8450            7            5   
1           1   2          20         80.0     9600            6            8   
2           2   3          60         68.0    11250            7            5   
3           3   4          70         60.0     9550            7            5   
4           4   5          60         84.0    14260            8            5   

   YearBuilt  YearRemodAdd  MasVnrArea  ...  SaleType_ConLI  SaleType_ConLw  \
0       2003          2003       196.0  ...           False           False   
1       1976          1976         0.0  ...           False           False   
2       2001          2002       162.0  ...           False           False   
3       1915          1970         0.0  ...           False           False   
4       2000          2000       350.0  ...           False           False   

   SaleType_New  SaleType_Oth  SaleTyp

In [None]:

# Correlation of every encoded column with the target.
# The full pairwise correlation matrix is computed first, then only its
# 'SalePrice' column is kept (this includes SalePrice's correlation with itself).
correlation_matrix = df_encoded.corr()
correlation_with_target = correlation_matrix['SalePrice']
correlation_with_target


Unnamed: 0              -0.021917
Id                      -0.021917
MSSubClass              -0.084284
LotFrontage              0.320006
LotArea                  0.263843
                           ...   
SaleCondition_AdjLand   -0.050686
SaleCondition_Alloca    -0.015525
SaleCondition_Family    -0.046480
SaleCondition_Normal    -0.153990
SaleCondition_Partial    0.352060
Name: SalePrice, Length: 232, dtype: float64

In [37]:
# Keep only the columns whose correlation with SalePrice is stronger than 0.4
# in either direction (strictly greater in absolute value).
# SalePrice itself is part of correlation_with_target, so it passes this filter too.
is_strongly_correlated = correlation_with_target.abs() > 0.4
filtered_correlation = correlation_with_target[is_strongly_correlated]
filtered_correlation

OverallQual             0.790982
YearBuilt               0.522897
YearRemodAdd            0.507101
MasVnrArea              0.475890
TotalBsmtSF             0.613581
1stFlrSF                0.605852
GrLivArea               0.708624
FullBath                0.560664
TotRmsAbvGrd            0.533723
Fireplaces              0.466929
GarageCars              0.640409
GarageArea              0.623431
SalePrice               1.000000
Neighborhood_NridgHt    0.402149
ExterQual_Gd            0.452466
ExterQual_TA           -0.589044
Foundation_PConc        0.497734
BsmtQual_TA            -0.494737
BsmtFinType1_GLQ        0.434622
KitchenQual_TA         -0.519298
GarageType_Detchd      -0.404563
GarageFinish_Unf       -0.506542
Name: SalePrice, dtype: float64