In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

%matplotlib inline
sns.set(color_codes=True)
plt.rcParams["figure.figsize"]=[10,5]

In [2]:
url = 'https://raw.githubusercontent.com/HasnainTariq1/Machine-Learning/main/Linear%20Regression%20model%20to%20predict%20House%20price/train.csv'

full_data=pd.read_csv(url)
full_data.head(50)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
full_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Creating new features to enhance the model's predictive capabilities

In [4]:

# Calculating the total number of half bathrooms by summing basement and above-grade half baths
full_data['TotalHalfBaths']=full_data['BsmtHalfBath'] + full_data['HalfBath']

# Calculating the total number of full bathrooms by summing basement and above-grade full baths
full_data['TotalFullBaths']=full_data['BsmtFullBath'] + full_data['FullBath']

# Calculating total square footage of the house by summing the square footage of all levels
full_data['TotalSquareFeet']=full_data['TotalBsmtSF'] + full_data['1stFlrSF'] + full_data['2ndFlrSF'] + full_data['GrLivArea']

# Selecting relevant features for the model based on their importance for predicting sale price
relevant_data=full_data[['Id','LotArea', 'TotalSquareFeet','BedroomAbvGr','TotRmsAbvGrd','TotalHalfBaths' ,'TotalFullBaths','GarageArea','PoolArea','YearBuilt','SalePrice']]

In [5]:
# Display the first 50 rows of the relevant_data DataFrame
relevant_data.head(50)

Unnamed: 0,Id,LotArea,TotalSquareFeet,BedroomAbvGr,TotRmsAbvGrd,TotalHalfBaths,TotalFullBaths,GarageArea,PoolArea,YearBuilt,SalePrice
0,1,8450,4276,3,8,1,3,548,0,2003,208500
1,2,9600,3786,3,6,1,2,460,0,1976,181500
2,3,11250,4492,3,6,1,3,608,0,2001,223500
3,4,9550,4190,3,7,0,2,642,0,1915,140000
4,5,14260,5541,4,9,1,3,836,0,2000,250000
5,6,14115,3520,1,5,1,2,480,0,1993,143000
6,7,10084,5074,3,7,0,3,636,0,2004,307000
7,8,10382,5287,3,7,1,3,484,0,1973,200000
8,9,6120,4500,2,8,0,2,468,0,1931,129900
9,10,7420,3145,2,5,0,2,205,0,1939,118000


In [6]:
relevant_data.shape

(1460, 11)

In [7]:
# Identify duplicated rows in the relevant_data DataFrame
df_duplicate=relevant_data[relevant_data.duplicated()]

# Print the shape of the DataFrame containing duplicates
print(df_duplicate.shape)

(0, 11)


In [8]:
# Check for missing values in the relevant_data DataFrame
print(relevant_data.isnull().sum())

Id                 0
LotArea            0
TotalSquareFeet    0
BedroomAbvGr       0
TotRmsAbvGrd       0
TotalHalfBaths     0
TotalFullBaths     0
GarageArea         0
PoolArea           0
YearBuilt          0
SalePrice          0
dtype: int64


In [9]:
# Create a DataFrame X containing all columns except the target variable 'SalePrice'
X = relevant_data.drop('SalePrice',axis=True)

# Create a Series Y containing only the target variable 'SalePrice'
Y = relevant_data['SalePrice']

# Print the shape of X & Y DataFrame
print(X.shape)
print(Y.shape)

(1460, 10)
(1460,)


In [10]:
# Reshape Y to a 2D array for scaling
Y = Y.values.reshape(-1,1)

# Create an instance of StandardScaler for feature scaling of X
preprocess_x= preprocessing.StandardScaler()
# Fit the scaler on X and transform the data
X_transform = preprocess_x.fit_transform(X)

# Create an instance of StandardScaler for feature scaling of Y
preprocess_y= preprocessing.StandardScaler()
# Fit the scaler on Y and transform the data
Y_transform = preprocess_y.fit_transform(Y)

In [11]:
#Spliting the Data
x_train,x_test,y_train,y_test=train_test_split(X_transform,Y_transform,test_size=.20,random_state=101)

In [12]:
print("Shape of x_train :",x_train.shape)
print("Shape of y_train :",y_train.shape)
print('---------------------------------------')
print("Shape of x_test :",x_test.shape)
print("Shape of y_test :",y_test.shape)

Shape of x_train : (1168, 10)
Shape of y_train : (1168, 1)
---------------------------------------
Shape of x_test : (292, 10)
Shape of y_test : (292, 1)


In [13]:


#creating instance of model
linear_reg=LinearRegression()

#Passing training data to model
linear_reg.fit(x_train,y_train)

In [14]:
# Model Testing
y_pred=linear_reg.predict(x_test)


In [15]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = linear_reg.score(x_test, y_test)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.48678978966221537
R-squared: 0.4842869469723783
