In [1]:
import pandas as pd
import matplotlib
import numpy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
data = pd.read_csv('cleaned_RealEstateData.csv')
data

Unnamed: 0,Date,Region_Name,Area_Code,New_Build_Average_Price,New_Build_Index,New_Build_Monthly_Change,New_Build_Annual_Change,New_Build_Sales_Volume,Existing_Property_Average_Price,Existing_Property_Index,Existing_Property_Monthly_Change,Existing_Property_Annual_Change,Existing_Property_Sales_Volume
0,01/01/1996,Cumbria,E10000006,57787.96259,32.914004,-0.585386,1.425448,44,42193.57096,30.438916,-1.728058,-1.521743,335
1,01/01/1996,Sutton,E09000029,77884.03610,24.708490,0.175607,0.029447,10,69692.71573,22.672059,-0.316870,-2.453548,206
2,01/01/1996,Croydon,E09000008,67333.56841,24.767420,-0.087002,0.583562,7,67799.35123,23.390220,-0.603791,-1.743658,375
3,01/01/1996,Brent,E09000005,71314.19489,17.897684,1.223518,4.950428,3,73525.84789,17.237387,0.340368,1.935006,248
4,01/01/1996,Bromley,E09000006,85655.00173,22.602077,-0.288070,3.107497,12,82194.71320,22.057493,-0.949693,0.509837,340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73199,01/06/2022,Northumberland,E06000057,293944.88970,153.461353,-0.570438,14.324706,5,177313.25730,127.632415,-0.077461,1.908557,208
73200,01/06/2022,Portsmouth,E06000044,307808.94460,161.815711,-0.248941,16.021367,1,246100.48570,144.610517,0.301049,8.147350,167
73201,01/06/2022,Bournemouth Christchurch and Poole,E06000058,336650.69190,152.361529,1.310216,20.528874,3,333783.20360,140.183031,1.582028,10.216051,338
73202,01/06/2022,Darlington,E06000005,251520.18520,150.608543,-1.106401,19.673326,1,149338.12530,125.852799,-0.155702,7.116072,88


In [4]:
# Convert the 'Date' column to a numerical feature: year
data['Year'] = pd.to_datetime(data['Date']).dt.year
data

Unnamed: 0,Date,Region_Name,Area_Code,New_Build_Average_Price,New_Build_Index,New_Build_Monthly_Change,New_Build_Annual_Change,New_Build_Sales_Volume,Existing_Property_Average_Price,Existing_Property_Index,Existing_Property_Monthly_Change,Existing_Property_Annual_Change,Existing_Property_Sales_Volume,Year
0,01/01/1996,Cumbria,E10000006,57787.96259,32.914004,-0.585386,1.425448,44,42193.57096,30.438916,-1.728058,-1.521743,335,1996
1,01/01/1996,Sutton,E09000029,77884.03610,24.708490,0.175607,0.029447,10,69692.71573,22.672059,-0.316870,-2.453548,206,1996
2,01/01/1996,Croydon,E09000008,67333.56841,24.767420,-0.087002,0.583562,7,67799.35123,23.390220,-0.603791,-1.743658,375,1996
3,01/01/1996,Brent,E09000005,71314.19489,17.897684,1.223518,4.950428,3,73525.84789,17.237387,0.340368,1.935006,248,1996
4,01/01/1996,Bromley,E09000006,85655.00173,22.602077,-0.288070,3.107497,12,82194.71320,22.057493,-0.949693,0.509837,340,1996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73199,01/06/2022,Northumberland,E06000057,293944.88970,153.461353,-0.570438,14.324706,5,177313.25730,127.632415,-0.077461,1.908557,208,2022
73200,01/06/2022,Portsmouth,E06000044,307808.94460,161.815711,-0.248941,16.021367,1,246100.48570,144.610517,0.301049,8.147350,167,2022
73201,01/06/2022,Bournemouth Christchurch and Poole,E06000058,336650.69190,152.361529,1.310216,20.528874,3,333783.20360,140.183031,1.582028,10.216051,338,2022
73202,01/06/2022,Darlington,E06000005,251520.18520,150.608543,-1.106401,19.673326,1,149338.12530,125.852799,-0.155702,7.116072,88,2022


In [5]:
# Encoding categorical feature: 'Region_Name'
label_encoder = LabelEncoder()
data['Region_Code'] = label_encoder.fit_transform(data['Region_Name'])

In [6]:
data.columns

Index(['Date', 'Region_Name', 'Area_Code', 'New_Build_Average_Price',
       'New_Build_Index', 'New_Build_Monthly_Change',
       'New_Build_Annual_Change', 'New_Build_Sales_Volume',
       'Existing_Property_Average_Price', 'Existing_Property_Index',
       'Existing_Property_Monthly_Change', 'Existing_Property_Annual_Change',
       'Existing_Property_Sales_Volume', 'Year', 'Region_Code'],
      dtype='object')

In [15]:
# Feature scaling using StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[['Year', 
'Existing_Property_Average_Price','New_Build_Annual_Change','New_Build_Monthly_Change','Existing_Property_Monthly_Change',
                                            'Existing_Property_Annual_Change','Region_Code','New_Build_Index',
                                           'Existing_Property_Average_Price', 'Existing_Property_Index',
                                            'Existing_Property_Sales_Volume']])

In [16]:
# Splitting dataset into training and testing sets (80% train, 20% test)
X = scaled_features
y = data['New_Build_Average_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Model Building

from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

# Predict New_Build_Average_Price for the test set
y_pred = model.predict(X_test)

In [18]:
# Evaluation of the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize and train the linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared: {r2}")

Mean Absolute Error (MAE): 20352.52083785068
Mean Squared Error (MSE): 816951756.3286821
R-squared: 0.8829157337476989


#### check for Model Underfitting 

In [17]:
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)

In [21]:
# testing model performance on testing data
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
print(mse)

896181618.389524


In [23]:
#testing model accuracy on training data
tmae = mean_absolute_error(y_train, y_train_pred)
tmse = mean_absolute_error(y_train, y_train_pred)
print(tmse)

21904.229704912017


Since the MSE score of test data is significantly greater than the MSE score of the train data, the model is **Overfitted** and to deal with that, **Regularisation** is one way to do if nothing happens after reducing features

Features include: **Year, 'Region_Code', 
'Existing_Property_Average_Price', 'Existing_Property_Sales_Volume'**

In [17]:
# Checking relevance of the Year feature

#Check for correlation
data['Year'].corr(data['New_Build_Average_Price'])

0.674472226338768

In [18]:
# Checking relevance of the Region Code feature

#Check for correlation
data['Region_Code'].corr(data['New_Build_Average_Price'])

0.044522404849476484

No correlation for Region code and target variable 

In [19]:
# Checking relevance of the Exisitng_property average price feature

#Check for correlation
data['Existing_Property_Average_Price'].corr(data['New_Build_Average_Price'])

0.9192995936786021

In [20]:
# Checking relevance of the Exisitng_property average price feature

#Check for correlation
data['Existing_Property_Sales_Volume'].corr(data['New_Build_Average_Price'])

-0.06862576401422382

No correlation as well

#### Implementing Lasso Regularisation

In [12]:
from sklearn.linear_model import Lasso

In [13]:
lasso_model = Lasso()

In [14]:
lasso_model.fit(X_train, y_train)

In [15]:
y_pred = lasso_model.predict(X_test)
y_pred

array([ 68029.98815202, 196881.2126461 , 368428.79855908, ...,
       259401.32106488, 273431.637582  , 372686.42194787])

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [17]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared: {r2}")

Mean Absolute Error (MAE): 21702.374309377825
Mean Squared Error (MSE): 896181657.6763707
R-squared: 0.8715606264323974


In [18]:
lasso_model.coef_

array([16491.54200948, 67055.58077802])

In [19]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (58563, 2)
X_test shape: (14641, 2)
y_train shape: (58563,)
y_test shape: (14641,)


#### Implementing Ridge Regression

In [11]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [12]:
ridge = Ridge()

In [13]:
ridge.fit(X_train, y_train)

In [14]:
y_pred = ridge.predict(X_test)
y_pred

array([ 68045.96009859, 196835.59816046, 368384.87170054, ...,
       259352.89968103, 273352.65680844, 372621.52296825])

#### Performing cross validation as a form of testing the training algorithm

In [22]:
model = LinearRegression()


k = 5  # You can change the number of folds as needed
cv = KFold(n_splits=k, shuffle=True, random_state=42)  # K-Fold Cross-Validation

# Perform cross-validation and obtain scores
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')  

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)

# Calculate the mean and standard deviation of scores
mean_score = scores.mean()
std_score = scores.std()

# Print the mean and standard deviation of scores
print("Mean Score:", mean_score)
print("Standard Deviation Score:", std_score)

Cross-Validation Scores: [0.88115993 0.87997984 0.88117713 0.88294377 0.8780762 ]
Mean Score: 0.8806673740686547
Standard Deviation Score: 0.0016044534054848025
