Fit a linear model with individual property data, response:cost

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE

In [2]:
df = pd.read_csv('../../data/curated/individual_property_final.csv')
# Select features for prediction
df = df[['Cost', 'Property Type', 'Bedrooms', 'Bathrooms', 
         'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'CBD Distance', 'Train Distance', 
         'Electricity Distance', 'Hospital Distance', 'Library Distance', 'Park Distance', 
         'Tourist Attraction Distance', 'Grocery Distance', 'Year', 'Population', 'Income', 
         'Incidents Recorded', 'Gov Secondary Distance']]


In [3]:
categorical_columns = ['Property Type']
# Apply LabelEncoder to each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
df_train = df[(df['Year'] >= 2015) & (df['Year'] <= 2024)]
df_predict = df[(df['Year'] >= 2025) & (df['Year'] <= 2027)]

In [5]:
X = df_train.drop(columns=['Cost'])
y = df_train['Cost']

# LR

In [6]:
seed = 37 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [7]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [8]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Fit the model on the scaled training data
lr_model.fit(X_train_scaled, y_train)

In [9]:
baseline_pred = np.full_like(y_val, y_val.mean())
baseline_rmse = root_mean_squared_error(y_val, baseline_pred)
print(f'Baseline RMSE (mean prediction): {baseline_rmse}')

Baseline RMSE (mean prediction): 154.34533204277167


In [10]:
# Make predictions on the test set
y_pred = lr_model.predict(X_val_scaled)

# Calculate and print Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_val, y_pred)
print(f'LR Root Mean Squared Error: {rmse}')

LR Root Mean Squared Error: 125.4713415288683


Use cross validation to enhance

In [11]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print(f'Mean RMSE from cross-validation: {rmse_scores.mean()}')


Mean RMSE from cross-validation: 124.99616778346201


Still bad, try Feature selection in modelling

In [12]:


# Initialize the linear regression model
lr_model = LinearRegression()

# Use RFE to select the top n_features (e.g., 10)
rfe = RFE(estimator=lr_model, n_features_to_select=1)

# Fit the RFE model
rfe.fit(X_train_scaled, y_train)


In [13]:
# Get the feature rankings
feature_ranking = rfe.ranking_

# Get the feature names (assuming X is a DataFrame)
feature_names = X.columns

# Create a DataFrame for better visualization
ranking_df = pd.DataFrame({
    'Feature': feature_names,
    'Ranking': feature_ranking
})

# Sort by ranking to see the most important features (rank = 1 is selected)
ranking_df = ranking_df.sort_values(by='Ranking')
print(ranking_df)  # Shows the feature rankings


                        Feature  Ranking
17                       Income        1
1                      Bedrooms        2
8                Train Distance        3
15                         Year        4
2                     Bathrooms        5
3                  Age under 20        6
10            Hospital Distance        7
7                  CBD Distance        8
13  Tourist Attraction Distance        9
5                     Age 40-59       10
4                     Age 20-39       11
6                       Age 60+       12
12                Park Distance       13
9          Electricity Distance       14
16                   Population       15
18           Incidents Recorded       16
14             Grocery Distance       17
19       Gov Secondary Distance       18
0                 Property Type       19
11             Library Distance       20


In [14]:
# Select the indices of the top 10 features based on ranking
top_indices = np.argsort(rfe.ranking_)[:10]  # Get indices of top 10 ranked features

# Filter the dataset to include only the top 10 features
X_train_selected = X_train_scaled[:, top_indices]
X_val_selected = X_val_scaled[:, top_indices] 

In [15]:
# Fit the linear regression model using only the top 10 features
lr_model.fit(X_train_selected, y_train)
# Make predictions on the test set
y_pred = lr_model.predict(X_val_selected)

# Calculate RMSE
rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE: {rmse}')

RMSE: 126.06164985779246


Doesn't seem to improve, now try remove feature in random forest less than 0.03 importance

In [16]:
# Copy code from above to fit on selected subset
df_train = df_train.drop(['Property Type', 'Age under 20', 'Age 20-39', 'Age 40-59', 'Age 60+', 'Population', 
         'Incidents Recorded', 'Gov Secondary Distance'], axis=1)
X = df_train.drop(columns=['Cost'])
y = df_train['Cost']
seed = 37 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred = lr_model.predict(X_val_scaled)
rmse = root_mean_squared_error(y_val, y_pred)
print(f'LR selected Root Mean Squared Error: {rmse}')

LR selected Root Mean Squared Error: 126.59475986986939


Slightly worse, meaning those feature might be useful, move on to next model as this is worse than RF.