In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Load the mock dataset from the CSV that sits next to the notebook folder.
DATA_PATH = "../data/mockdata.csv"
df = pd.read_csv(DATA_PATH)

# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,age,housingtype,yearly_income,cpf_balance,yearly_expenditure,savings,quality_of_life,disaster_preparedness,retirement_readiness
0,23,4-Room Flat,45000,49950.0,6000,13500.0,1,3,2
1,28,1&2-Room Flat,75000,222000.0,20000,60000.0,3,4,3
2,54,3-Room Flat,102000,1283160.0,60000,346800.0,10,10,7
3,50,1&2-Room Flat,77000,854700.0,84000,231000.0,7,4,9
4,41,Executive Flat,109000,846930.0,96000,228900.0,8,10,8
...,...,...,...,...,...,...,...,...,...
49995,35,Apartment,71000,394050.0,84000,106500.0,3,5,4
49996,31,Executive Flat,67000,272690.0,72000,73700.0,2,5,2
49997,58,Landed Property,84000,1143240.0,60000,319200.0,10,9,10
49998,22,Executive Flat,50000,37000.0,5000,10000.0,1,1,4


In [10]:
# Replace the string-valued `housingtype` column with numeric category codes
# so that the downstream estimators can consume it.
from sklearn.preprocessing import OrdinalEncoder

# NOTE(review): with default settings the encoder presumably assigns codes in
# sorted-label order, which is not a meaningful ranking of housing types --
# confirm this is acceptable for the distance-based models used later.
encoder = OrdinalEncoder()
housingtype_codes = encoder.fit_transform(df[["housingtype"]])
df[["housingtype"]] = housingtype_codes

# Summary statistics of the now fully numeric frame.
df.describe()

Unnamed: 0,age,housingtype,yearly_income,cpf_balance,yearly_expenditure,savings,quality_of_life,disaster_preparedness,retirement_readiness
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,42.56376,3.27058,80103.84,750446.0,68395.42,207727.504,5.47526,5.44068,5.438
std,13.269787,2.283802,32231.120895,585125.5,33034.401591,165324.489147,3.747339,3.437481,3.437492
min,20.0,0.0,30000.0,0.0,5000.0,0.0,1.0,1.0,1.0
25%,31.0,1.0,54000.0,236060.0,60000.0,63800.0,2.0,2.0,2.0
50%,43.0,3.0,72000.0,643800.0,72000.0,174000.0,5.0,5.0,5.0
75%,54.0,5.0,105000.0,1196850.0,96000.0,330000.0,10.0,9.0,9.0
max,65.0,7.0,150000.0,2272500.0,108000.0,675000.0,10.0,10.0,10.0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed so the train/test partition is identical on every
# Restart & Run All; without it each run produces a different split.
RANDOM_STATE = 42

# Features (X): every column except the three rating columns.
# Target (y): one rating column at a time; swap the active line to model another.
X = df.drop(columns=["quality_of_life", "disaster_preparedness", "retirement_readiness"])
y = df["quality_of_life"]
# y = df["disaster_preparedness"]
# y = df["retirement_readiness"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# The features live on very different scales (age is in the tens, while
# cpf_balance reaches the millions), so standardise them to zero mean and
# unit variance. The scaler is fitted on the training split only and then
# applied unchanged to the test split, so no test-set statistics leak into
# training. StandardScaler is unsupervised, so the target is not passed.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Support-vector regression with an RBF kernel, fitted on the standardised
# training features and scored on the held-out test split.
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The hyperparameters used below are the best_params_ reported by this grid
# search, which is kept here (commented out) as a record of how they were found:
#
# param_grid = {
#     "C": np.logspace(-2, 3, num=7),
#     "gamma": np.logspace(-3, 2, num=6),
#     'epsilon': np.logspace(-3, 2, num=6)
# }
# svr = svm.SVR()
# grid = GridSearchCV(
#     svr,
#     param_grid=param_grid,
#     scoring="neg_mean_absolute_error"
# )
# grid.fit(X_train, y_train)
# print(grid.best_params_)  # {'C': 3.1622776601683795, 'epsilon': 1.0, 'gamma': 1.0}
# y_pred = grid.predict(X_test)

svr_params = {
    "kernel": 'rbf',
    "gamma": 1.0,
    "C": 3.1622776601683795,
    "epsilon": 1.0,
}
svr = svm.SVR(**svr_params)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

# Compute all regression metrics first, then report them together.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 0.4595386995651437
Mean Squared Error (MSE): 0.2874136377400039
Root Mean Squared Error (RMSE): 0.5361097254667219
R-squared (R²): 0.9796121351808064


In [13]:
# K-nearest-neighbours classifier: the target values are treated as discrete
# class labels rather than as a continuous quantity.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

knn = KNeighborsClassifier(n_neighbors=5)
y_pred = knn.fit(X_train, y_train).predict(X_test)

# Micro-averaged F1 pools true/false positives over all classes; for a
# single-label target such as this one it coincides with plain accuracy,
# hence the variable name.
accuracy = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print("F1:", accuracy)

F1: 0.9352


In [14]:
# Logistic regression classifier fitted on the standardised training features
# and scored by plain accuracy on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# max_iter=300 caps the solver's iterations -- presumably raised so the
# optimiser converges on this data; confirm against convergence warnings.
clf = LogisticRegression(max_iter=300).fit(X_train, y_train)

predictions = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy", accuracy)

Accuracy 0.9558
