In [1874]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1875]:
# Load the mock dataset from the CSV that sits next to the notebook folder.
# The path is relative, so the notebook must be run from its own directory.
mock_csv = "../data/mockdata.csv"
df = pd.read_csv(mock_csv)

# Bare last expression -> rich (truncated) preview of the loaded frame.
df

Unnamed: 0,age,housingtype,yearly income,cpf balance,yearly expenditure,savings,quality of life,disaster preparedness,retirement readiness
0,40,4-Room Flat,78000,606060.0,96000,156000.0,1,6,1
1,33,Executive Flat,57000,295260.0,84000,74100.0,2,1,3
2,52,Condominium,67000,818070.0,96000,214400.0,3,1,3
3,51,3-Room Flat,113000,1337920.0,60000,350300.0,10,8,10
4,57,Landed Property,122000,1633580.0,84000,451400.0,8,5,8
...,...,...,...,...,...,...,...,...,...
4995,62,5-Room,148000,2144520.0,108000,621600.0,4,5,4
4996,40,Executive Flat,60000,466200.0,96000,120000.0,4,2,1
4997,32,Condominium,59000,283790.0,60000,70800.0,4,5,1
4998,45,3-Room Flat,139000,1337180.0,108000,347500.0,9,8,9


In [1876]:
# Turn the categorical `housingtype` column into numeric codes so that the
# models further down can consume it.
from sklearn.preprocessing import OrdinalEncoder

# NOTE(review): with default settings OrdinalEncoder derives the category
# order from the data itself, so the resulting codes presumably do not follow
# any meaningful housing hierarchy -- consider passing an explicit
# `categories=[...]` ordering. TODO confirm.
# NOTE(review): the preview shows both "5-Room" and "...-Room Flat" style
# labels and the encoded maximum is 7.0 (i.e. 8 codes) -- the raw labels may
# be inconsistent; verify against the CSV.
encoder = OrdinalEncoder()
housing_codes = encoder.fit_transform(df[["housingtype"]])

# Overwrites the original string column in place; `df` now holds float codes.
df[["housingtype"]] = housing_codes

# Summary statistics of every (now fully numeric) column.
df.describe()

Unnamed: 0,age,housingtype,yearly income,cpf balance,yearly expenditure,savings,quality of life,disaster preparedness,retirement readiness
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,42.5232,3.292,80039.0,770556.2,68554.8,207201.94,5.5268,5.5406,5.5574
std,13.269449,2.293423,32337.913154,584688.1,33542.40921,165006.890308,3.201333,3.212229,3.207049
min,20.0,0.0,30000.0,11100.0,5000.0,0.0,1.0,1.0,1.0
25%,31.0,1.0,54000.0,249750.0,60000.0,62325.0,3.0,3.0,3.0
50%,43.0,3.0,72000.0,678025.0,72000.0,177000.0,6.0,6.0,6.0
75%,54.0,5.0,104250.0,1225868.0,96000.0,327125.0,8.0,9.0,9.0
max,65.0,7.0,150000.0,2272500.0,108000.0,675000.0,10.0,10.0,10.0


In [1877]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed so the train/test partition -- and therefore every metric printed
# by the model cells below -- is reproducible across kernel restarts.
RANDOM_STATE = 42

# Split the data into features (X) and target (y).
# All three survey scores are removed from X so that none of the possible
# targets leaks into the feature matrix.
X = df.drop(columns=["quality of life", "disaster preparedness", "retirement readiness"])
y = df["quality of life"]
# Alternative targets (swap in to model a different score):
# y = df["disaster preparedness"]
# y = df["retirement readiness"]

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# The features live on very different scales (age in tens, CPF balance in the
# millions), and the models used below (RBF SVR, KNN, logistic regression) are
# sensitive to feature scale, so standardise each column to zero mean / unit
# variance.
# The scaler is fitted on the training split only and then re-used on the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [1878]:
# SVR model: treats the target score as a continuous value (regression).
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyperparameter provenance: the values below were recorded from an earlier
# GridSearchCV(svm.SVR(), scoring="neg_mean_absolute_error") run over
#   C       in np.logspace(-2, 3, num=7)
#   gamma   in np.logspace(-3, 2, num=6)
#   epsilon in np.logspace(-3, 2, num=6)
# whose best_params_ were {'C': 3.1622776601683795, 'epsilon': 1.0, 'gamma': 1.0}.
# The search itself is not re-run here; only its result is hard-coded.
svr_params = {
    "kernel": 'rbf',
    "gamma": 1.0,
    "C": 3.1622776601683795,
    "epsilon": 1.0,
}
svr = svm.SVR(**svr_params)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

# Regression metrics on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed
r2 = r2_score(y_test, y_pred)

# Report every metric, one per line, in the order MAE, MSE, RMSE, R².
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
):
    print(f"{label}: {value}")

Mean Absolute Error (MAE): 1.788802826396273
Mean Squared Error (MSE): 4.633585401181629
Root Mean Squared Error (RMSE): 2.1525764565240486
R-squared (R²): 0.5589608628710401


In [1879]:
# KNN model: treats each distinct target score as its own class
# (classification), unlike the SVR cell, which regresses on the same target.
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour classifier fitted on the standardised training features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# NOTE: this re-binds `y_pred`, replacing the SVR predictions from the cell above.
y_pred = knn.predict(X_test)

# Micro-averaged F1 pools true/false positives over all classes.
# NOTE(review): for single-label multiclass targets this should coincide with
# plain accuracy, which is presumably why it is stored as `accuracy` -- confirm.
accuracy = f1_score(y_test, y_pred, average='micro')
print("F1:", accuracy)

F1: 0.254


In [1880]:
# Logistic Regression Model: a second classifier over the same
# standardised features, with the target scores treated as class labels.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Default hyperparameters; fit() returns the estimator, so it can be chained.
clf = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score exact-match accuracy.
# NOTE: `accuracy` re-binds the name used for the KNN F1 score in the cell above.
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy", accuracy)

Accuracy 0.341
