In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# For XGBoost
from xgboost import XGBRegressor

# For CatBoost
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv('/home/jonnyoh/code/JonnyPOH/portfolio/projects/mlproject_stu_score/data/homeless_prep.csv')

In [3]:
df_cleaned = df.drop_duplicates(subset='CLIENT_KEY')
df_cleaned.reset_index(drop=True, inplace=True)
mean_value = df_cleaned['NIGHTS'].mean()
df_cleaned.loc[:, 'NIGHTS'] = df_cleaned['NIGHTS'].fillna(mean_value)

In [4]:
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df_cleaned[['GENDER','VETERAN','substanceabuse','completed','probation']])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['GENDER', 'VETERAN', 'substanceabuse', 'completed', 'probation']))
df_encoded = pd.concat([df_cleaned.drop(['CLIENT_KEY','GENDER','VETERAN','substanceabuse','completed','probation','assistancetype','required'], axis=1), encoded_df], axis=1)

In [5]:
X = df_encoded.drop(columns=['NIGHTS'],axis=1)
y = df_encoded['NIGHTS']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square


In [8]:
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGB': XGBRegressor(),
    'Cat Boost': CatBoostRegressor(),
    'Ada Boost': AdaBoostRegressor()
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('model perf for training')
    print(" RMSE: {:.4f}". format(model_train_rmse))
    print("-MSE {:.4f}". format(model_train_mae))
    print(" R2: {:.4f}". format(model_train_r2))
    r2_list.append(model_train_r2)

    print('='*35)
    print('\n')


Linear Regression
model perf for training
 RMSE: 71.0980
-MSE 45.4278
 R2: 0.4818


Lasso
model perf for training
 RMSE: 71.4462
-MSE 45.4477
 R2: 0.4767


Ridge
model perf for training
 RMSE: 71.1053
-MSE 45.3087
 R2: 0.4817


K-Nearest Neighbors
model perf for training
 RMSE: 83.9504
-MSE 53.4162
 R2: 0.2775


Decision Tree
model perf for training
 RMSE: 19.6098
-MSE 2.8913
 R2: 0.9606


Random Forest
model perf for training
 RMSE: 36.3389
-MSE 18.5327
 R2: 0.8646


XGB
model perf for training
 RMSE: 19.6098
-MSE 2.9102
 R2: 0.9606


Learning rate set to 0.028084
0:	learn: 97.7246455	total: 58ms	remaining: 57.9s
1:	learn: 96.6611365	total: 61.3ms	remaining: 30.6s
2:	learn: 95.8542826	total: 64.4ms	remaining: 21.4s
3:	learn: 94.8194438	total: 68.5ms	remaining: 17.1s
4:	learn: 94.0895293	total: 70.7ms	remaining: 14.1s
5:	learn: 93.0111465	total: 72.6ms	remaining: 12s
6:	learn: 92.1826140	total: 77.8ms	remaining: 11s
7:	learn: 91.1768256	total: 80.5ms	remaining: 9.98s
8:	learn: 90.33494

In [9]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by='R2_Score', ascending=False)

Unnamed: 0,Model Name,R2_Score
4,Decision Tree,0.960576
6,XGB,0.960576
7,Cat Boost,0.955626
5,Random Forest,0.864618
8,Ada Boost,0.793118
0,Linear Regression,0.48176
2,Ridge,0.481652
1,Lasso,0.476671
3,K-Nearest Neighbors,0.27746
