In [20]:
#### Preamble ####
# Purpose: Model the data to predict total income
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 4 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites: python 3.10.5 or above, with pandas, and sklearn installed for python
# Any other information needed? None

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
# Adding progress bar for the model's training
from sklearn.utils import parallel_backend
from tqdm import tqdm
import joblib
df = pd.read_csv('../data/02-analysis_data/cleaned_data.csv')
df.head()

Unnamed: 0,STATEICP,GQ,OWNERSHP,MORTGAGE,SEX,AGE,MARST,EDUC,SCHLTYPE,OCC2010,IND1990,INCTOT,VETSTAT
0,41,3,0,0,2,85,5,7,1,9920,0,18800,1
1,41,3,0,0,1,51,5,6,1,5620,591,12500,1
2,41,3,0,0,2,36,6,2,1,8800,100,16400,1
3,41,4,0,0,1,74,6,0,1,9920,0,8600,1
4,41,3,0,0,1,49,4,7,1,6230,60,5000,2


# Building the Model

In [32]:
X = df.drop(columns=['INCTOT'])
y = df['INCTOT']
categorical_features = [
#                        'STATEICP',
#                        'GQ',
                        'OWNERSHP',
                        'MORTGAGE',
                        'SEX',
                        'MARST',
                        'EDUC',
                        'SCHLTYPE',
#                        'OCC2010',
                        'IND1990',
                        'VETSTAT'
                        ]
numerical_features = [
#                        'AGE'
                        ]
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), categorical_features),
#        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=210)

In [33]:
#Random Forest
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=210, max_depth=15))
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Mean Squared Error: {mse}')
print(f'Rooted Mean Squared Error: {rmse}')
# Mean Squared Error: 3502719043.7029905
# Rooted Mean Squared Error: 59183.77348313464

Mean Squared Error: 3871376844.632726
Rooted Mean Squared Error: 62220.38929991298


In [34]:
#Linear Regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')

# Extract the coefficients
coefficients = model.named_steps['regressor'].coef_
feature_names = model.named_steps['preprocessor'].get_feature_names_out()
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)
print(coef_df)


Mean Squared Error: 4614594307.386337
Root Mean Squared Error: 67930.80529028298
         Feature   Coefficient
4      cat__EDUC   9129.981930
1  cat__MORTGAGE   8468.247273
0  cat__OWNERSHP   4043.911698
6   cat__IND1990     95.734667
3     cat__MARST  -3558.243379
7   cat__VETSTAT  -6033.222226
5  cat__SCHLTYPE -14652.187929
2       cat__SEX -26123.944333
