In [11]:
import os
import pandas as pd

# Read the training CSV into `data` and show its first five rows as the
# cell's rich output.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,Age,Gender,Marital Status,Number of Dependents,Household Size,Education Level,Occupation,Years in Current Job,Income Level,Credit Score,...,Local Unemployment Rate,Inflation Rate,Interest Rates,Economic Sentiment,Risk Tolerance,Financial Planner Involvement,Debt-to-Income Ratio,Life Insurance Adequacy,Long-term Financial Goals,Risk Profile
0,56,1,2,4,5,2,2,11,0,425,...,8.302827,0.642391,6.393131,0,3,1,0.326709,0,2,1
1,69,0,1,2,6,1,0,18,2,569,...,1.295888,4.587659,5.369076,2,4,0,0.875858,1,1,2
2,46,1,2,1,2,0,4,17,2,717,...,5.52417,2.491157,4.059213,1,7,0,0.433555,1,1,0
3,32,0,0,2,6,0,0,20,2,586,...,5.98515,3.214968,0.592691,0,6,1,0.071399,0,1,1
4,60,0,1,1,5,0,2,18,0,675,...,2.511423,0.323239,3.687253,0,7,0,0.364897,1,2,0


In [12]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum(axis=0)

Unnamed: 0,0
Age,0
Gender,0
Marital Status,0
Number of Dependents,0
Household Size,0
Education Level,0
Occupation,0
Years in Current Job,0
Income Level,0
Credit Score,0


Random Forest Using the Training Data


In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


# Load the training data and one-hot encode the categorical columns.
# drop_first=True drops one dummy per column to avoid a redundant level.
data = pd.read_csv('train.csv')


data = pd.get_dummies(
    data,
    columns=[
        'Gender',
        'Marital Status',
        'Education Level',
        'Occupation',
        'Housing Status',
        'City or Region of Residence',
        'Previous Bankruptcy Status',
        'Health Condition',
        'Family Health History',
        'Residency Stability',
        'Financial Stability of Parents',
        'Tax Filing History',
        'Utility Bills Payment History',
        'Job Loss',
        'Divorce History',
        'Major Medical Emergency',
        'Adoption History',
        'Bankruptcy History',
        'Health-related Legal Claims',
        'Domestic or International Relocation',
        'Economic Sentiment',
        'Financial Planner Involvement',
        'Life Insurance Adequacy',
        'Long-term Financial Goals',
    ],
    drop_first=True,
)

# Fail fast with a clear message when the target is absent. The previous
# all-zeros fallback could never run (dropping the missing column raised
# KeyError first) and a single-class target cannot be trained on anyway.
if 'Risk Profile' not in data.columns:
    raise KeyError("'Risk Profile' column not found in train.csv")

# 'Unnamed: 0' is the name pandas gives an unlabeled CSV column, typically a
# row index written by DataFrame.to_csv. It carries no signal, so keep it out
# of the features (and out of SMOTE's interpolation). errors='ignore' keeps
# this working if the file is ever re-exported without that column.
X = data.drop(columns=['Risk Profile', 'Unnamed: 0'], errors='ignore')
y = data['Risk Profile']


# Encode the target unconditionally. The earlier guard tested
# isinstance(value, object), which is true for every Python value, so the
# encoder always ran; this states that directly. `le` is kept so predictions
# can be mapped back with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)


# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# SMOTE oversampling: resample the training split only, so the held-out
# rows stay untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('SMOTE Resampled dataset shape %s' % Counter(y_train_smote))

# Random forest trained on the SMOTE-resampled data.
model_smote = RandomForestClassifier(n_estimators=100, random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

# Alternative to resampling: reweight classes inside the forest and train
# on the original (non-resampled) training split.
model_weighted = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)
model_weighted.fit(X_train, y_train)


SMOTE Resampled dataset shape Counter({np.int64(0): 23359, np.int64(1): 23359, np.int64(2): 23359})


In [14]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score


# Fit a class-weighted random forest on the training split.
rf_model = RandomForestClassifier(
    class_weight='balanced', random_state=42, n_estimators=100
)
rf_model.fit(X=X_train, y=y_train)

# Hard class predictions for the held-out split.
y_pred = rf_model.predict(X=X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
print(
    'Classification Report:\n',
    classification_report(y_true=y_test, y_pred=y_pred),
)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred))

# One-vs-rest ROC AUC from the predicted class probabilities. Reporting is
# best-effort: any failure is printed instead of stopping the notebook.
try:
    roc_auc = roc_auc_score(
        y_true=y_test,
        y_score=rf_model.predict_proba(X=X_test),
        multi_class='ovr',
    )
    print(f'ROC AUC: {roc_auc}')
except Exception as e:
    print(f'ROC AUC could not be computed: {e}')

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.34      0.34      9951
           1       0.34      0.34      0.34     10114
           2       0.33      0.32      0.33      9935

    accuracy                           0.34     30000
   macro avg       0.34      0.34      0.34     30000
weighted avg       0.34      0.34      0.34     30000

Accuracy: 0.3352
ROC AUC: 0.504436236711046


Random Forest Using the Testing Data

In [14]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


# Load test.csv and one-hot encode the categorical columns.
# NOTE(review): this cell fits new models on a 70/30 split of test.csv; it
# does not score the models trained on train.csv. It also rebinds `data`,
# `X`, `y` and the split variables used by the training cells. Confirm that
# is the intended experiment.
data = pd.read_csv('test.csv')


data = pd.get_dummies(
    data,
    columns=[
        'Gender',
        'Marital Status',
        'Education Level',
        'Occupation',
        'Housing Status',
        'City or Region of Residence',
        'Previous Bankruptcy Status',
        'Health Condition',
        'Family Health History',
        'Residency Stability',
        'Financial Stability of Parents',
        'Tax Filing History',
        'Utility Bills Payment History',
        'Job Loss',
        'Divorce History',
        'Major Medical Emergency',
        'Adoption History',
        'Bankruptcy History',
        'Health-related Legal Claims',
        'Domestic or International Relocation',
        'Economic Sentiment',
        'Financial Planner Involvement',
        'Life Insurance Adequacy',
        'Long-term Financial Goals',
    ],
    drop_first=True,
)

# Fail fast with a clear message when the target is absent. The previous
# all-zeros fallback could never run (dropping the missing column raised
# KeyError first) and a single-class target cannot be trained on anyway.
if 'Risk Profile' not in data.columns:
    raise KeyError("'Risk Profile' column not found in test.csv")

# 'Unnamed: 0' is the name pandas gives an unlabeled CSV column, typically a
# row index written by DataFrame.to_csv. It carries no signal, so keep it out
# of the features (and out of SMOTE's interpolation). errors='ignore' keeps
# this working if the file has no such column.
X = data.drop(columns=['Risk Profile', 'Unnamed: 0'], errors='ignore')
y = data['Risk Profile']


# Encode the target unconditionally. The earlier guard tested
# isinstance(value, object), which is true for every Python value, so the
# encoder always ran; this states that directly. `le` is kept so predictions
# can be mapped back with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)


# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# SMOTE oversampling: resample the training split only, so the held-out
# rows stay untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('SMOTE Resampled dataset shape %s' % Counter(y_train_smote))

# Random forest trained on the SMOTE-resampled data.
model_smote = RandomForestClassifier(n_estimators=100, random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

# Alternative to resampling: reweight classes inside the forest and train
# on the original (non-resampled) training split.
model_weighted = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)
model_weighted.fit(X_train, y_train)


SMOTE Resampled dataset shape Counter({np.int64(2): 4679, np.int64(0): 4679, np.int64(1): 4679})


In [15]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score


# Fit a class-weighted random forest on the current training split.
rf_model = RandomForestClassifier(
    class_weight='balanced', random_state=42, n_estimators=100
)
rf_model.fit(X=X_train, y=y_train)

# Hard class predictions for the held-out split.
y_pred = rf_model.predict(X=X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
print(
    'Classification Report:\n',
    classification_report(y_true=y_test, y_pred=y_pred),
)
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred))

# One-vs-rest ROC AUC from the predicted class probabilities. Reporting is
# best-effort: any failure is printed instead of stopping the notebook.
try:
    roc_auc = roc_auc_score(
        y_true=y_test,
        y_score=rf_model.predict_proba(X=X_test),
        multi_class='ovr',
    )
    print(f'ROC AUC: {roc_auc}')
except Exception as e:
    print(f'ROC AUC could not be computed: {e}')

Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.34      0.33      1948
           1       0.33      0.32      0.33      1998
           2       0.34      0.32      0.33      2054

    accuracy                           0.33      6000
   macro avg       0.33      0.33      0.33      6000
weighted avg       0.33      0.33      0.33      6000

Accuracy: 0.3278333333333333
ROC AUC: 0.5005002443100421


Training with the Major Features Only