In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

: 

### Data Cleaning

In [None]:
# Load the investor data set from a CSV in the working directory and preview the first rows
df = pd.read_csv('investor_data.csv')
df.head()

: 

In [None]:
# Summarize columns, non-null counts and dtypes of the freshly loaded frame
df.info()

: 

In [None]:
# Count missing values per column
print(df.isnull().sum())

: 

In [None]:
# Drop every row that contains at least one missing value (modifies df in place),
# then re-print the per-column null counts to confirm none remain
df.dropna(inplace = True)
print(df.isnull().sum())

: 

In [None]:
# Count fully duplicated rows.
# NOTE(review): the count is only displayed; no visible cell removes duplicates — confirm it is 0.
df.duplicated().sum()

: 

In [None]:
# Drop column 'clientID' since it is not related to risk prediction
df = df.drop(columns = ['clientID'])

: 

In [None]:
# Parse 'dateID' (YYYY-MM-DD strings) as datetimes and replace it with three integer
# columns 'Day', 'Month' and 'Year', appended at the end of the frame
date_parsed = pd.to_datetime(df['dateID'], format = '%Y-%m-%d')
df = (
    df.assign(Day = date_parsed.dt.day,
              Month = date_parsed.dt.month,
              Year = date_parsed.dt.year)
      .drop(columns = ['dateID'])
)

: 

In [None]:
# One-hot encode the categorical predictor 'education' into 'education_<level>' columns.
# NOTE(review): every level is kept (drop_first is not set), so the dummy columns sum to 1 and are
# collinear with the intercept that the OLS formula further down adds — read those coefficients with care.
df = pd.get_dummies(df, prefix = ['education'], columns = ['education'])

: 

In [None]:
# Re-inspect columns and dtypes after cleaning, date expansion and one-hot encoding
df.info()

: 

In [None]:
# Plot the response 'risk' against a predictor to see whether the relationship looks linear.
# Axes are labelled so the figure can be read on its own; the trailing ';' hides the repr output.
fig, ax = plt.subplots()
ax.scatter(df['avgMonthlyIncome'], df['risk'])
ax.set(xlabel = 'avgMonthlyIncome', ylabel = 'risk', title = 'risk vs. avgMonthlyIncome');

: 

In [None]:
# Scatter plot of 'risk' against 'expSavings', with labelled axes so the figure stands alone
fig, ax = plt.subplots()
ax.scatter(df['expSavings'], df['risk'])
ax.set(xlabel = 'expSavings', ylabel = 'risk', title = 'risk vs. expSavings');

: 

In [None]:
# Scatter plot of 'risk' against 'expTransport', with labelled axes so the figure stands alone
fig, ax = plt.subplots()
ax.scatter(df['expTransport'], df['risk'])
ax.set(xlabel = 'expTransport', ylabel = 'risk', title = 'risk vs. expTransport');

: 

Judging from these plots, a linear regression is unlikely to fit the data well. However, let's first try a linear regression model to see how it performs.

In [None]:
# Separate the response 'risk' from the predictors and hold out 20% of the rows as a
# test set (fixed random_state so the split is reproducible), then show the four shapes
from sklearn.model_selection import train_test_split
y = df['risk']
x = df.drop(columns = ['risk'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 10)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

: 

In [None]:
# Recombine training predictors and response column-wise into a single frame;
# the formula-based OLS fit and OSR2() later in the notebook take one DataFrame
train = pd.concat([x_train, y_train], axis = 1)

: 

In [None]:
# Recombine test predictors and response column-wise into a single frame (used by OSR2() later)
test = pd.concat([x_test, y_test], axis = 1)

: 

In [None]:
# Linear regression model: regress 'risk' on every other column of the training frame.
# Index.difference returns the predictor names sorted, so the formula order is deterministic.
import statsmodels.formula.api as smf
predictors = train.columns.difference(['risk'])
formula = 'risk ~' + '+'.join(predictors)
LR_model = smf.ols(formula = formula, data = train).fit()

: 

In [None]:
# Display the fitted OLS summary table (coefficients, standard errors, p-values, R-squared)
LR_model.summary()

: 

Based on the p-values in the summary above, the variables 'amountLoan', 'avgMonthlyIncome', 'avgNumTransactions', 'avgTransaction', 'cardLevel', 'education_BSc', 'education_MSc', 'education_PhD' and 'largestSingleTransaction' are statistically significant.

In [None]:
# Regression tree model: tune the cost-complexity pruning strength by cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Candidate ccp_alpha values: 51 evenly spaced points over [0, 0.001] (step 2e-05)
grid_values = {'ccp_alpha': np.linspace(0, 0.001, 51)}

# Base tree whose ccp_alpha is tuned; the other hyper-parameters and the seed are fixed
dtr = DecisionTreeRegressor(min_samples_leaf = 5, min_samples_split = 10, random_state = 10)

# Pick the ccp_alpha with the best mean R^2 over 5-fold cross-validation on the training set
dtr_cv = GridSearchCV(dtr, param_grid = grid_values, scoring = 'r2', cv = 5 , verbose = 0)
dtr_cv.fit(x_train, y_train)

: 

In [None]:
# Print the ccp_alpha value selected by the cross-validated grid search
print('Best ccp_alpha', dtr_cv.best_params_) 

: 

In [None]:
# Regression tree model with the optimal ccp alpha value.
# GridSearchCV (refit=True by default) has already refit the winning configuration on the
# full training set, so reuse that estimator: the final tree then has exactly the
# cross-validated settings (min_samples_leaf = 5, min_samples_split = 10, the selected
# ccp_alpha, random_state = 10) instead of hand-copied values that can drift from the search.
dtr_final = dtr_cv.best_estimator_
dtr_final

: 

In [None]:
# Random forest model: tune max_features by cross-validation
from sklearn.ensemble import RandomForestRegressor

# Candidate max_features values: 9 points evenly spaced over [1, 18], truncated to int32
# (1, 3, 5, 7, 9, 11, 13, 15, 18). This rebinds the grid_values name used for the tree search.
# NOTE(review): the upper bound 18 presumably equals the number of predictor columns in x — confirm.
grid_values = {'max_features': np.linspace(1,18,9, dtype = 'int32')} 

# Base forest: 500 trees, at least 5 samples per leaf, fixed seed for reproducibility
rf = RandomForestRegressor(min_samples_leaf = 5, n_estimators = 500, random_state = 10) 

# Pick the max_features with the best mean R^2 over 5-fold cross-validation on the training set
rf_cv = GridSearchCV(rf, param_grid = grid_values, scoring = 'r2', cv = 5)
rf_cv.fit(x_train, y_train)

: 

In [None]:
# Print the max_features value selected by the cross-validated grid search
print(rf_cv.best_params_)

: 

In [None]:
# Random forest model with the optimal max_features value.
# GridSearchCV (refit=True by default) has already refit the winning configuration on the
# full training set with the same fixed seed, so reuse that estimator instead of
# hard-coding max_features and training a second 500-tree forest.
rf_final = rf_cv.best_estimator_
rf_final

: 

In [None]:
# Define a function to compute the out-of-sample R-squared of a model using the test set
def OSR2(model, df_train, df_test, dependent_var):
    """Return the out-of-sample R-squared of ``model`` on ``df_test``.

    The score is ``1 - SSE / SST``: SSE is the sum of squared errors of the
    model's predictions on the test rows, and SST is the sum of squared errors
    of a baseline that always predicts the training-set mean of the response.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(df_test)``; the whole test frame, response
        column included, is passed through unchanged.
    df_train, df_test : DataFrame
        Training and test frames; both must contain ``dependent_var``.
    dependent_var : str
        Name of the response column.
    """
    actual = df_test[dependent_var]
    predicted = model.predict(df_test)
    residual_ss = np.sum((actual - predicted) ** 2)

    # Baseline: predict the training mean for every test row
    baseline = np.mean(df_train[dependent_var])
    baseline_ss = np.sum((actual - baseline) ** 2)

    return 1 - residual_ss / baseline_ss

: 

In [None]:
# OSR^2 for linear regression, computed on the held-out test frame with the training-mean baseline
print('OSR2 for linear regression:', OSR2(LR_model, train, test, 'risk'))

: 

In [None]:
# OSR^2 for regression tree
from sklearn.metrics import r2_score  # import retained for any later cell that uses r2_score

# Use the same definition as OSR2(): the baseline is the *training* mean of the response.
# r2_score would use the test-set mean instead, so its value would not be directly
# comparable with the linear regression's OSR2.
dtr_sse = np.sum((y_test - dtr_final.predict(x_test))**2)
dtr_sst = np.sum((y_test - np.mean(y_train))**2)
print('OSR2 for regression tree:', round(1 - dtr_sse/dtr_sst, 5))

: 

In [None]:
# OSR^2 for random forest
# Use the same definition as OSR2(): the baseline is the *training* mean of the response,
# so the value is directly comparable with the linear regression's OSR2.
rf_sse = np.sum((y_test - rf_final.predict(x_test))**2)
rf_sst = np.sum((y_test - np.mean(y_train))**2)
print('OSR2 for random forest model:', round(1 - rf_sse/rf_sst, 5))

: 

Based on the $OSR^2$ values, the random forest model performs best.