## 0. Configs

In [1]:
import sys
sys.path.append("../../util/")

from calculate_scores import calculate_scores

In [2]:
# previously used when showing MSE and MAE

import locale
locale.setlocale(locale.LC_ALL, '')

def as_currency(amount):
    return locale.currency(amount, grouping=True)

In [3]:
import warnings

warnings.filterwarnings('ignore')

## 1. Import libraries and data

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import scipy.stats as stats

df = pd.read_csv("../../data/cleaned/train_V2_cleaned.csv")
df.head()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,shop_use,retired,gold_status,outcome_profit,outcome_damage_inc,outcome_damage_amount,gender_M,gender_V,combined_pos_score,combined_neg_score
0,268.0,16.0,1682.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1672.78,1,829.66,1,0,0.0,7.955259
1,283.0,23.0,1673.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1001.4,0,0.0,1,0,0.33433,1.842549
2,227.0,0.0,1685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1785.59,0,0.0,0,1,0.0,0.889793
3,227.0,0.0,1620.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.151873,0.0,0.0,1127.05,0,0.0,0,1,0.0,0.0
4,229.0,810.0,1620.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1640.31,0,0.0,1,0,0.0,6.894609


## 2. Preparation for model training

### **X, y split**

Split the columns which I will need to train a model. 

In [5]:
X = df.drop(['outcome_profit', 'outcome_damage_inc', 'outcome_damage_amount'], axis=1)
y = df['outcome_profit']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

## 3. Train both models

In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### **1. GradientBoostingRegressor**

1. Get the best hyperparameters with RandomizedSearchCV

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor()

gb_param_grid = {
    'n_estimators': stats.randint(50, 200),
    'learning_rate': stats.uniform(0.01, 0.5),
    'max_depth': [3, 4, 5, 6, None],
    'min_samples_split': stats.randint(2, 20),
    'min_samples_leaf': stats.randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

gb_random_search = RandomizedSearchCV(gb, param_distributions=gb_param_grid, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
gb_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [18]:
gb_best_params = gb_random_search.best_params_
gb_best_params

{'learning_rate': 0.29846864385826716,
 'max_depth': 6,
 'max_features': 'sqrt',
 'min_samples_leaf': 6,
 'min_samples_split': 7,
 'n_estimators': 166}

2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [19]:
gb_refined_param_grid = {
    'learning_rate': [gb_best_params['learning_rate']],
    'max_depth': [gb_best_params['max_depth'] - 2, gb_best_params['max_depth'], gb_best_params['max_depth'] + 2] if gb_best_params['max_depth'] is not None else [None],
    'max_features': [gb_best_params['max_features']],
    'min_samples_leaf': [gb_best_params['min_samples_leaf'] - 1, gb_best_params['min_samples_leaf'], gb_best_params['min_samples_leaf'] + 1],
    'min_samples_split': [gb_best_params['min_samples_split'] - 1, gb_best_params['min_samples_split'], gb_best_params['min_samples_split'] + 1],
    'n_estimators': [gb_best_params['n_estimators'] - 2, gb_best_params['n_estimators'], gb_best_params['n_estimators'] + 2] if gb_best_params['max_depth'] is not None else [None],
}

gb_grid_search_refined = GridSearchCV(gb, gb_refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
gb_grid_search_refined.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


3. Get the best model and make predictions on the test set (from X,y split)

In [20]:
gb_best = gb_grid_search_refined.best_estimator_
gb_pred = gb_best.predict(X_test)

### **2. RandomForestRegressor**

1. Get the best hyperparameters with RandomizedSearchCV

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf_param_grid = {
    'n_estimators': stats.randint(50, 200),
    'max_depth': [3, 4, 5, 6, None],
    'min_samples_split': stats.randint(2, 20),
    'min_samples_leaf': stats.randint(1, 20),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(rf, param_distributions=rf_param_grid, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
rf_random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [24]:
rf_best_params = rf_random_search.best_params_
rf_best_params

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'log2',
 'min_samples_leaf': 18,
 'min_samples_split': 5,
 'n_estimators': 181}

2. Refined **GridSearchCV** with hyperparameters from RandomizedSearchCV

In [25]:
rf_refined_param_grid = {
    'n_estimators': [rf_best_params['n_estimators'] - 2, rf_best_params['n_estimators'], rf_best_params['n_estimators'] + 2],
    'max_depth': [rf_best_params['max_depth'] - 2, rf_best_params['max_depth'], rf_best_params['max_depth'] + 2] if rf_best_params['max_depth'] is not None else [None],
    'min_samples_split': [rf_best_params['min_samples_split'] - 1, rf_best_params['min_samples_split'], rf_best_params['min_samples_split'] + 1],
    'min_samples_leaf': [rf_best_params['min_samples_leaf'] - 1, rf_best_params['min_samples_leaf'], rf_best_params['min_samples_leaf'] + 1],
    'max_features': [rf_best_params['max_features']],
    'bootstrap': [rf_best_params['bootstrap']],
}

rf_grid_search_refined = GridSearchCV(rf, rf_refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
rf_grid_search_refined.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


3. Get the best model and make predictions on the test set (from X,y split)

In [26]:
rf_best = rf_grid_search_refined.best_estimator_
rf_pred = rf_best.predict(X_test)

### **Compare**

In [27]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [28]:
# Metrics for GradientBoostingRegressor
mse_gb = np.sqrt(mean_squared_error(y_test, gb_pred))
mae_gb = mean_absolute_error(y_test, gb_pred)
r2_gb = r2_score(y_test, gb_pred)

# Metrics for RandomForestRegressor
mse_rf = np.sqrt(mean_squared_error(y_test, rf_pred))
mae_rf = mean_absolute_error(y_test, rf_pred)
r2_rf = r2_score(y_test, rf_pred)

In [29]:
all_MSEs = [mse_gb, mse_rf]
all_MAEs = [mae_gb, mae_rf]
all_R2s  = [ r2_gb,  r2_rf]

models = pd.DataFrame({ 'Model': ['GradientBoosting','RandomForestRegressor'],
                        'Root MSE': all_MSEs,
                        'MAE': all_MAEs,
                        'R2 Score': all_R2s,
                        })

# New feature to compare models
FE = "Root MSE + MAE"
models[FE] = ((models["Root MSE"] + models["MAE"]))  

models.sort_values(by = FE, ascending = True, ignore_index = True)

Unnamed: 0,Model,Root MSE,MAE,R2 Score,Root MSE + MAE
0,GradientBoosting,496.661726,378.504066,0.473086,875.165791
1,RandomForestRegressor,604.716336,452.587032,0.218873,1057.303368


## 3. Make prediction on `score.csv` using best model

### **Import applicant data CSV**, and make adjustments

In [30]:
applicants_data = pd.read_csv("../../data/score.csv")

"`get_dummies`" the same columns as we did for the training data

In [31]:
applicants_data = pd.get_dummies(applicants_data, columns=['gender'])

# execute `calculate_scores` on the DF, to calculate the scores
applicants_data = calculate_scores(applicants_data.copy())

applicants_data.fillna(0, inplace=True)

### **Make prediction on the test set**

In [32]:
predicted_revenue = gb_grid_search_refined.predict(applicants_data)

## 4. Score applicants

I'm not actually scoring applicants (read `IMPORTANT`)

### **Make new DataFrame with `predicted_revenue`**

In [33]:
applicants_data["predicted_revenue"] = predicted_revenue
applicants_data["applicant_id"] = applicants_data.index

applicants_with_projected_revenue = applicants_data[["applicant_id", 'predicted_revenue']]

In [34]:
applicants_with_projected_revenue

Unnamed: 0,applicant_id,predicted_revenue
0,0,1039.082571
1,1,2597.720755
2,2,1579.118807
3,3,2496.358789
4,4,1626.727706
...,...,...
495,495,2674.283195
496,496,4365.196865
497,497,1869.162869
498,498,1956.991603


### **Export dataframe**

In [35]:
applicants_with_projected_revenue.to_csv("../../data/exported/predicted_revenue.csv", index=False)

# **IMPORTANT**

in the future I will merge all 3 dataframes with each other, and then calculate the outcome_profit and select the top 200