In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import features
with open("../Data/features.txt") as file:
    features = json.load(file)
# Initialize Standard Scaler
scaler = StandardScaler() 
# Initialize Feature Values
feature_values = {}
# Import Data
data = pd.read_csv("../Data/prosper_final.csv")

In [3]:
X = data[features.get("Borrower")+features.get("Loan")+features.get("Lender")].fillna(-1).as_matrix()
y = data['RepaidOrNot'].as_matrix()

In [4]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

class_weight = [{0: 2, 1: 1}, {0: 3, 1: 1}, {0: 4, 1: 1}]

# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

In [5]:
train_features, test_features, train_labels, test_labels = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
# Fit the grid search to the data
grid_search.fit(train_features, train_labels)
grid_search.best_params_

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 130.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 255.8min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 391.6min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 617.5min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 968.7min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 1349.8min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 1719.6min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 2117.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 2703.9min


KeyboardInterrupt: 

In [None]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_features, test_labels)

In [None]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))