### Random Forest Classification

### 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### 2. Importing Dataset

In [2]:
pca_df = pd.read_csv("./pca_df.csv", index_col = 0)
y = pd.read_csv("./2020_Competition_Training.csv", usecols = ["transportation_issues"])

### 3. Preparing Data For Training

In [35]:
X_train, X_test, y_train, y_test = train_test_split(pca_df, y, test_size=0.2, random_state=42, stratify = y)

### 4. Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### 5. Training the Algorithm

In [58]:
from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, 
                               random_state=42, 
                               max_features = 'auto',
                               n_jobs=-1, verbose = 1)

# Fit on training data
model.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   50.5s finished


RandomForestClassifier(n_jobs=-1, random_state=42, verbose=1)

### Find the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = { 
   'n_estimators': [10, 500],
   'max_features': ['auto', 'sqrt', 'log2'],
   'min_samples_leaf' : [len(pca_df)//10000, len(pca_df)//28000, 
                         len(pca_df)//50000, len(pca_df)//100000]
}

CV_rfc = GridSearchCV(estimator=model, 
                     param_grid=param_grid, 
                     scoring = 'f1',
                     cv=10, 
                     n_jobs=10,
                     verbose=2,
                     pre_dispatch='2*n_jobs',
                     refit=False)
CV_rfc.fit(X_train, y_train)

CV_rfc.best_params_

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


In [59]:
n_nodes = []
max_depths = []

for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 9104
Average maximum depth 57


### Random Forest Results

In [60]:
X_train_rf_predictions = model.predict(X_train)
X_train_rf_probs = model.predict_proba(X_train)[:, 1]

rf_predictions = model.predict(X_test)
rf_probs = model.predict_proba(X_test)[:, 1]

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished


In [61]:
np.unique(rf_predictions,return_counts=True)

(array([0, 1]), array([13863,    52]))

In [62]:
rf_probs

array([0.31, 0.29, 0.06, ..., 0.13, 0.07, 0.11])

In [55]:
from sklearn.metrics import roc_auc_score

In [63]:
roc_auc_score(y_test, rf_probs)

0.7159528839036563