In [2]:
# linear algebra
import numpy as np

# data processing
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# algorithms
from sklearn.ensemble import RandomForestClassifier

# https://towardsdatascience.com/evaluation-metrics-for-classification-cdea775c97d4
# https://towardsdatascience.com/beyond-accuracy-precision-and-recall-3da06bea9f6c

In [3]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine with this exact directory layout. Consider a path
# relative to a configurable data directory.
TRAIN_CSV_PATH = 'C:/Users/ACER/Desktop/pull github repo/LIE-Thomas-1.26/content/additional_resources/datasets/US Income/cleaned/data_train.csv'

# Keep the frame as loaded in `train_df`; all later cells work on the copy `df`.
train_df = pd.read_csv(TRAIN_CSV_PATH)
df = train_df.copy()


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate the target column ('income') from the feature columns.
y = df['income']
X = df.drop(columns='income')

# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Baseline model: a seeded 100-tree random forest fitted on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=24).fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = clf.predict(X_test)
print("Random Forests accuracy", accuracy_score(y_test, y_pred))

Random Forests accuracy 0.8518347919545525


In [None]:
# Multiple evaluation metrics

In [9]:
# Precision and recall
'''
Precision metric represents the probability of the model being correct 
out of all the times the model said yes.
Recall metric represents the ability of a model to find all the relevant
cases within a dataset.
'''

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score

# Cross-validated (3-fold) predictions for the training rows; the metrics below
# are therefore computed on the training split, not on X_test.
predictions = cross_val_predict(clf, X_train, y_train, cv=3)

for label, metric in (('Precision:', precision_score), ('Recall:', recall_score)):
    print(label, metric(y_train, predictions))

Precision: 0.7467282525019245
Recall: 0.6211975664425232


In [12]:
# F1 score

from sklearn.metrics import f1_score
# F1 of `predictions` against the training labels.
cv_f1 = f1_score(y_train, predictions)
print("F1 score:", cv_f1)

F1 score: 0.6782031113441706


In [11]:
# Accuracy

# Accuracy of the baseline forest's predictions (`y_pred`) on the held-out test labels.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Random Forests accuracy", baseline_accuracy)

Random Forests accuracy 0.8518347919545525


In [18]:
# Specificity
'''
Sensitivity of predicting the negative cases.
Probability of predicting negatives out of all negative cases.
'''

from sklearn.metrics import confusion_matrix

def specificity_score(y_test, y_pred):
    """Return the specificity (true-negative rate) of a set of predictions.

    The first row of the confusion matrix is treated as the negative class:
    entry [0, 0] counts negatives predicted as negative and entry [0, 1]
    counts negatives predicted as the second class.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The ratio matrix[0, 0] / (matrix[0, 0] + matrix[0, 1]).
    """
    matrix = confusion_matrix(y_test, y_pred)
    true_negatives = matrix[0, 0]
    false_positives = matrix[0, 1]
    return true_negatives / (true_negatives + false_positives)

# Compute the score once and reuse it; calling specificity_score twice would
# rebuild the confusion matrix for the same inputs.
specificity = specificity_score(y_test, y_pred)
print("Specificity score:", specificity)
print(specificity * 100, "% of all negative cases will be predicted correctly")

Specificity score: 0.9265961773078487
92.65961773078487 % of all negative cases will be predicted correctly


In [None]:
# Hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold

# Candidate values for each random-forest hyper-parameter.
n_estimators = np.arange(100, 1000, 100)   # 100, 200, ..., 900
max_features = np.arange(1, 10, 1)         # 1, 2, ..., 9
min_samples_leaf = np.arange(2, 10, 1)     # 2, 3, ..., 9

# 3-fold split used by the randomized search below.
kfold = KFold(n_splits = 3)

# Search space: one entry per RandomForestClassifier keyword argument.
start_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    }

# Seed the estimator and the search so repeated runs draw the same candidates
# and build the same forests (24 matches the seed used by the other models in
# this notebook).
rf = RandomForestClassifier(random_state=24)

# The search is only configured here; it does nothing until
# test_rf.fit(X_train, y_train) is called.
test_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=start_grid,
    cv=kfold,
    random_state=24,
)
print(start_grid)

In [None]:
# Try a handful of random hyper-parameter combinations drawn from start_grid.
# (Iterating start_grid.items() directly yields (name, values) pairs, which
# cannot be unpacked into three names and are not parameter combinations.)
N_CANDIDATES = 10
rng = np.random.default_rng(24)  # seeded so the sampled combinations are repeatable

# NOTE(review): candidates are scored on X_test, so the best score printed here
# is an optimistic estimate; prefer cross-validation on the training split
# (e.g. fitting the RandomizedSearchCV object) for model selection.
for _ in range(N_CANDIDATES):
    # One value per hyper-parameter, keyed by the estimator's keyword name so
    # each value reaches the matching argument; int() turns NumPy scalars into
    # plain Python ints.
    params = {name: int(rng.choice(values)) for name, values in start_grid.items()}
    final_model = RandomForestClassifier(random_state=24, **params)
    final_model.fit(X_train, y_train)
    predictions = final_model.predict(X_test)
    print(params, accuracy_score(y_test, predictions))

In [20]:
# Random forest with the chosen hyper-parameters, fitted on the training split.
final_model = RandomForestClassifier(
    n_estimators=450,
    min_samples_leaf=2,
    max_features=3,
    random_state=24,
).fit(X_train, y_train)

# Accuracy on the held-out test split.
predictions = final_model.predict(X_test)
print(accuracy_score(y_test, predictions))

0.8592046675879011


In [None]:
# Validation strategies

In [19]:
# K-Fold cross validation:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the baseline classifier on the training split.
scores = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=10)

# Report the per-fold scores followed by their mean and spread.
for label, value in (
    ("Scores:", scores),
    ("Mean:", scores.mean()),
    ("Standard Deviation:", scores.std()),
):
    print(label, value)

Scores: [0.85988484 0.86564299 0.86487524 0.85834933 0.86602687 0.85143954
 0.85834933 0.8571977  0.85023041 0.86328725]
Mean: 0.859528350006339
Standard Deviation: 0.005315617448158119


In [None]:
# Random Search Cross Validation

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# The original call left `solver=` without a value, which is a syntax error.
# NOTE(review): the arguments below follow the scikit-learn RandomizedSearchCV
# documentation example, which this cell's imports mirror; 'saga' is a solver
# that accepts both 'l1' and 'l2' penalties. Confirm this is the intended setup.
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)