In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# DecisionTreeClassifier is defined in sklearn.tree; only the forest lives in sklearn.ensemble.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, f1_score, accuracy_score, roc_curve, roc_auc_score, mean_absolute_error, r2_score


import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Relative paths of the three CSV inputs.
trainset_values, trainset_labels, testset_values = (
    'data/trainset_values.csv',
    'data/trainset_labels.csv',
    'data/testset_values.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
df_trainset_values = pd.read_csv(filepath_or_buffer=trainset_values)
df_trainset_labels = pd.read_csv(filepath_or_buffer=trainset_labels)
df_testset_values = pd.read_csv(filepath_or_buffer=testset_values)

In [None]:
# Combine the training features and training labels into a single frame `df`;
# the following cells inspect it and split it into X and y.
# NOTE(review): the join strategy below is an assumption about the CSV schemas
# (not visible from this notebook) — confirm against the actual files.
_shared_cols = df_trainset_values.columns.intersection(df_trainset_labels.columns).tolist()
if _shared_cols:
    # Both files carry the same column(s) (presumably a row id): join on them.
    df = df_trainset_values.merge(df_trainset_labels, on=_shared_cols, how='inner')
else:
    # No common column: presume the two files are row-aligned and place them side by side.
    df = pd.concat([df_trainset_values, df_trainset_labels], axis=1)

In [None]:
# Preview the first five rows of df.
df.head(n=5)

In [None]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

In [None]:
# Separate features from the target: 'PE' is the column removed from the
# feature matrix, so it is the one used as the label vector.
X = df.drop(columns=['PE'])
y = df['PE']

In [None]:
# Count how often each distinct entry of y occurs, ordered by the entry itself
# rather than by frequency.
y.value_counts().sort_index()

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs. The later modelling cells rely on these four names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

# Sanity check: features and labels must have matching lengths within each partition.
print('X_train: {}'.format(len(X_train)))
print('y_train: {}'.format(len(y_train)))
print('X_test: {}'.format(len(X_test)))
print('y_test: {}'.format(len(y_test)))

In [None]:
# MVP baseline: a decision tree with only the random seed set.
# fit() returns the fitted estimator, so construction and training are chained.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for X_test.
preds_dtc_test = dtc.predict(X_test)

In [None]:
# Model evaluation for the decision tree.
# Classification metrics are reported here: MSE / MAE / R-squared are regression
# metrics and are not meaningful for predicted class labels.
print('Accuracy:', accuracy_score(y_test, preds_dtc_test))
# Per-class precision, recall and F1 in one table.
print(classification_report(y_test, preds_dtc_test))

In [None]:
# Confusion matrix for the decision tree.
# scikit-learn layout: rows are actual classes, columns are predicted classes.
cm_dtc = confusion_matrix(y_test, preds_dtc_test)
print(cm_dtc)

# Binary case: index 0 is the negative class, index 1 the positive class.
tn = cm_dtc[0,0]
tp = cm_dtc[1,1]
fp = cm_dtc[0,1]
fn = cm_dtc[1,0]

# fmt='d' prints whole counts; the default format abbreviates larger cells (e.g. 1.2e+02).
sns.heatmap(cm_dtc, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Precision: of everything predicted positive, the share that really is positive.
precision = tp/(tp+fp)
print('Precision: {}'.format(precision))

# Recall: of everything that really is positive, the share the model found.
# The denominator is therefore all actual positives (tp + fn).
recall = tp/(tp+fn)
print('Recall: {}'.format(recall))

# F-1 score: harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)
print('F-1 Score: {}'.format(f1))

#### Evaluate metrics in this cell

In [2]:
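# TODO: optional feature scaling with StandardScaler (matters for distance-based models such as KNN; tree-based models are insensitive to feature scale)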

In [1]:
# GridSearch for hyperparameter tuning of a random forest.
rfc = RandomForestClassifier(random_state=42)

# Candidate values tried for each hyperparameter (3 x 3 x 3 = 27 combinations).
# min_samples_split must be an int >= 2 (or a float fraction); scikit-learn
# rejects 1, so every candidate using it would fail to fit.
param_grid = {'max_depth':[7,9,20],
             'n_estimators':[50,100,150],
             'min_samples_split':[2,5,10]}

# 5-fold cross-validation over the grid; the search is not run until fit() is called.
cv_rfc = GridSearchCV(rfc, param_grid, cv=5)

In [None]:
# Tune on the training partition only: the model is scored on X_test / y_test
# afterwards, so those rows must not be seen during the search.
cv_rfc.fit(X_train, y_train)

In [None]:
# Predict with the best estimator found by the grid search.
# The search is fitted on unscaled features, so the unscaled test matrix is used.
preds_rfc_test = cv_rfc.predict(X_test) # predictions

In [None]:
# Model evaluation for the tuned random forest.
# Classification metrics are reported here: MSE / MAE / R-squared are regression
# metrics and are not meaningful for predicted class labels.
print('Accuracy:', accuracy_score(y_test, preds_rfc_test))
# Per-class precision, recall and F1 in one table.
print(classification_report(y_test, preds_rfc_test))

In [None]:
# Confusion matrix for the tuned random forest.
# scikit-learn layout: rows are actual classes, columns are predicted classes.
cm_rfc = confusion_matrix(y_test, preds_rfc_test)
print(cm_rfc)

# Binary case: index 0 is the negative class, index 1 the positive class.
tn = cm_rfc[0,0]
tp = cm_rfc[1,1]
fp = cm_rfc[0,1]
fn = cm_rfc[1,0]

# fmt='d' prints whole counts; the default format abbreviates larger cells (e.g. 1.2e+02).
sns.heatmap(cm_rfc, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('predictions')
plt.ylabel('actuals')
plt.show()

# Precision: of everything predicted positive, the share that really is positive.
precision = tp/(tp+fp)
print('Precision: {}'.format(precision))

# Recall: of everything that really is positive, the share the model found.
# The denominator is therefore all actual positives (tp + fn).
recall = tp/(tp+fn)
print('Recall: {}'.format(recall))

# F-1 score: harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)
print('F-1 Score: {}'.format(f1))