# Modeling

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score

import main
from model import model_data

# Silence every Python warning for the rest of the session so cell output
# stays compact.
# NOTE(review): this blanket filter also hides any warnings raised by the
# modelling libraries used below — consider narrowing it to a category.
from warnings import filterwarnings
filterwarnings('ignore')

## Split the data

In [2]:
# Load the modelling data from '../data/model'. `model_data` hands back six
# objects, unpacked here as feature/target pairs for the train, validate and
# test splits (in that order).
# NOTE(review): how the split is made and what types are returned is decided
# inside model.py — confirm there.
X_train, y_train, X_validate, y_validate, X_test, y_test = model_data('../data/model')

In [None]:
# Show the dimensions of every split: features first, then the matching target.
for split in (X_train, y_train, X_validate, y_validate, X_test, y_test):
    print(split.shape)

In [None]:
# Number of rows in each split.
train_len = X_train.shape[0]
validate_len = X_validate.shape[0]
test_len = X_test.shape[0]

# Total number of rows across the three splits.
# The row counts are summed instead of taking
# `len(X_train + X_validate + X_test)`: `+` is element-wise addition, not
# concatenation. For DataFrames the length of the result depends on how the
# indexes align (overlapping labels collapse into a single row), and for
# arrays or sparse matrices with different row counts it generally raises.
dataset_len = train_len + validate_len + test_len

In [None]:
# Fraction of the whole dataset held by each split, plus the combined total
# as a sanity check.
train_share = train_len / dataset_len
validate_share = validate_len / dataset_len
test_share = test_len / dataset_len
total_share = (train_len + validate_len + test_len) / dataset_len

print(f"Train Size: {train_share:.2%}")
print(f"Validate Size: {validate_share:.2%}")
print(f"Test Size: {test_share:.2%}")
print("-" * 20)
print(f"Total {total_share:.2%}")

## Baseline Model

In [None]:
# Baseline model: always predict the most frequent class of the training
# target. value_counts() orders labels from most to least frequent, so the
# first index entry is the majority class.
class_counts = y_train.value_counts()
baseline_prediction = class_counts.index[0]

# Accuracy of that constant prediction is the fraction of training rows
# whose label equals the majority class.
is_majority = y_train == baseline_prediction
baseline_accuracy = is_majority.mean()

print(f"The baseline accuracy is {baseline_accuracy:.0%}")
print(f"{baseline_prediction}")

## Training Set

### Cross Validation

In [None]:
# Hyper-parameter grid for the gradient boosting grid search: the learning
# rates to try, spanning four orders of magnitude.
boost_params = dict(learning_rate=[1e-4, 1e-3, 1e-2, 1e-1])

In [None]:
# Fit each candidate model on the training split and report its score on that
# same split. The scores are printed explicitly: a bare expression in the
# middle of a cell is evaluated but (with default notebook settings) never
# displayed, so previously only the last model's score was visible.

# Ridge Classifier (RidgeClassifierCV with its default settings)
clf = RidgeClassifierCV()
clf.fit(X_train, y_train)
print(f"Ridge Classifier train accuracy: {clf.score(X_train, y_train):.2%}")

# Random Forest (default settings)
tree = RandomForestClassifier()
tree.fit(X_train, y_train)
print(f"Random Forest train accuracy: {tree.score(X_train, y_train):.2%}")

# Gradient Boost: 5-fold grid search over the learning rates in boost_params
ml = GridSearchCV(GradientBoostingClassifier(), boost_params, cv=5)
ml.fit(X_train, y_train)
print(f"Gradient Boost train accuracy: {ml.score(X_train, y_train):.2%}")

## Validation Set

The three best-performing models from the training set are scored on the validation set:

1. Ridge Classifier 
1. Random Forest
1. Gradient Boost

In [None]:
# Score every fitted model on the validation split. Each score is printed
# explicitly: a bare expression in the middle of a cell is evaluated but
# (with default notebook settings) never displayed, so previously only the
# last model's score was visible.

# Ridge Classifier
print(f"Ridge Classifier validate accuracy: {clf.score(X_validate, y_validate):.2%}")

# Random Forest
print(f"Random Forest validate accuracy: {tree.score(X_validate, y_validate):.2%}")

# Gradient Boost
print(f"Gradient Boost validate accuracy: {ml.score(X_validate, y_validate):.2%}")

In [5]:
## Test Set
## 1. Gradient Boost
# Score the grid-searched gradient boosting model (`ml`) on the test split.
# `ml` is created in the training cell, so that cell has to be run first;
# running this cell in a fresh kernel raises a NameError.

ml.score(X_test, y_test)

NameError: name 'ml' is not defined

# Evaluate

In [None]:
# One frame per split holding the true label ('actual') next to the label
# predicted by the grid-searched gradient boosting model ('predicted').
train = pd.DataFrame({'actual': y_train}).assign(predicted=ml.predict(X_train))
validate = pd.DataFrame({'actual': y_validate}).assign(predicted=ml.predict(X_validate))
test = pd.DataFrame({'actual': y_test}).assign(predicted=ml.predict(X_test))

In [None]:
# Training-set report: overall accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
train_report = (
    "Training Set",
    "",
    'Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)),
    '---',
    'Confusion Matrix',
    pd.crosstab(train.predicted, train.actual),
    '---',
    classification_report(train.actual, train.predicted),
)
for section in train_report:
    print(section)

In [None]:
# Validation-set report: overall accuracy, confusion matrix (rows =
# predicted, columns = actual) and the per-class classification report.
validate_report = (
    "Validation Set",
    "",
    'Accuracy: {:.2%}'.format(accuracy_score(validate.actual, validate.predicted)),
    '---',
    'Confusion Matrix',
    pd.crosstab(validate.predicted, validate.actual),
    '---',
    classification_report(validate.actual, validate.predicted),
)
for section in validate_report:
    print(section)

In [None]:
# Test-set report: overall accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
test_report = (
    "Test Set",
    "",
    'Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)),
    '---',
    'Confusion Matrix',
    pd.crosstab(test.predicted, test.actual),
    '---',
    classification_report(test.actual, test.predicted),
)
for section in test_report:
    print(section)