# Modeling

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import main
from model import model_data

## Read in the dataset

In [None]:
# Unpack the six objects returned by model_data(): an X/y pair for each of
# the train, validate and test splits.
(
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
) = model_data()

## Transform text data using TF-IDF

## Split the data

In [None]:
# Print the shape of every split: X before y, train -> validate -> test.
for split in (X_train, y_train, X_validate, y_validate, X_test, y_test):
    print(split.shape)

In [None]:
# Total number of rows across the three splits.
# The row counts are summed explicitly because `X_train + X_validate + X_test`
# performs element-wise addition on DataFrames/arrays rather than
# concatenation: it fails when the splits differ in length and undercounts
# when their indices overlap.
X_train.shape[0] + X_validate.shape[0] + X_test.shape[0]

## Baseline Model

In [None]:
# The baseline always predicts the most frequent label in the training set.
class_counts = y_train.value_counts()
baseline_prediction = class_counts.nlargest(1).index[0]

In [None]:
# Share of training labels equal to the baseline prediction, i.e. the accuracy
# of always predicting that single label.
matches_baseline = y_train == baseline_prediction
baseline_accuracy = matches_baseline.mean()

# Rounded percentage first, then the unrounded value.
print(f"The baseline accuracy is {baseline_accuracy:.0%}")
print(f"{baseline_accuracy}")

## Training Set

### Cross Validation

In [None]:
# Hyperparameter grids handed to GridSearchCV in the cells below.

# Grid for LogisticRegression.
logit_params = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])

# Grid for SVC.
svm_params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 10],
)

# Grid for GradientBoostingClassifier.
boost_params = dict(learning_rate=[0.0001, 0.001, 0.01, 0.1])

# Grid for MLPClassifier.
mlp_params = dict(
    alpha=[0.00001, 0.0001, 0.001, 0.01, 0.1],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    # A single hidden layer, tried at every width from 1 to 21 units.
    hidden_layer_sizes=[(width,) for width in range(1, 22)],
)

### AdaBoost

In [None]:
# Fit AdaBoost with its default settings, then show its score on the same
# training data it was fitted on.
clf = AdaBoostClassifier().fit(X_train, y_train)
clf.score(X_train, y_train)

### Logistic Regression

In [None]:
# 5-fold grid search over logit_params; fit() hands back the fitted search
# object, which is then scored on the training data.
logit_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
).fit(X_train, y_train)
logit_grid_search.score(X_train, y_train)

In [None]:
# Show the logistic regression estimator selected by the grid search.
logit_grid_search.best_estimator_

### Support Vector Classifier

In [None]:
# 5-fold grid search over svm_params (C x gamma); fit() hands back the fitted
# search object, which is then scored on the training data.
svm_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_params,
    cv=5,
).fit(X_train, y_train)
svm_grid_search.score(X_train, y_train)

In [None]:
# Show the SVC estimator selected by the grid search.
svm_grid_search.best_estimator_

### Naive Bayes

In [None]:
# Fit Gaussian naive Bayes with default settings and show its score on the
# training data.
gaus = GaussianNB().fit(X_train, y_train)
gaus.score(X_train, y_train)

### Ridge Classifier

In [None]:
# Fit the ridge classifier with default settings and show its score on the
# training data.
# NOTE: this rebinds `clf`, replacing the AdaBoost model assigned earlier.
clf = RidgeClassifierCV().fit(X_train, y_train)
clf.score(X_train, y_train)

### Gradient Boost

In [None]:
# 5-fold grid search over boost_params (learning rate); fit() hands back the
# fitted search object, which is then scored on the training data.
boost_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=boost_params,
    cv=5,
).fit(X_train, y_train)
boost_grid_search.score(X_train, y_train)

In [None]:
# Show the gradient boosting estimator selected by the grid search.
boost_grid_search.best_estimator_

### Random Forest

In [None]:
# Fit a random forest with default settings and show its score on the
# training data.
tree = RandomForestClassifier().fit(X_train, y_train)
tree.score(X_train, y_train)

### Multilayer Perceptron

In [None]:
# 5-fold grid search over mlp_params; fit() hands back the fitted search
# object, which is then scored on the training data.
mlp = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_params,
    cv=5,
).fit(X_train, y_train)
mlp.score(X_train, y_train)

## Validation Set

The three best-performing models on the training set:

1. Random Forest
1. Ridge Classifier
1. Gradient Boost

### Multilayer Perceptron

In [None]:
# Score the fitted MLP grid search on the validation split.
mlp.score(X_validate, y_validate)

### Random Forest

In [None]:
# Score the fitted random forest on the validation split.
tree.score(X_validate, y_validate)

### Ridge Classifier

In [None]:
# `clf` was last bound to the RidgeClassifierCV model (the earlier AdaBoost
# `clf` was overwritten), assuming the cells were run top to bottom.
clf.score(X_validate, y_validate)

### Gradient Boost

In [None]:
# Score the fitted gradient boosting grid search on the validation split.
boost_grid_search.score(X_validate, y_validate)

## Test Set
1. Gradient Boost

In [None]:
# Score the fitted gradient boosting grid search on the test split.
boost_grid_search.score(X_test, y_test)

# Evaluate

In [None]:
# One frame per split holding the actual labels; a `predicted` column is
# added below so the report cells can compare the two.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

# Reuse the gradient boosting grid search that was already fitted on the
# training data in the Gradient Boost section.  Calling .fit() again here
# would re-run the entire cross-validated search and, because no random_state
# is set, may produce a model that differs from the one scored above.
ml = boost_grid_search

# Predictions for every split from the selected model.
train['predicted'] = ml.predict(X_train)
validate['predicted'] = ml.predict(X_validate)
test['predicted'] = ml.predict(X_test)

In [None]:
# Training-set report: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
train_accuracy = accuracy_score(train.actual, train.predicted)
train_confusion = pd.crosstab(train.predicted, train.actual)
train_report = classification_report(train.actual, train.predicted)

print('Accuracy: {:.2%}'.format(train_accuracy))
print('---')
print('Confusion Matrix')
print(train_confusion)
print('---')
print(train_report)

In [None]:
# Validation-set report: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
validate_accuracy = accuracy_score(validate.actual, validate.predicted)
validate_confusion = pd.crosstab(validate.predicted, validate.actual)
validate_report = classification_report(validate.actual, validate.predicted)

print('Accuracy: {:.2%}'.format(validate_accuracy))
print('---')
print('Confusion Matrix')
print(validate_confusion)
print('---')
print(validate_report)

In [None]:
# Test-set report: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
test_accuracy = accuracy_score(test.actual, test.predicted)
test_confusion = pd.crosstab(test.predicted, test.actual)
test_report = classification_report(test.actual, test.predicted)

print('Accuracy: {:.2%}'.format(test_accuracy))
print('---')
print('Confusion Matrix')
print(test_confusion)
print('---')
print(test_report)