# Modeling

In [1]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score

import main
from prepare import prep_readme_data
from model import model_data

## Read in the dataset

In [4]:
# model_data() is imported from the project-local `model` module; its six return
# values are unpacked into features (X_*) and targets (y_*) for the train,
# validate and test splits used by every model cell in this notebook.
# NOTE(review): the recorded output of this cell shows a pandas
# SettingWithCopyWarning — presumably raised by a chained assignment inside the
# `model` / `prepare` modules; confirm and fix there.
X_train, y_train, X_validate, y_validate, X_test, y_test = model_data()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


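## Transform text data using TF-IDF

No vectorization happens in this notebook: `model_data()` above already returns the feature matrices that are passed to the models below (presumably the text is vectorized inside the `model` module — confirm).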

## Split the data

In [None]:
# Report the dimensions of every split: features then target, in
# train / validate / test order.
for split in (X_train, y_train, X_validate, y_validate, X_test, y_test):
    print(split.shape)

## Cross Validation

In [None]:
# Hyperparameter grids for the grid searches in this notebook.

# Candidate values shared by the C and gamma grids (powers of ten, 1e-3 .. 1e2).
# Each grid gets its own copy so that no two grids alias the same list object.
_POWERS_OF_TEN = [0.001, 0.01, 0.1, 1, 10, 100]

# Logistic regression: inverse regularization strength.
logit_params = dict(C=list(_POWERS_OF_TEN))

# Support vector classifier: regularization strength and kernel coefficient.
svm_params = dict(C=list(_POWERS_OF_TEN), gamma=list(_POWERS_OF_TEN))

# Gradient boosting: shrinkage applied to each tree's contribution.
boost_params = dict(learning_rate=[0.0001, 0.001, 0.01, 0.1])

### Logistic Regression

In [None]:
# 5-fold grid search over the C values in logit_params for a default
# LogisticRegression. GridSearchCV.fit returns the fitted search object, so
# construction and fitting are chained into a single statement.
logit_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
).fit(X_train, y_train)

# Score of the fitted search on the validation split (shown as the cell output).
logit_grid_search.score(X_validate, y_validate)

In [None]:
# Display the best estimator selected by the logistic-regression grid search
# (requires the grid-search cell above to have been run).
logit_grid_search.best_estimator_

### Support Vector Classifier

In [None]:
# NOTE(review): the SVC grid search is disabled. svm_params holds 6 C values x
# 6 gamma values, i.e. 36 candidates x 5 folds — presumably commented out
# because of runtime; confirm, then either re-enable this cell or delete it.
# svm_grid_search = GridSearchCV(SVC(), svm_params, cv=5)
# svm_grid_search.fit(X_train, y_train)

# # svm_grid_search.score(X_validate, y_validate)

In [None]:
# NOTE(review): disabled together with the SVC grid-search cell; svm_grid_search
# is not defined while that cell stays commented out.
# svm_grid_search.best_estimator_

### Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings; fit() returns the estimator
# itself, so it is chained directly onto the constructor.
gaus = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the validation split (shown as the cell output).
gaus.score(X_validate, y_validate)

### Ridge Classifier

In [None]:
# Ridge classifier with built-in cross-validation, left at its defaults.
# fit() hands back the fitted estimator, which lets us bind it in one step.
clf = RidgeClassifierCV().fit(X_train, y_train)

# Mean accuracy on the validation split (shown as the cell output).
clf.score(X_validate, y_validate)

### Gradient Boost

In [None]:
# 5-fold grid search over the learning rates in boost_params.
# The booster's seed is pinned: random_state drives the per-tree randomness
# (feature permutation at each split), so without it two runs of the same
# search are not guaranteed to produce the same scores.
boost_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=boost_params,
    cv=5,
)

In [None]:
# NOTE(review): fitting is disabled, so boost_grid_search (built in the previous
# cell) is never trained and no gradient-boosting score is produced — presumably
# switched off because of runtime; confirm, then re-enable or delete.
# boost_grid_search.fit(X_train, y_train)
# boost_grid_search.score(X_validate, y_validate)

### Random Forest

In [None]:
# Random forest with default hyperparameters.
# The forest draws bootstrap samples and random feature subsets, so without a
# fixed seed the validation score changes on every run. Pinning random_state
# makes Restart & Run All reproduce the same number.
tree = RandomForestClassifier(random_state=42)
tree.fit(X_train, y_train)

# Mean accuracy on the validation split (shown as the cell output).
tree.score(X_validate, y_validate)

In [None]:
def _actual_vs_predicted(model, features, target):
    """Return a frame with the true labels (`actual`) and `model`'s predictions
    for `features` (`predicted`)."""
    frame = pd.DataFrame(dict(actual=target))
    frame['predicted'] = model.predict(features)
    return frame


# Final model: logistic regression fitted on the training split.
# NOTE(review): C=10 is hard-coded — presumably copied from the logistic grid
# search's best estimator; confirm it still matches after any data change.
logit = LogisticRegression(C=10)
logit.fit(X_train, y_train)

# One frame per split pairing ground truth with the final model's predictions.
train = _actual_vs_predicted(logit, X_train, y_train)
validate = _actual_vs_predicted(logit, X_validate, y_validate)
test = _actual_vs_predicted(logit, X_test, y_test)

## Evaluate

In [None]:
# Training-split performance: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
train_actual, train_predicted = train.actual, train.predicted

print('Accuracy: {:.2%}'.format(accuracy_score(train_actual, train_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train_predicted, train_actual))
print('---')
print(classification_report(train_actual, train_predicted))

In [None]:
# Validation-split performance: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
validate_actual, validate_predicted = validate.actual, validate.predicted

print('Accuracy: {:.2%}'.format(accuracy_score(validate_actual, validate_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate_predicted, validate_actual))
print('---')
print(classification_report(validate_actual, validate_predicted))

In [None]:
# Test-split performance: accuracy, confusion matrix (rows = predicted,
# columns = actual) and the per-class classification report.
test_actual, test_predicted = test.actual, test.predicted

print('Accuracy: {:.2%}'.format(accuracy_score(test_actual, test_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test_predicted, test_actual))
print('---')
print(classification_report(test_actual, test_predicted))