## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from joblib import parallel_backend
from project_functions.modeling import run_model
from project_functions.utils import picklify, unpickle

## Loading Data

In [None]:
# Each feature set is read as a (train, test) pair from the cleaned-data
# pickles; the targets are read last.

# count_* feature sets.
X_train_count_unigrams, X_test_count_unigrams = (
    unpickle('../data/cleaned/train/X_train_count_unigrams.pickle'),
    unpickle('../data/cleaned/test/X_test_count_unigrams.pickle'),
)
X_train_count_bigrams, X_test_count_bigrams = (
    unpickle('../data/cleaned/train/X_train_count_bigrams.pickle'),
    unpickle('../data/cleaned/test/X_test_count_bigrams.pickle'),
)

# gensim_* sparse feature sets. Each one is transposed (.T) immediately after
# loading -- presumably they are stored features-by-documents; confirm
# against the preprocessing step.
X_train_gensim_2gram_sparse, X_test_gensim_2gram_sparse = (
    unpickle('../data/cleaned/train/X_train_gensim_2gram_sparse.pickle').T,
    unpickle('../data/cleaned/test/X_test_gensim_2gram_sparse.pickle').T,
)
X_train_gensim_3gram_sparse, X_test_gensim_3gram_sparse = (
    unpickle('../data/cleaned/train/X_train_gensim_3gram_sparse.pickle').T,
    unpickle('../data/cleaned/test/X_test_gensim_3gram_sparse.pickle').T,
)
X_train_gensim_4gram_sparse, X_test_gensim_4gram_sparse = (
    unpickle('../data/cleaned/train/X_train_gensim_4gram_sparse.pickle').T,
    unpickle('../data/cleaned/test/X_test_gensim_4gram_sparse.pickle').T,
)

# glove and word2vec feature sets (loaded without a transpose).
X_train_glove, X_test_glove = (
    unpickle('../data/cleaned/train/X_train_glove.pickle'),
    unpickle('../data/cleaned/test/X_test_glove.pickle'),
)
X_train_word2vec, X_test_word2vec = (
    unpickle('../data/cleaned/train/X_train_word2vec.pickle'),
    unpickle('../data/cleaned/test/X_test_word2vec.pickle'),
)

# Targets for the two splits.
y_train, y_test = (
    unpickle('../data/cleaned/train/y_train.pickle'),
    unpickle('../data/cleaned/test/y_test.pickle'),
)

## Multinomial Naive Bayes Models

In [None]:
# MultinomialNB on the count_unigrams features, with the confusion plot and
# the report requested through plot_confusion / display_report.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/mnb_count_unigrams.pickle'
mnb_count_unigrams = run_model(
    X_train_count_unigrams,
    X_test_count_unigrams,
    y_train,
    y_test,
    MultinomialNB(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# MultinomialNB on the count_bigrams features, with the confusion plot and
# the report requested through plot_confusion / display_report.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/mnb_count_bigrams.pickle'
mnb_count_bigrams = run_model(
    X_train_count_bigrams,
    X_test_count_bigrams,
    y_train,
    y_test,
    MultinomialNB(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# MultinomialNB on the gensim 2-gram sparse features, with the confusion plot
# and the report requested through plot_confusion / display_report.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/mnb_gensim_2gram.pickle'
mnb_gensim_2gram = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    MultinomialNB(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# MultinomialNB on the gensim 3-gram sparse features, with the confusion plot
# and the report requested through plot_confusion / display_report.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/mnb_gensim_3gram.pickle'
mnb_gensim_3gram = run_model(
    X_train_gensim_3gram_sparse,
    X_test_gensim_3gram_sparse,
    y_train,
    y_test,
    MultinomialNB(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# MultinomialNB on the gensim 4-gram sparse features, with the confusion plot
# and the report requested through plot_confusion / display_report.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/mnb_gensim_4gram.pickle'
mnb_gensim_4gram = run_model(
    X_train_gensim_4gram_sparse,
    X_test_gensim_4gram_sparse,
    y_train,
    y_test,
    MultinomialNB(),
    plot_confusion=True,
    display_report=True,
)

## Untuned XGBoosted Models

In [None]:
# Default-parameter XGBClassifier on the gensim 2-gram sparse features, with
# the confusion plot and the report requested.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/baseline_xg_gensim_2gram.pickle'
baseline_xg_gensim_2gram = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# Default-parameter XGBClassifier on the gensim 3-gram sparse features, with
# the confusion plot and the report requested.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/baseline_xg_gensim_3gram.pickle'
baseline_xg_gensim_3gram = run_model(
    X_train_gensim_3gram_sparse,
    X_test_gensim_3gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# Default-parameter XGBClassifier on the gensim 4-gram sparse features, with
# the confusion plot and the report requested.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/baseline_xg_gensim_4gram.pickle'
baseline_xg_gensim_4gram = run_model(
    X_train_gensim_4gram_sparse,
    X_test_gensim_4gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# Default-parameter XGBClassifier on the glove features, with the confusion
# plot and the report requested.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/baseline_xg_glove.pickle'
baseline_xg_glove = run_model(
    X_train_glove,
    X_test_glove,
    y_train,
    y_test,
    XGBClassifier(),
    plot_confusion=True,
    display_report=True,
)

In [None]:
# Default-parameter XGBClassifier on the word2vec features, with the
# confusion plot and the report requested.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest='../data/models/baseline_xg_word2vec.pickle'
baseline_xg_word2vec = run_model(
    X_train_word2vec,
    X_test_word2vec,
    y_train,
    y_test,
    XGBClassifier(),
    plot_confusion=True,
    display_report=True,
)

## Gridsearch for Tuning XGBoosted Models

In [None]:
# Grid search for the XGBoost model; only the gensim bigram encoding is tuned.
# This pass varies max_depth and min_child_weight (3 x 3 combinations); every
# other entry is a single-value list and is therefore held fixed.
params = {
    'n_estimators': [1000],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'alpha': [1],
    'gamma': [0],
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/max_depth_min_child_xgboost_gensim_2gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
xg_gridsearch_gensim = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    grid_search=True,
    scoring=None,
    model_params=params,
    plot_confusion=True,
    display_report=True,
)

In [None]:
# Grid search over gamma for the XGBoost model on the gensim 2-gram features.
# gamma takes the values 0.0, 0.1, ..., 0.5; every other entry is a
# single-value list and is therefore held fixed.
params = {
    'n_estimators': [1000],
    'max_depth': [3],
    'min_child_weight': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'alpha': [1],
    'gamma': [i / 10.0 for i in range(0, 6)]
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/gamma_xgboost_gensim_2gram.pickle'

# The test matrix must be X_test_gensim_2gram_sparse -- the name bound in the
# data-loading cell -- so this cell runs on a fresh kernel.
# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
xg_gridsearch_gensim = run_model(X_train_gensim_2gram_sparse, X_test_gensim_2gram_sparse, y_train, y_test,
                                 XGBClassifier(), grid_search = True, scoring = None,
                                 model_params = params, display_report = True,
                                )

In [None]:
# Grid search over the two sampling ratios for the XGBoost model on the
# gensim 2-gram features. subsample and colsample_bytree each take
# 0.6, 0.7, 0.8, 0.9 (4 x 4 combinations); the remaining entries are fixed.
params = {
    'n_estimators': [1000],
    'max_depth': [3],
    'min_child_weight': [1],
    'subsample': [tenths / 10.0 for tenths in range(6, 10)],
    'colsample_bytree': [tenths / 10.0 for tenths in range(6, 10)],
    'alpha': [1],
    'gamma': [0.1],
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/subsample_colsample_xgboost_gensim_2gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
xg_gridsearch_gensim = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    grid_search=True,
    scoring=None,
    model_params=params,
    display_report=True,
)

In [None]:
# Second sampling-ratio grid search on the gensim 2-gram features, this time
# over the higher values 0.9, 0.95 and 1 for both subsample and
# colsample_bytree (3 x 3 combinations); the remaining entries are fixed.
params = {
    'n_estimators': [1000],
    'max_depth': [3],
    'min_child_weight': [1],
    'subsample': [0.9, 0.95, 1],
    'colsample_bytree': [0.9, 0.95, 1],
    'alpha': [1],
    'gamma': [0.1],
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/subsample_colsample_higher_xgboost_gensim_2gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
xg_gridsearch_gensim = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    grid_search=True,
    scoring=None,
    model_params=params,
    display_report=True,
)

## Tuned XGBoosted Models

In [None]:
# Fixed hyperparameters for the XGBoost model on the gensim 2-gram features
# (scalar values, no grid search) -- presumably the settings picked by the
# tuning runs; confirm against their reports.
params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1,
    'gamma': 0.1,
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/best_xg_gensim_2gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
best_xg_gensim_2gram = run_model(
    X_train_gensim_2gram_sparse,
    X_test_gensim_2gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    model_params=params,
    display_report=True,
)

In [None]:
# Fixed hyperparameters for the XGBoost model on the gensim 3-gram features
# (scalar values, no grid search) -- presumably the settings picked by the
# tuning runs; confirm against their reports.
params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1,
    'gamma': 0.1,
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/best_xg_gensim_3gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
best_xg_gensim_3gram = run_model(
    X_train_gensim_3gram_sparse,
    X_test_gensim_3gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    model_params=params,
    display_report=True,
)

In [None]:
# Fixed hyperparameters for the XGBoost model on the gensim 4-gram features
# (scalar values, no grid search) -- presumably the settings picked by the
# tuning runs; confirm against their reports.
params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1,
    'gamma': 0.1,
}

# Destination used only when the pickling options below are enabled.
pickle_dest = '../data/models/best_xg_gensim_4gram.pickle'

# The pickling options are left disabled; to enable them pass:
#     pickle_=True, pickle_dest=pickle_dest
best_xg_gensim_4gram = run_model(
    X_train_gensim_4gram_sparse,
    X_test_gensim_4gram_sparse,
    y_train,
    y_test,
    XGBClassifier(),
    model_params=params,
    display_report=True,
)

In [None]:
# Fixed hyperparameters for the XGBoost model on the glove features (scalar
# values, no grid search). n_estimators is 500 in this cell.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1,
    'gamma': 0.1,
}

# Pickling is switched on for this run, so the destination is used.
pickle_dest = '../data/models/best_xg_glove.pickle'

# display_report=True is left disabled for this run.
best_xg_glove = run_model(
    X_train_glove,
    X_test_glove,
    y_train,
    y_test,
    XGBClassifier(),
    model_params=params,
    pickle_=True,
    pickle_dest=pickle_dest,
)

In [None]:
# Fixed hyperparameters for the XGBoost model on the word2vec features
# (scalar values, no grid search). n_estimators is 500 in this cell.
params = {
    'n_estimators': 500,
    'max_depth': 3,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'alpha': 1,
    'gamma': 0.1,
}

# Pickling is switched on for this run, so the destination is used.
pickle_dest = '../data/models/best_xg_word2vec.pickle'

# display_report=True is left disabled for this run.
best_xg_word2vec = run_model(
    X_train_word2vec,
    X_test_word2vec,
    y_train,
    y_test,
    XGBClassifier(),
    model_params=params,
    pickle_=True,
    pickle_dest=pickle_dest,
)