In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.manifold import TSNE
from joblib import parallel_backend
from project_functions.modeling import vectorize_data
import pickle

In [None]:
# Load the tokenized dataset from the cleaned-data directory into `df`.
df = pd.read_parquet(path = '../data/cleaned/tokenized_data_complete.parquet')

In [None]:
# Class balance check: number of rows for each distinct genre_name value.
df['genre_name'].value_counts()

In [None]:
def run_model(X_train, X_test, y_train, y_test, model, model_params = None, grid_search = False,
              random_state = 42, scoring = make_scorer(accuracy_score), cv = 3,
              plot_confusion = False, display_report = False, pickle = False, pickle_dest = None):
    """Fit a classifier (optionally through a grid search), report on the test split
    and optionally pickle the fitted object.

    Parameters
    ----------
    X_train, X_test : feature matrices for the train and test splits.
    y_train, y_test : labels for the train and test splits.
    model : estimator exposing get_params / set_params / fit / predict / score.
        It is configured and fitted in place.
    model_params : dict or None
        With ``grid_search=False`` these are applied to ``model`` via ``set_params``.
        With ``grid_search=True`` this is the ``param_grid`` handed to ``GridSearchCV``.
    grid_search : bool
        Wrap ``model`` in a ``GridSearchCV`` when True.
    random_state : int
        Applied to ``model`` when it exposes a ``random_state`` parameter.
    scoring : scorer passed to ``GridSearchCV``; unused when ``grid_search`` is False.
    cv : cross-validation setting passed to ``GridSearchCV``; unused otherwise.
    plot_confusion : bool
        Plot the confusion matrix of the test-split predictions.
    display_report : bool
        Print the classification report of the test-split predictions.
    pickle : bool
        Pickle the fitted classifier to ``pickle_dest``.
    pickle_dest : str or None
        Destination file path; pickling is skipped with a message when None.

    Returns
    -------
    The fitted classifier: the ``GridSearchCV`` object when ``grid_search`` is True,
    otherwise ``model`` itself.

    Raises
    ------
    ValueError
        If ``grid_search`` is True and no ``model_params`` grid is given.
    """
    # The boolean `pickle` parameter shadows the module of the same name inside this
    # function, so the module is imported under a private alias for the dump below.
    import pickle as _pickle

    if grid_search and model_params is None:
        raise ValueError('model_params (the parameter grid) is required when grid_search is True.')

    # In the plain-fit case the explicit params go on first so that the seed below
    # still takes precedence, as it did before.
    if not grid_search and model_params is not None:
        model.set_params(**model_params)

    # Seed the underlying estimator itself. GridSearchCV only exposes nested keys such
    # as 'estimator__random_state', so checking the wrapper for 'random_state' never matched.
    if 'random_state' in model.get_params():
        model.set_params(random_state = random_state)

    with parallel_backend('threading', n_jobs = -1):
        if grid_search:
            classifier = GridSearchCV(estimator = model, param_grid = model_params,
                                      n_jobs = -1, scoring = scoring, cv = cv)
        else:
            classifier = model

        classifier.fit(X_train, y_train)

        # Predictions and the score are only needed when some form of report is requested.
        if plot_confusion or display_report:
            y_pred = classifier.predict(X_test)
            test_score = classifier.score(X_test, y_test)
            print('Score: {}'.format(test_score))

        if plot_confusion:
            ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()

        if display_report:
            print(classification_report(y_test, y_pred))

        if pickle:
            if pickle_dest is None:
                print('No pickle destination given, pickling skipped.')
            else:
                with open(pickle_dest, 'wb') as f:
                    _pickle.dump(classifier, f)

    return classifier
    

In [None]:
# Count-vectorized unigram features.
# min_df / max_df are presumably forwarded to the vectorizer by vectorize_data — confirm.
# NOTE(review): y_train / y_test are rebound by every vectorize_data call in this
# notebook; this presumes each call yields the same split — confirm it is seeded.
vector_args = dict(min_df = 0.05, max_df = 0.95)

(X_train_count_unigram,
 X_test_count_unigram,
 y_train,
 y_test,
 count_vectorizer_unigram) = vectorize_data(
    df,
    CountVectorizer(),
    vectorizer_args = vector_args,
    additional_features = True,
    min_samples = 2000,
    test_size = 0.2,
)
                                                               

In [None]:
# Count-vectorized features over unigrams and bigrams (ngram_range (1, 2)).
# NOTE(review): y_train / y_test are rebound by every vectorize_data call in this
# notebook; this presumes each call yields the same split — confirm it is seeded.
vector_args = dict(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2))

(X_train_count_bigram,
 X_test_count_bigram,
 y_train,
 y_test,
 count_vectorizer_bigram) = vectorize_data(
    df,
    CountVectorizer(),
    vectorizer_args = vector_args,
    additional_features = True,
    min_samples = 2000,
    test_size = 0.2,
)

In [None]:
# TF-IDF unigram features, same document-frequency bounds as the count variants.
# NOTE(review): y_train / y_test are rebound by every vectorize_data call in this
# notebook; this presumes each call yields the same split — confirm it is seeded.
vector_args = dict(min_df = 0.05, max_df = 0.95)

(X_train_tfidf_unigram,
 X_test_tfidf_unigram,
 y_train,
 y_test,
 tfidf_vectorizer_unigram) = vectorize_data(
    df,
    TfidfVectorizer(),
    vectorizer_args = vector_args,
    additional_features = True,
    min_samples = 2000,
    test_size = 0.2,
)
                                                               

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range (1, 2)).
# NOTE(review): y_train / y_test are rebound by every vectorize_data call in this
# notebook; this presumes each call yields the same split — confirm it is seeded.
vector_args = dict(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2))

(X_train_tfidf_bigram,
 X_test_tfidf_bigram,
 y_train,
 y_test,
 tfidf_vectorizer_bigram) = vectorize_data(
    df,
    TfidfVectorizer(),
    vectorizer_args = vector_args,
    additional_features = True,
    min_samples = 2000,
    test_size = 0.2,
)

In [None]:
# Multinomial Naive Bayes on the count unigram features: fit, show the confusion
# matrix and classification report, and pickle the fitted model.
pickle_dest = '../data/cleaned/multinomial_naive_bayes_count_unigram.pickle'

mnb_count_unigram = run_model(
    X_train_count_unigram, X_test_count_unigram, y_train, y_test,
    model = MultinomialNB(),
    plot_confusion = True,
    display_report = True,
    pickle = True,
    pickle_dest = pickle_dest,
)

In [None]:
# Multinomial Naive Bayes on the count bigram features: fit, show the confusion
# matrix and classification report, and pickle the fitted model.
pickle_dest = '../data/cleaned/multinomial_naive_bayes_count_bigram.pickle'

mnb_count_bigram = run_model(
    X_train_count_bigram, X_test_count_bigram, y_train, y_test,
    model = MultinomialNB(),
    plot_confusion = True,
    display_report = True,
    pickle = True,
    pickle_dest = pickle_dest,
)

In [None]:
# Multinomial Naive Bayes on the TF-IDF unigram features: fit, show the confusion
# matrix and classification report, and pickle the fitted model.
pickle_dest = '../data/cleaned/multinomial_naive_bayes_tfidf_unigram.pickle'

mnb_tfidf_unigram = run_model(
    X_train_tfidf_unigram, X_test_tfidf_unigram, y_train, y_test,
    model = MultinomialNB(),
    plot_confusion = True,
    display_report = True,
    pickle = True,
    pickle_dest = pickle_dest,
)

In [None]:
# Multinomial Naive Bayes on the TF-IDF bigram features: fit, show the confusion
# matrix and classification report, and pickle the fitted model.
pickle_dest = '../data/cleaned/multinomial_naive_bayes_tfidf_bigram.pickle'

mnb_tfidf_bigram = run_model(
    X_train_tfidf_bigram, X_test_tfidf_bigram, y_train, y_test,
    model = MultinomialNB(),
    plot_confusion = True,
    display_report = True,
    pickle = True,
    pickle_dest = pickle_dest,
)

In [None]:
# Baseline XGBoost classifier with default hyperparameters on the count bigram
# features: fit, show the confusion matrix and report, and pickle the fitted model.
pickle_dest = '../data/cleaned/baseline_xgboost_count_bigram.pickle'

baseline_xg_count_bigram = run_model(
    X_train_count_bigram, X_test_count_bigram, y_train, y_test,
    model = XGBClassifier(),
    plot_confusion = True,
    display_report = True,
    pickle = True,
    pickle_dest = pickle_dest,
)

In [None]:
# Tuning max depth and min child weight
# Only max_depth and min_child_weight vary; every other entry is a single-value
# list, so it is held fixed across the grid.
params = {
    'n_estimators': [1000],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5],  # trailing comma was missing here (SyntaxError)
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'alpha': [1],
    'gamma': [0],
    'scale_pos_weight': [1]
}

pickle_dest = '../data/cleaned/max_depth_min_child_xgboost_count_bigram.pickle'

# NOTE(review): the result name says "unigram" but the search is run on the count
# *bigram* features (matching the pickle file name). The name is kept unchanged so
# that any later cell referring to it keeps working.
# scoring = None is passed through to GridSearchCV in place of run_model's default
# accuracy scorer.
xg_gridsearch_count_unigram = run_model(X_train_count_bigram, X_test_count_bigram, y_train, y_test,
                                        XGBClassifier(), grid_search = True, scoring = None,
                                        model_params = params, plot_confusion = True, display_report = True,
                                        pickle = True, pickle_dest = pickle_dest)