In [1]:
import os
import time
import numpy as np
import pandas as pd
from collections import defaultdict, OrderedDict

import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

import ipywidgets as widgets
from IPython.display import display, Markdown, HTML, clear_output, display_html

import warnings
warnings.filterwarnings("ignore")

from src.train import Train
from src.train.ensemble import Model
from src.config import Config

In [2]:
import nltk
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
CLASSFICATION_ALGORITHMS = dict(
    XGBC = dict(alg=XGBClassifier, args=dict(random_state=42, use_label_encoder=False, early_stopping=5, 
                                             enable_categorical=True,
                                             eval_metric="aucpr",
                                             sample_weight=None
                                            )),
    
    LGBMC = dict(alg=LGBMClassifier, args=dict(early_stopping=5,
                                               class_weight=None,
                                              )),
    
    XGBC_TUNED = dict(alg=XGBClassifier, args=dict(random_state=42, use_label_encoder=False, early_stopping=5,
                                                   enable_categorical=True,
                                                   eval_metric="aucpr",
                                                   sample_weight=None),
                      param_grid = {
#                           "scale_pos_weight": [0.5, 1.0, 2.0, 4.5],
                          "n_estimators": [25, 50, 100],
                          "max_delta_step": [0, 1.0, 3.0, 5.0],
                          "max_bin": [2, 5, 7, 10],
                          "max_depth": [5, 6, 8, None],
                          "gamma": [0.5, 1.5, 2.5, 4],
                          "min_child_weight": [0.05, 0.01, 1, 2],
                          "eta": [0.005, 0.01, 0.05],
                          "learning_rate": [0.01, 0.05, 0.1],
#                           "subsample": [0.5, 0.7],
#                           "colsample_bytree": [0.5, 0.7],
#                           "colsample_bylevel": [0.5, 0.7],
#                           "colsample_bynode": [0.5, 0.7],
#                           "alpha": [0.5, 0.7, 0.9, 1.3],
#                           "lambda": [0.5, 0.7, 0.9, 1.3],
#                           "reg_alpha": [0.5, 0.7, 0.9, 1.3],
#                           "reg_lambda": [0.5, 0.7, 0.9, 1.3],
                      }
                     ),
    
    LGBMC_TUNED = dict(alg=LGBMClassifier, args=dict(early_stopping=5,
                                                    class_weight=None,
                                                   ),
                      param_grid = {
#                           "scale_pos_weight": [0.5, 1.0, 2.0, 4.5],
                          "n_estimators": [25, 50, 100],
                          "max_delta_step": [0, 3, 6, 9],
                          "max_bin": [2, 5, 7, 10],
                          "max_depth": [5, 6, 8, None],
                          "gamma": [0.5, 1.5, 2.5, 4],
                          "min_child_weight": [0.05, 0.01, 1, 2],
                          "min_sum_hessian_in_leaf": [0.001, 0.005, 0.01, 0.05],
                          "min_data_in_leaf": [60, 120, 240],
                          "eta": [0.005, 0.01, 0.05],
                          "learning_rate": [0.01, 0.05, 0.1],
#                           "lambda_l1": [0.5, 0.7, 0.9, 1.3],
#                           "lambda_l2": [0.5, 0.7, 0.9, 1.3],
#                           "reg_alpha": [0.5, 0.7, 0.9, 1.3],
#                           "reg_lambda": [0.5, 0.7, 0.9, 1.3],
#                           "path_smooth": [0.5, 0.7, 0.9, 1.3],
#                           "bagging_fraction": [0.5, 0.7, 0.8],
#                           "feature_fraction": [0.5, 0.7, 0.8],
#                           "colsample_bytree": [0.5, 0.7],
                      }
                     ),
)

In [4]:
display(Markdown("<h2>Book Reviews Sentiment Analysis</h2>"))
loading_section       = ["Prepare Model Data"]
sections              = ["Ensemble Model", "LSTM", "BERT"]
conclusion_section    = ["Summary"]

train_sub_section   = ["Train", "Model Evaluation", "Model Interpretation"]
me_sub_section      = ["Best Model", "Interpretability"]

accordions = OrderedDict()
accordions["** Loading **"] = widgets.Accordion(children=[widgets.Output() for section in loading_section])
[accordions["** Loading **"].set_title(i, section) for i, section in enumerate(loading_section)]

for section in sections:
    if (section == "Ensemble Model") or (section == "LSTM") or (section == "BERT") :
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in train_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(train_sub_section)]
    else:
        accordions[section] = widgets.Accordion(children=[widgets.Output() for sub_section in me_sub_section])
        [accordions[section].set_title(i, sub_section) for i, sub_section in enumerate(me_sub_section)]
        
accordions["** Conclusion **"] = widgets.Accordion(children=[widgets.Output() for section in conclusion_section])
[accordions["** Conclusion **"].set_title(i, section) for i, section in enumerate(conclusion_section)]
        
widget_fields = widgets.Tab(children=[accordions[t] for t in accordions])
[widget_fields.set_title(i, sub) for i, sub in enumerate(accordions.keys())]

<h2>Book Reviews Sentiment Analysis</h2>

[None, None, None, None, None]

In [5]:
widget_fields

Tab(children=(Accordion(children=(Output(),), titles=('Prepare Model Data',)), Accordion(children=(Output(), O…

In [6]:
train = Train(target_var="sentiment", predictives="reviews", suffix="")
self = train

In [7]:
%matplotlib agg

with accordions["** Loading **"].children[0]:
    clear_output()
    display(Markdown("<h2> Initiating Modelling Data Preparation ... </h2>"))
    train.prepare_model_data()

#     display(train.histogram_plot(xvar=train.token_lens, 
#                                  xlabel="Token count: Most of the reviews seem to contain less than 128 tokens, to be on the safe side and choose a maximum length of 160."))

In [8]:
%matplotlib agg
section = "Ensemble Model"
algorithms = ["LGBMC", "LGBMC_TUNED"]

with accordions[section].children[0]:
    clear_output()
    display(Markdown(r"<h2> Initiate Training of Ensemble Models on Sentiment Analysis: </h2>"))
    train.run(data=train.data["reviews_abt"], algorithms=algorithms)

In [10]:
%matplotlib agg
section = "Ensemble Model"
algorithms = ["LGBMC", "LGBMC_TUNED"]

with accordions[section].children[1]:
    clear_output()
    display(Markdown(r"<h2> Evaluate Model: </h2>"))
    display(train.data["metrics_df"])
    
    for alg in train.models:
        display(Markdown(r"<h4> Model: {} </h4>".format(alg)))
        display(train.confusion_matrix_plot(cf_matrix=train.models[alg]["test_metrics"]["CM"]))

In [None]:
%matplotlib agg
section = "Ensemble Model"
algorithms = ["LGBMC", "LGBMC_TUNED"]

with accordions[section].children[2]:
    clear_output()
    display(Markdown(r"<h2> Evaluate Model: </h2>"))
    display(train.data["metrics_df"])
    
    for alg in train.models:
        display(Markdown(r"<h4> Model: {} </h4>".format(alg)))
        display(train.confusion_matrix_plot(cf_matrix=train.models["LGBMC_TUNED"]["test_metrics"]["CM"]))