In [4]:
pip install wordcloud


Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install plotly

Collecting plotly
  Downloading plotly-4.13.0-py2.py3-none-any.whl (13.1 MB)
     |████████████████████████████████| 13.1 MB 10.5 MB/s eta 0:00:01
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... done
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11430 sha256=151f29f1fd5424eebed69fb9ce2b68e2575febe0a3e7280d13ad7f329912a78f
  Stored in directory: /root/.cache/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.13.0 retrying-1.3.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
# data manipulation
import pandas as pd

# numpy arrays
import numpy as np

# data visualization
import seaborn as sns

import matplotlib.pyplot as plt

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer



# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()

# NLP
import string


import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords


# Fetch the NLTK resources this notebook relies on, in the same order as before.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)


import re

# machine learning
from sklearn.datasets import fetch_20newsgroups

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.naive_bayes import MultinomialNB           # Naive Bayes
from sklearn.svm import LinearSVC                       # SVM
from sklearn.ensemble import RandomForestClassifier     # Random Forest

from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.calibration import CalibratedClassifierCV

# Model explainability

# other
from pprint import pprint
from time import time
import logging
from functools import partial
import joblib



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
def get_n_color_palette(palette_name, n_colors, as_hex=False):
    """Build a seaborn color palette and return it in reversed order.

    Parameters
    ----------
    palette_name
        Palette identifier forwarded to ``sns.color_palette``.
    n_colors
        Number of colors requested from the palette.
    as_hex : bool, default False
        When true, the palette is converted with ``as_hex()`` before it is
        reversed.

    Returns
    -------
    The palette object (or its hex conversion), reversed in place.
    """
    colors = sns.color_palette(palette=palette_name, n_colors=n_colors)
    colors = colors.as_hex() if as_hex else colors
    colors.reverse()
    return colors

In [8]:
# English stop words from NLTK, de-duplicated and sorted. Sorting keeps the
# list order identical on every run; iterating a bare set of strings can yield
# a different order in each interpreter session.
en_stop_words = sorted(set(stopwords.words('english')))

In [9]:
def save_fig_as_div(fig_obj, file_name):
    """Render a plotly figure as an HTML ``<div>`` and write it under ``assets/``.

    Parameters
    ----------
    fig_obj
        Figure (or figure data) passed to ``plotly.offline.plot``.
    file_name : str
        Name of the output file, relative to the ``assets/`` directory.

    Notes
    -----
    The div is generated with ``include_plotlyjs='cdn'``. The target directory
    is created if it does not exist, and the file is written as UTF-8.
    """
    import os

    # Render first: if rendering fails, no existing file gets truncated and no
    # empty file is left behind.
    fig_div_string = plotly.offline.plot(figure_or_data=fig_obj, output_type='div',
                                         include_plotlyjs='cdn')

    target_path = f'assets/{file_name}'
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # Explicit encoding so non-ASCII text in the figure does not depend on the
    # platform's default locale.
    with open(target_path, 'w', encoding='utf-8') as fig_file:
        fig_file.write(fig_div_string)

In [10]:
def get_classification_report(y_true, y_pred, target_names):
    """Plot the classification report as an annotated plotly heatmap.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded to ``classification_report``.
    y_pred
        Predicted labels, forwarded to ``classification_report``.
    target_names
        Display names of the classes, forwarded to ``classification_report``.

    Returns
    -------
    The figure produced by ``ff.create_annotated_heatmap``: one row per entry
    of the report, one column per measure (the ``support`` column is left
    out), with the color scale shown.
    """
    report_dict = classification_report(y_true=y_true, y_pred=y_pred,
                                        target_names=target_names,
                                        output_dict=True)

    # Tabulate with report entries as rows, then discard the 'support' column.
    report_table = pd.DataFrame(data=report_dict).T.drop(columns=['support'])

    heatmap_fig = ff.create_annotated_heatmap(report_table.values,
                                              x=report_table.columns.tolist(),
                                              y=report_table.index.tolist())

    # Fixed-size figure on a transparent background with monospace text.
    text_font = {'family': "Courier New, monospace", 'size': 14}
    heatmap_fig.update_layout(autosize=False,
                              width=800, height=800,
                              title_text='<i><b>Classification report</b></i>',
                              xaxis_title="Measures", yaxis_title="Class",
                              plot_bgcolor='rgba(0, 0, 0, 0)',
                              paper_bgcolor='rgba(0, 0, 0, 0)',
                              font=text_font)
    heatmap_fig.update_xaxes(tickangle=-45)
    heatmap_fig['data'][0]['showscale'] = True

    return heatmap_fig

In [11]:
def get_confusion_matrix(y_true, y_pred, labels):
    """Plot the confusion matrix as an annotated plotly heatmap.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded to ``confusion_matrix``.
    y_pred
        Predicted labels, forwarded to ``confusion_matrix``.
    labels
        Class labels that index the matrix. Any array-like is accepted
        (list, tuple, NumPy array, ...); the same values are used as the
        axis tick labels.

    Returns
    -------
    The figure produced by ``ff.create_annotated_heatmap``, with predicted
    categories on the x axis, real categories on the y axis and the color
    scale shown.
    """
    # Normalize to a plain list so callers are not required to pass an object
    # that exposes ``.tolist()``.
    label_names = np.asarray(labels).tolist()

    # calculate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true,
                                   y_pred=y_pred,
                                   labels=labels)
    # Rows are flipped and the y labels reversed to match, so each row keeps
    # its own label (presumably so the first label renders at the top of the
    # heatmap - confirm against the rendered figure).
    conf_matrix = np.flipud(conf_matrix)

    # create annotated heat map of the confusion matrix
    fig = ff.create_annotated_heatmap(conf_matrix,
                                      x=label_names,
                                      y=label_names[::-1])
    fig.update_layout(autosize=False,
                      width=800, height=800,
                      title_text='<i><b>Confusion matrix</b></i>',
                      xaxis_title="Predicted category", yaxis_title="Real category",
                      plot_bgcolor='rgba(0, 0, 0, 0)',
                      paper_bgcolor='rgba(0, 0, 0, 0)',
                      font={
                          'family': "Courier New, monospace",
                          'size': 14,
                      }
                     )
    fig.update_xaxes(tickangle=-45)
    fig['data'][0]['showscale'] = True

    return fig

In [12]:
# Load the 20 newsgroups corpus; subset='all' requests the train and test
# portions together. No `remove` argument is passed, so the posts are loaded
# without stripping headers, footers or quoted replies.
newsgroups_data = fetch_20newsgroups(subset='all')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
