## Imports and Settings


---

In [5]:
%%capture

# !pip install simpletransformers
# !pip install neattext
# !pip install spacy
# !pip install torch

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import neattext.functions as ntfx
import nltk
import spacy
import torch

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from simpletransformers.classification import ClassificationModel, ClassificationArgs

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight

nltk.download("punkt")
spacy.cli.download("en_core_web_sm")
sp = spacy.load("en_core_web_sm")
spacy_stopwords = list(sp.Defaults.stop_words)
stemmer = SnowballStemmer("english")

%matplotlib inline
plt.style.use("seaborn-notebook")

np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nsilvest\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# read the exported case data into a DataFrame
RAW_DATA_PATH = "Cherwell Case Export - Assessment Sample.csv"
raw_data = pd.read_csv(RAW_DATA_PATH)

## Pre-processing


---

In [None]:
# --- Create initial df clean & filter function ---

def clean_and_filter(raw_data):
    """
    Return a cleaned and filtered version of the raw case export.

    Steps, in order:
      1. keep rows whose "Source" is "E-mail" and whose "Category" is not
         "Undeliverable Emails";
      2. keep only the columns at positions 4, 9 and 14 and rename
         "Case ID" / "Description - Rich Text" / "Owned By Team" to
         case_id / email_text / owned_by;
      3. drop duplicate rows, then rows containing any missing value;
      4. strip the first 54 characters of every email and lowercase the rest.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw export containing at least the "Source" and "Category" columns
        plus the three renamed columns at positions 4, 9 and 14.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns case_id, email_text and owned_by. The
        input frame is not modified.
    """
    # apply source and category filters
    is_email = raw_data["Source"] == "E-mail"
    is_deliverable = raw_data["Category"] != "Undeliverable Emails"

    df = (
        raw_data.loc[is_email & is_deliverable]
        # remove irrelevant features (positional selection)
        .iloc[:, [4, 9, 14]]
        # shorten column names
        .rename(columns={"Case ID": "case_id",
                         "Description - Rich Text": "email_text",
                         "Owned By Team": "owned_by"})
        # remove duplicate rows
        .drop_duplicates()
        # remove emails with no contents or label
        .dropna()
    )

    # remove emails with <100 class instances
    # commented out as incompatible with assessment sample data
    # df = df.groupby("owned_by").filter(lambda x: len(x) > 99)

    # remove 'please categorise this call auto-logged...' text (first 54
    # characters) and normalise case
    email_text = df["email_text"].str[54:].str.lower()

    # assign() builds a new frame rather than writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning
    return df.assign(email_text=email_text)

In [None]:
# --- Create initial pre-processing and outlier removal function ---

def remove_outliers(df, min_chars=50, max_chars=5000):
    """
    Drop emails whose text is overly short or overly long.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "email_text" string column.
    min_chars : int, default 50
        Exclusive lower bound on the email length in characters.
    max_chars : int, default 5000
        Exclusive upper bound on the email length in characters.

    Returns
    -------
    pandas.DataFrame
        The rows of df whose email length is strictly between the bounds.
    """
    # compute the lengths once and reuse them for both bounds
    lengths = df["email_text"].str.len()

    return df.loc[(lengths > min_chars) & (lengths < max_chars)]

def pre_processing(df):
    """
    Apply the initial text pre-processing sequence to the "email_text" column
    using the functions provided in the neattext library:
    https://github.com/Jcharis/neattext

    The steps run in a fixed order (later steps rely on earlier ones, e.g. the
    footer patterns are written without punctuation because special
    characters have already been removed). Finally the first 12 characters of
    every email are stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "email_text" string column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned "email_text"; the frame passed in is
        left unmodified.
    """
    date_suffix_pattern = r"(\d)(st|nd|rd|th)"

    # aberdeenshire's standard gaelic email footer; written as alternatives so
    # each fragment is removed wherever it occurs
    gaelic_footer_pattern = (
        r"dhfhaodadh fiosrachadh sochaire a tha amhin airson an|"
        r"neach gu bheil am postdealain air a chur a bhith an|"
        r"seo ma tha thu air am postdealain fhaighinn mar|"
        r"mhearachd gabh ar leisgeul agus cuir fios chun an|"
        r"neach a chuir am postdealain agus dubh s am postdealain|"
        r"an didh sin s e beachdan an neach a chuir am postdealain|"
        r"a tha ann an gin sam bith a thid a chur an cill agus|"
        r"chan eil e a ciallachadh gu bheil iad a|"
        r"riochdachadh beachdan chomhairle shiorrachd obar dheathain")

    # aberdeenshire's standard english email footer
    english_footer_pattern = (
        r"this email may contain privileged information intended solely|"
        r" for the use of the individual to whom it is addressed if|"
        r"you have received this email in error please accept our|"
        r"apologies and notify the sender deleting the email afterwards|"
        r"any views or opinions presented are solely those of the emails|"
        r"author and do not necessarily represent those of|"
        r"aberdeenshire council")

    # ordered cleaning steps; each takes and returns a single string
    cleaning_steps = [
        # remove line break indicators
        lambda x: ntfx.remove_custom_pattern(x, r"\\r|\\n"),
        lambda x: ntfx.remove_urls(x),
        lambda x: ntfx.remove_terms_in_bracket(x, bracket_form="normal"),
        lambda x: ntfx.remove_html_tags(x),
        lambda x: ntfx.remove_emails(x),
        lambda x: ntfx.remove_dates(x),
        lambda x: ntfx.remove_special_characters(x),
        # remove date suffixes
        lambda x: ntfx.remove_custom_pattern(x, date_suffix_pattern),
        lambda x: ntfx.remove_numbers(x),
        lambda x: ntfx.remove_multiple_spaces(x),
        # second iteration of removing date suffixes after removing numbers
        lambda x: ntfx.remove_custom_pattern(x, date_suffix_pattern),
        lambda x: ntfx.remove_custom_pattern(x, gaelic_footer_pattern),
        lambda x: ntfx.remove_custom_pattern(x, english_footer_pattern),
    ]

    email_text = df["email_text"]
    for step in cleaning_steps:
        email_text = email_text.apply(step)

    # effectively remove the "please consider this call categorised..." prefix
    email_text = email_text.str[12:]

    # assign() returns a new frame, so the caller's df is not mutated
    return df.assign(email_text=email_text)

In [None]:
#  --- Prepare data for Pipeline 1 ---

# first pass: clean/filter the raw export, then drop emails whose raw text
# is overly short/long
df_baseline = remove_outliers(clean_and_filter(raw_data))

# second pass: pre-process the text, then re-apply the length filter to the
# pre-processed text
df_baseline = remove_outliers(pre_processing(df_baseline))

In [None]:
# --- Prepare data for Pipeline 2 ---

# create a real copy of df_baseline (plain assignment would only alias it,
# so the "owned_id" column below would also be added to df_baseline)
df_transformer = df_baseline.copy()

# encode numerical rep of "owned_by"
df_transformer["owned_id"] = df_transformer["owned_by"].astype(
    "category").cat.codes

# as data is imbalanced, capture class weights for model training
# NOTE(review): weights are computed over all rows, i.e. before the
# train/eval split - confirm this is intended
labels_transformer = df_transformer["owned_by"].values

labels_transformer_weights = list(compute_class_weight(
    class_weight="balanced",
    classes=np.unique(labels_transformer),
    y=labels_transformer))

# keep only the text and the encoded label (drops case_id and owned_by),
# selected by name rather than by column position
df_transformer = df_transformer[["email_text", "owned_id"]]

# rename columns to align with simpletransformer api
df_transformer = df_transformer.rename(columns={
    "email_text":"text",
    "owned_id":"label"})

In [None]:
# --- Create train/test split to prevent leakage into subsequent representation learning ---

# both pipelines are split with identical settings
split_options = {"test_size": 0.20, "shuffle": True, "random_state": 0}

# Pipeline 1: split the text features and team labels as arrays
features_baseline = df_baseline["email_text"].values
labels_baseline = df_baseline["owned_by"].values

(X_train_baseline,
 X_test_baseline,
 y_train_baseline,
 y_test_baseline) = train_test_split(features_baseline,
                                     labels_baseline,
                                     **split_options)

# Pipeline 2: split the text/label frame as a whole
df_transformer_train, df_transformer_eval = train_test_split(
    df_transformer, **split_options)

## Implementation Pipelines


---

### Pipeline 1: Baseline Approach


In [None]:
# --- Create tfidfvectorization class/function ---

class TFIDFVectorizer_Plus_Stemmer(TfidfVectorizer):
    """
    TfidfVectorizer subclass whose analyzer stems every token.

    build_analyzer is overridden so that each word produced by the parent
    class's analyzer is passed through an English Snowball stemmer.
    """
    # one stemmer shared by every instance of the class
    stemmer = SnowballStemmer("english")

    def build_analyzer(self):
        """Return a callable mapping an email to a generator of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(email):
            return (TFIDFVectorizer_Plus_Stemmer.stemmer.stem(token)
                    for token in base_analyzer(email))

        return stemmed_tokens

# create the tokenizer function to pass into the TFIDFVectorizer

def email_tokenizer(email):
    """
    Tokenize an email with NLTK's standard word_tokenize and return the
    tokens as a list. Intended to be passed as the "tokenizer" parameter of
    the TFIDFVectorizer.
    """
    return list(word_tokenize(email))

In [None]:
# --- Create tfidf representation ---

# spaCy's stop-word list is used in place of the tfidfvectorizer defaults
spacy_stopwords = list(sp.Defaults.stop_words)

# custom stemming vectorizer with word-level analysis, the NLTK tokenizer
# and the spaCy stop words
tfidf_vectorizer = TFIDFVectorizer_Plus_Stemmer(
    analyzer="word",
    tokenizer=email_tokenizer,
    stop_words=spacy_stopwords,
)

# fit on the training features only and convert them to a numerical
# representation
X_train_baseline_vector = tfidf_vectorizer.fit_transform(X_train_baseline)

# transform the test features with the already-fitted vectorizer
X_test_baseline_vector = tfidf_vectorizer.transform(X_test_baseline)

In [None]:
%%capture

# --- Instantiate a range of potential baseline model options ---

# model selection made with reference to
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

models = [LinearSVC(),
          LogisticRegression(),
          MultinomialNB(),
          RandomForestClassifier()
          ]

# number of cross-validation folds per model
cross_validation_n = 5

# one (model_name, fold_iter, f1) record per model and fold; the DataFrame is
# built once from these records after the loop
cross_validation_df_input = []

# apply sklearn's cross_val_score method to each potential baseline model
for model in models:
    model_name = model.__class__.__name__

    f1_scores = cross_val_score(model,
                                X_train_baseline_vector,
                                y_train_baseline,
                                scoring="f1_macro",
                                cv=cross_validation_n)

    for fold_iter, f1 in enumerate(f1_scores):
        cross_validation_df_input.append((model_name, fold_iter, f1))

cross_validation_df = pd.DataFrame(cross_validation_df_input,
                                  columns=["model_name",
                                           "fold_iter",
                                           "f1-macro"])

In [None]:
# --- Assess performance of potential baseline models ---

# mean macro-F1 across folds for each candidate model
cross_validation_df.groupby("model_name")["f1-macro"].agg("mean")

# LinearSVC selected as the baseline model as it has the highest F1 macro score

In [None]:
%%capture

# --- Determine most effective parameters for selected baseline model ---

baseline_model = LinearSVC()

# LinearSVC requires C to be strictly positive, so the grid starts at a small
# positive value instead of 0 (every fit with C=0 fails)
baseline_model_paras = {"C":(0.01, 0.1, 1.0, 10),
                        "loss":("hinge", "squared_hinge")}

# score with macro-F1 so the search optimises the same metric that was used
# to select the baseline model (the GridSearchCV default would be accuracy)
baseline_clf = GridSearchCV(baseline_model,
                            baseline_model_paras,
                            scoring="f1_macro")

baseline_clf.fit(X_train_baseline_vector,
                 y_train_baseline)

In [None]:
# --- Fit data to selected baseline model and generate predictions ---

# rebuild LinearSVC with the best grid-search parameters and fit it on the
# full training split (fit returns the estimator itself)
baseline_model_rev = LinearSVC(**baseline_clf.best_params_).fit(
    X_train_baseline_vector,
    y_train_baseline)

# predictions on the held-out test features
y_baseline_pred = baseline_model_rev.predict(X_test_baseline_vector)

### Pipeline 2: Transformer-based Approach


In [None]:
# --- Create classification evaluation functions for transformer evaluation ---

def create_clf_report(labels, preds):
    """
    Build sklearn's classification report for the given true labels and
    predictions. The report is returned as a dictionary (output_dict=True)
    so it can be manipulated later for visualisation.
    """
    return classification_report(y_true=labels, y_pred=preds, output_dict=True)

def create_f1_score(labels, preds):
    """
    Compute the macro-averaged F1 score (sklearn f1_score with
    average="macro") for the given true labels and predictions.
    """
    return f1_score(y_true=labels, y_pred=preds, average="macro")

def create_precision_score(labels, preds):
    """
    Compute the macro-averaged precision (sklearn precision_score with
    average="macro") for the given true labels and predictions.
    """
    return precision_score(y_true=labels, y_pred=preds, average="macro")

def create_recall_score(labels, preds):
    """
    Compute the macro-averaged recall (sklearn recall_score with
    average="macro") for the given true labels and predictions.
    """
    return recall_score(y_true=labels, y_pred=preds, average="macro")

In [None]:
# --- Instantiate roberta-base transformer ---

# Transformer pipeline uses the simpletransformers library which
# abstracts various Hugging Face methods into a standardised
# and simplified pattern of: initialise a task-specific model >
# train the model > evaluate the model > make predictions
# https://github.com/ThilinaRajapakse/simpletransformers/tree/master

# GPU availability check - is_available must be called: the bare function
# object is always truthy, which would request CUDA even on CPU-only machines
cuda_available = torch.cuda.is_available()

# simpletransformer step: initialise a task-specific model
transformer_model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=len(set(labels_transformer)),
    use_cuda=cuda_available,
    # passing the computed class weights into the weight parameter to adjust
    # the loss calculations to take imbalanced class distribution into account
    weight=labels_transformer_weights)

In [None]:
# --- Finetune/train checkpoint version of roberta ---

# Fine-tuning is required otherwise the roberta model will map its pretrained/
# default output to the embedded input. Additional training allows the model
# outputs to be adjusted to the labelled training data.

# limited parameter tuning / adjustments to ensure available GPU resources
# are not being underutilised
transformer_model_args = {
    "use_multiprocessing":False,
    "use_multiprocessing_for_evaluation":False,
    "overwrite_output_dir":True,
    "num_train_epochs":30,
    "train_batch_size":128, # default = 8 (revert if causes OOM error)
    "learning_rate":1e-5 # default 4e-5
}

# simpletransformer step: train the model
# additional evaluation metrics incorporated into default train_model method;
# each keyword names a metric and is given the matching wrapper function
# (precision_score previously pointed at create_clf_report by mistake)
transformer_model.train_model(df_transformer_train,
                              clf_report=create_clf_report,
                              f1_score=create_f1_score,
                              precision_score=create_precision_score,
                              recall_score=create_recall_score,
                              args=transformer_model_args)

# simpletransformer step: evaluate the model
# same evaluation metrics incorporated into default eval_model method
transformer_result, transformer_model_ouputs,\
 transformer_wrong_preds = transformer_model.eval_model(
    df_transformer_eval,
    clf_report=create_clf_report,
    f1_score=create_f1_score,
    precision_score=create_precision_score,
    recall_score=create_recall_score,)

## Results


---

In [None]:
# --- Create classification report -> dataframe function ---

def clf_report_to_df(clf_report, pipeline_name):
    """
    Convert a classification report dictionary (as generated by both models)
    into a df with one row per class to enable subsequent visualisation.

    Parameters
    ----------
    clf_report : dict
        Classification report in dictionary form, keyed by class label plus
        the aggregate entries ("accuracy", "macro avg", ...).
    pipeline_name : str
        Value written to the leading "pipeline" column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns: pipeline, owned_by, then the report metrics. Aggregate rows
        are excluded. Labels found in the hard-coded code-to-team table below
        are replaced by the team name; other labels are kept as they are.
    """
    # aggregate entries of the report that are not individual classes
    summary_rows = ["accuracy", "micro avg", "macro avg",
                    "weighted avg", "samples avg"]

    # Drop the aggregate rows by name rather than keeping the first n rows:
    # this removes the dependence on the global df_baseline and stays correct
    # when the report contains fewer classes than the full dataset.
    df = pd.DataFrame.from_dict(clf_report).T
    df = df.drop(index=summary_rows, errors="ignore")

    df = df.reset_index()
    df = df.rename(columns={"index":"owned_by"})

    # numeric label code -> team name
    clf_report_dict_map = {
        "0.0":"HR Advisor",
        "1.0":"HR Payroll (Local Government)",
        "2.0":"HR Payroll (Teaching)",
        "3.0":"HR Post Team",
        "4.0":"HR Resourcing",
        "5.0":"HR Reward and Analytics",
        "6.0":"HR Support (Local Government)",
        "7.0":"HR Support (Teaching)",
        "8.0":"HR Travel",
        "9.0":"askHR Advisor",
        "10.0":"iTrent Help"
    }

    df = df.replace({"owned_by": clf_report_dict_map})

    df.insert(0, "pipeline", pipeline_name)

    return df

In [None]:
# --- Generate evaluation metrics for Pipeline 1 ---

# classification report for the baseline predictions, in dictionary form
baseline_model_rev_classreport = classification_report(
    y_test_baseline,
    y_baseline_pred,
    output_dict=True)

# figure caption followed by a blank line
print("Figure 1 / Pipeline 1 - Baseline Classification Report\n")

# per-class report frame with "support" stored as whole numbers
df_clf_report_baseline = clf_report_to_df(
    baseline_model_rev_classreport, "Baseline"
).astype({"support": "int"})

# render the metric columns with in-cell bars
df_clf_report_baseline.style.bar(
    subset=["precision", "recall", "f1-score"],
    color="lightgrey",
    align=0,
    height=80,
)

In [None]:
# --- Generate evaluation metrics for Pipeline 2 ---

# figure caption followed by a blank line
print("Figure 2 / Pipeline 2 - Transformer Classification Report\n")

# per-class report frame with "support" stored as whole numbers
df_clf_report_transformer = clf_report_to_df(
    transformer_result["clf_report"], "Transformer"
).astype({"support": "int"})

# render the metric columns with in-cell bars
df_clf_report_transformer.style.bar(
    subset=["precision", "recall", "f1-score"],
    color="lightgrey",
    align=0,
    height=80,
)

In [None]:
# --- Format evaluation metrics into df for subsequent plotting ---

# work on a copy so the transformer columns are not also added to
# df_clf_report_baseline (plain assignment would only alias it)
df_clf_plot = df_clf_report_baseline.copy()

# transformer metrics are aligned to the baseline rows by index
# NOTE(review): this assumes both reports list the classes in the same
# order - confirm, or align on "owned_by" instead
df_clf_plot["transformer_precision"] = df_clf_report_transformer["precision"]
df_clf_plot["transformer_recall"] = df_clf_report_transformer["recall"]
df_clf_plot["transformer_f1"] = df_clf_report_transformer["f1-score"]

df_clf_plot = df_clf_plot.rename(columns={"precision":"baseline_precision",
                                          "recall":"baseline_recall",
                                          "f1-score":"baseline_f1"})

# drop the columns not needed for plotting, by name (these are the columns
# at positions 0 and 5 of the report frame)
df_clf_plot = df_clf_plot.drop(columns=["pipeline", "support"])

In [None]:
# --- Generate visualisation of selected classification metrics ---

# one y position per class row
my_range = range(1, len(df_clf_plot.index) + 1)
metrics = ["precision", "recall", "f1"]

# explicit lookups instead of resolving function names through globals()
# and special-casing the loop index:
# sklearn function used for the baseline macro score of each metric
macro_metric_funcs = {"precision": precision_score,
                      "recall": recall_score,
                      "f1": f1_score}
# key under which the transformer report stores each metric
# (the f1 score is labelled 'f1-score')
transformer_report_keys = {"precision": "precision",
                           "recall": "recall",
                           "f1": "f1-score"}

fig, axs = plt.subplots(1, 3, sharey=True)
fig.set_figwidth(15)

for ax, metric in zip(axs, metrics):

    ax.set_title(metric.title(), pad=10)
    ax.set_xlim(xmin=0, xmax=1.0)

    baseline_values = df_clf_plot["baseline_" + metric]
    transformer_values = df_clf_plot["transformer_" + metric]

    # connector between the two pipelines' scores for each class
    ax.hlines(y=my_range,
              xmin=baseline_values,
              xmax=transformer_values,
              color="grey",
              alpha=0.25)

    ax.scatter(baseline_values,
               my_range,
               color="blue",
               alpha=1,
               label="Baseline")

    ax.scatter(transformer_values,
               my_range,
               color="red",
               alpha=1,
               label="Transformer",
               marker="s")

    # vertical reference lines at each pipeline's macro-averaged score
    ax.axvline(
        x=macro_metric_funcs[metric](y_test_baseline,
                                     y_baseline_pred,
                                     average="macro"),
        color="blue",
        alpha=0.15,
        label="Baseline Macro"
    )

    ax.axvline(
        x=transformer_result["clf_report"]["macro avg"][
            transformer_report_keys[metric]],
        color="red",
        alpha=0.15,
        label="Transformer Macro"
    )

# the pyplot calls below act on the current axes
plt.legend(loc="lower right", bbox_to_anchor=(1.20, 0.75))

plt.yticks(my_range, df_clf_plot['owned_by'])
plt.xlabel("")
plt.ylabel("")
plt.xlim(xmin=0, xmax=1.0)

fig.suptitle("Figure 3 / distribution of selected classification metrics by class label",
             fontsize=18, x=0.26, y=1.025)

plt.gca().invert_yaxis()

plt.show()