In [None]:
from collections import Counter

import emoji
import ktrain
import numpy as np
import pandas as pd
import seaborn as sns
from eli5.lime import TextExplainer
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from twitterFunctions.cross_validation import run_SVM_CV
from twitterFunctions.performanceMetrics import get_performance
from twitterFunctions.processing import (
    fix_emotes,
    process_token_fin,
    process_tweet,
)
from twitterFunctions.training import predict_test, set_seeds, train_learner


# Reset matplotlib's rcParams to the library defaults before any figure is drawn.
plt.rcdefaults()


# Seed passed below to set_seeds(...) and to the ELI5 TextExplainer.
SEED = 1

In [None]:
# Path of the tab-separated file with the training posts.
file = "./data/training_posts20201201_main_categories.tsv"
# pd.read_table parses tab-delimited text by default.
df = pd.read_table(file)

In [None]:
# Keep only the five columns used below and give the label and text columns
# the shorter names "class" and "text".
colNames = ["notserious_unclear", "focus", "type", "class", "text"]
df = df[["notserious_unclear", "focus", "type", "main_category", "contents"]].rename(
    columns={"main_category": "class", "contents": "text"}
)

In [None]:
# Plot how often each class occurs in the data.
freq_combined = Counter(df["class"].values)
# A Counter keeps insertion order, so keys() and values() line up pairwise.
objects = list(freq_combined.keys())
values = list(freq_combined.values())

y_pos = np.arange(len(objects))

plt.bar(y_pos, values, align='center', alpha=0.5)
# Set tick positions, labels and rotation in a single call.
plt.xticks(y_pos, objects, rotation=90)
plt.ylabel('Frequency ')
plt.title('Class distribution tweets')
# Render the figure explicitly; ending the cell on the bare `plt` module only
# echoed the module repr as the cell output.
plt.show()

In [None]:
# Preprocessing helpers come from twitterFunctions/processing.py.
# Each of the steps below passes over the entire dataset and they have to run
# sequentially; if this takes too long, merging them into a single function
# reduces the runtime significantly.
temp = df["text"].apply(process_tweet)
# emoji mapping (emoji.emojize), followed by fix_emotes
emojis = temp.apply(emoji.emojize).apply(fix_emotes)
cleaned_text = emojis
X_train_str = process_token_fin(cleaned_text)

In [None]:
# Train / validation / test split: first hold out 20% as the test set, then
# take 20% of the remainder as the validation set.
# Stratifying on the labels ensures the subsets have a class distribution
# similar to the original data. random_state uses the notebook-wide SEED so
# the split is driven by the same constant as the other seeded steps.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_str, df["class"], test_size=0.2, random_state=SEED, stratify=df["class"])

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED, stratify=y_train)

# Unordered set of the labels present in the training split.
class_names = set(y_train.values)

In [None]:
# Visual check of sequence lengths: number of space-separated tokens per training text.
ad = [len(text.split(" ")) for text in X_train]

_ = plt.hist(ad, bins='auto')
plt.title("Train Set: sequence length histogram")
plt.xlabel("# of tokens")
plt.ylabel("frequency")
plt.show()

# Majority Classifier

In [None]:
# Baseline: predict the most frequent training label for every sample.
majority_class = Counter(y_train).most_common(1)[0][0]
y_val_predicted = np.full(len(y_val), majority_class)
y_test_predicted = np.full(len(y_test), majority_class)

In [None]:
# Score the majority baseline on the validation and test sets. The label list
# holds the classes seen in training, in the order Counter first encountered them.
baseline_labels = list(Counter(y_train))
validation_results = get_performance(y_val, y_val_predicted, baseline_labels)
test_results = get_performance(y_test, y_test_predicted, baseline_labels)

# TFIDF

In [None]:
# Baseline model from run_SVM_CV, fitted with the train and validation splits
# and then applied to the test texts.
print("Validation Performance: \n")
model = run_SVM_CV(X_train, X_val, y_train, y_val)
pred = model.predict(X_test)
print("Test Performance: \n")
# class_names is a set, and the iteration order of a set of strings changes
# between interpreter sessions (hash randomisation). Sorting gives
# get_performance a stable label order, in line with the sorted
# np.unique(y_train) used for the transformer model further down.
mat = get_performance(y_test.values, pred, sorted(class_names))

# Training BERT / XLNET model
Set the *model_name* flag in the *train_learner* function to the model you want to train. For BERT use **bert-base-uncased**, for XLNET use **xlnet-base-cased**.

In [None]:
# Train the transformer on the training split, with the validation split
# passed alongside. seed uses the notebook-wide SEED constant so training is
# driven by the same value as the set_seeds(SEED) calls made before prediction.
original_learner = train_learner(
    X_train, y_train.values,
    X_val, y_val.values,
    lr=3e-5, epoch=5, seed=SEED, text_length=80,  # parameters for the training
    #   checkpoint_folder="D:/models/test/",   # add the path where the checkpoints should be saved
    model_name="bert-base-uncased"  # or "xlnet-base-cased"
)

## After the training has finished, you will see the training statistics. I usually choose the epoch with the highest validation accuracy (if two epochs perform similarly, I choose the one with the lower validation loss).

If you used a checkpoint folder, you can load the model with the best performance rather than the last epoch. E.g. if you trained for 5 epochs, the default behaviour is that the model will use the last run, i.e. epoch 5. But if the performance of epoch 5 is worse than epoch 3, you can manually load the weights of epoch 3. See below:

In [None]:
# Optional: to evaluate a specific checkpoint instead of the last epoch, load
# its weights first (file name pattern: weights-*BEST_EPOCH*.hdf5), e.g.
# original_learner[4].load_weights("D:/models/test/weights-01.hdf5") # your path + weights-01.hdf5
# Wrap the model at original_learner[4] in a new ktrain learner, reusing the
# train and validation data stored at original_learner[2] and original_learner[3].
learner_reloaded = ktrain.get_learner(
    original_learner[4], train_data=original_learner[2], val_data=original_learner[3], batch_size=2)

# Short aliases for the pieces handed to predict_test below.
# NOTE(review): original_learner[1] is used as a preprocessor (it exposes
# preprocess_test) — confirm the tuple layout against train_learner.
model_ = learner_reloaded
t_ = original_learner[1]
trn_ = original_learner[2]
# setting seeds (redundantly) before predicting validation set
set_seeds(SEED)
# Sorted unique training labels, used as the label order for the reports below.
classNames2 = np.unique(y_train)
# Usually this is only needed if you are interested in inter-class statistics
# of the validation set.
# PREDICT ON VALIDATION SET
pred = predict_test(X_val, model_,
                    t=t_,
                    trn=trn_)
# pred is indexed as pred[0] (predictions) and pred[1] (predictor object).
predictor = pred[1]
val = t_.preprocess_test(X_val, y_val.values)
model_.validate(val_data=val)
mat = get_performance(y_val.values, pred[0], classNames2)

# Predict the TEST set; this step must always be run. It overwrites pred,
# predictor and mat, so later cells see the test-set results.
set_seeds(SEED)
pred = predict_test(X_test, model_,
                    t=t_,
                    trn=trn_)
predictor = pred[1]
test = t_.preprocess_test(X_test, y_test.values)
model_.validate(val_data=test, class_names=list(classNames2))
mat = get_performance(y_test.values, pred[0], classNames2)

## Saving a ktrain model

If the test performance is good and you want to save the actual model (not only the training weights), call **predictor.save("yourPath")**.

In [None]:
# predictor.save('D:/models/Twitter_6_final')

In [None]:
# Confusion matrix for the test set and its predictions, drawn as a heatmap.

# Elsewhere in this notebook pred is indexed as pred[0] (predicted labels) and
# pred[1] (predictor), so the labels, not the whole pair, go to get_performance.
mat = get_performance(y_test.values, pred[0], classNames2)

labels = y_test.values
capitalized = [i.capitalize() for i in classNames2]
index = columns = capitalized
# Every cell is divided by the grand total of mat, so the percentages are
# shares of all counted samples rather than per-class rates.
# NOTE(review): assumes get_performance returns the confusion matrix — confirm.
cm_df = pd.DataFrame(mat / np.sum(mat), index=index, columns=columns)
plt.figure(figsize=(10, 8.2))
sns.heatmap(cm_df, annot=True, fmt=".0%")
plt.yticks(rotation=0, fontsize=12)
plt.xlabel("Predicted class", fontsize=20)
plt.ylabel("Original class", fontsize=20)
# Report the actual test-set size instead of a hard-coded sample count.
plt.title(f"Confusion matrix test set, n={len(y_test)}", fontsize=25)
# Render explicitly so the cell does not end on the Text repr of plt.title.
plt.show()

## Playing around with the ELI5 text explainer to make the black-box models a little more explainable

In [None]:
# Put the test texts next to their true and predicted labels.
df_test = pd.DataFrame(
    dict(text=X_test, true_label=y_test, predicted_label=pred[0])
)

In [None]:
# Select a specific test sample you want to investigate more closely.
# NOTE(review): X_test[1] is positional only if X_test is a list/array; if it
# is a pandas Series this is a lookup by index label — confirm the type.
doc = X_test[1]

In [None]:
# Fit a text explainer on `doc`; it uses n_samples samples, scored with the
# predictor's predict_proba, to simulate the model's behaviour.
te = TextExplainer(n_samples=20, random_state=SEED)
_ = te.fit(doc, predictor.predict_proba)

In [None]:
# Top-10 weights, with targets taken from the predictions at positions 150-154.
a = te.show_weights(
    top=10,
    targets=pred[0][150:155],
    target_names=predictor.c,
)

In [None]:
# Display the fitted explainer's weights, called with no arguments.
te.show_weights()