In [None]:
import pandas 
from torch.utils.data import DataLoader
import torch
import warnings
import os 

warnings.filterwarnings("ignore")


from catalyst.utils import set_global_seed, prepare_cudnn
from catalyst.dl import AccuracyCallback, OptimizerCallback, CheckpointCallback
from catalyst.dl import SupervisedRunner

In [None]:
import sys

# Make the parent directory importable so that project-local packages
# (e.g. `text_classification`) resolve. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

Defining constants

In [None]:
# Tunable notebook-wide settings. Every constant read by later cells is
# defined here so that "Restart Kernel -> Run All" does not hit a NameError.

# Forwarded as `max_sequence_length` to the text datasets.
MAX_SEQUENCE_LENGTH = 512
# Forwarded as `model_name` to the text datasets.
MODEL_NAME = "distilbert-base-uncased"
# Passed to `set_global_seed`; must be an integer (the previous value 0.10
# was a float, which seeding functions do not accept).
SEED = 10
# Passed to `OptimizerCallback(accumulation_steps=...)`.
ACCUM_STEPS = 1
# Passed to `runner.train(num_epochs=...)`.
NUM_EPOCHS = 3
# Batch size of the inference DataLoader.
BATCH_SIZE = 16
# Passed to `runner.train(fp16=...)`; None leaves mixed precision disabled.
FP16_PARAMS = None
# Backward-compatible alias for the previously misspelled constant name.
F16_PARAMS = FP16_PARAMS

In [None]:
from text_classification import text

Loading datasets

In [None]:
# Read the three pre-processed splits, in order: training, validation, testing.
training_set, validation_set, testing_set = map(
    pandas.read_csv,
    (
        "../data/processed_data/training_set.csv",
        "../data/processed_data/validation_set.csv",
        "../data/processed_data/testing_set.csv",
    ),
)

In [None]:
# Detach the ground-truth labels from the testing frame: `pop` returns the
# 'category' column and removes it from `testing_set` in place.
test_true = testing_set.pop('category')

Constructing text datasets

In [None]:
# NOTE: this cell rebinds `training_set` / `validation_set` / `testing_set`
# from DataFrames to datasets, so the loading cells must be re-run before
# this cell is executed a second time.

# Training dataset. No label mapping is supplied here; the resulting
# `training_set.label_dict` is reused below for the other two splits.
# (presumably the dataset derives the mapping itself when `label_dict` is
# None - confirm in text.TextClassificationDataset)
training_set = text.TextClassificationDataset(
    labels=training_set['category'].values.tolist(),
    texts=training_set['text'].values.tolist(),
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    model_name=MODEL_NAME,
    label_dict=None,
)

# Validation dataset, encoded with the training label mapping.
validation_set = text.TextClassificationDataset(
    labels=validation_set['category'].values.tolist(),
    texts=validation_set['text'].values.tolist(),
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    model_name=MODEL_NAME,
    label_dict=training_set.label_dict,
)

# Testing dataset. The 'category' column was already removed from the
# testing frame, so the labels come from `test_true`; the training label
# mapping keeps class indices consistent across all three splits.
testing_set = text.TextClassificationDataset(
    labels=test_true.values.tolist(),
    texts=testing_set['text'].values.tolist(),
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    model_name=MODEL_NAME,
    label_dict=training_set.label_dict,
)

Setting up data loaders

In [None]:
# Loaders handed to the runner. Only the training loader is shuffled;
# evaluation loaders keep the dataset order so runs are repeatable and
# per-sample outputs stay aligned with the dataset.
data_loaders = {
    'train': DataLoader(dataset=training_set, shuffle=True),
    'valid': DataLoader(dataset=validation_set, shuffle=False),
    'test': DataLoader(dataset=testing_set, shuffle=False),
}

Initializing the BERT text classifier

In [None]:
# Number of target classes, taken from the label mapping of the training
# dataset. `training_set` is a TextClassificationDataset at this point (not a
# DataFrame), and the classifier / accuracy callback need an integer count
# rather than an array of unique values.
NUM_CLASSES = len(training_set.label_dict)

# Learning rate handed to the Adam optimizer.
# NOTE(review): 0.05 looks very high for fine-tuning a pretrained transformer
# - confirm this value is intended.
LEARNING_RATE = 0.05

In [None]:
# Build the classification model with one output per target class.
classifier = text.BertTextClassifier(
    num_classes=NUM_CLASSES,
)

In [None]:
# Loss function for the classification objective.
criterion = torch.nn.CrossEntropyLoss()

# Adam over all classifier parameters.
optimizer = torch.optim.Adam(
    params=classifier.parameters(),
    lr=LEARNING_RATE,
)

# Plateau-based learning-rate scheduler attached to the optimizer above.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)

Preparing CUDA

In [None]:
# Restrict the process to the first GPU.
os.environ.update({'CUDA_VISIBLE_DEVICES': "0"})

# Seed the random number generators and request deterministic cuDNN.
set_global_seed(seed=SEED)
prepare_cudnn(deterministic=True)

Training model using catalyst

In [None]:
%%time 
runner = SupervisedRunner(
    input_key=(
        "features",
        "attention_mask"
    )
)

runner.train(
    model=classifier,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=data_loaders,
    callbacks=[
        AccuracyCallback(num_classes=NUM_CLASSES),
#       F1ScoreCallback(activation='Softmax'), # Tried it, but got an error on tensor shape
        OptimizerCallback(accumulation_steps=ACCUM_STEPS)
    ],
    fp16=FP16_PARAMS,
    logdir="../nn_log"
    num_epochs=NUM_EPOCHS,
    verbose=True
)

Evaluating neural network performance on the testing set

In [None]:
# InferCallback is used below but was never imported.
from catalyst.dl import InferCallback

# Unshuffled loader so predictions line up row-by-row with `test_true`.
test_loaders = {
    "test": DataLoader(
        dataset=testing_set,
        batch_size=BATCH_SIZE,
        shuffle=False,
    )
}

# Restore the best checkpoint written during training and collect the model
# outputs for the testing set. Training logs to "../nn_log", so the
# checkpoint is looked up under that directory.
runner.infer(
    model=classifier,
    loaders=test_loaders,
    callbacks=[
        CheckpointCallback(
            resume="../nn_log/checkpoints/best.pth"
        ),
        InferCallback(),
    ],
    verbose=True,
)

Evaluating Neural Network results

In [None]:
# Raw per-class scores collected during inference.
# NOTE(review): assumes index 0 of the runner's callbacks is the
# InferCallback - confirm against the installed catalyst version.
probs = runner.state.callbacks[0].predictions['logits']

# Invert the training label mapping so predicted class indices can be
# translated back into category names.
# (assumes `label_dict` maps category name -> class index - confirm in
# text.TextClassificationDataset)
index_to_label = {
    index: label for label, index in training_set.label_dict.items()
}

# Predicted class index per test row (highest score), then the matching
# category name.
test_pred = pandas.DataFrame(
    {
        'label': probs.argmax(axis=1)
    }
)
test_pred['label'] = test_pred['label'].map(index_to_label)

Visualizing the confusion matrix as a heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Fixed, sorted class order shared by both axes of the matrix.
class_names = sorted(test_true.unique())

# `cmtx` was previously never defined. sklearn places the true labels on the
# rows; the transpose puts predictions on the rows instead, which matches the
# axis labels set below (x = target, y = predicted).
cmtx = confusion_matrix(
    y_true=test_true,
    y_pred=test_pred['label'],
    labels=class_names,
).T

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cmtx,
    annot=True,
    fmt="d",  # plain integer counts instead of scientific notation
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Target")
ax.set_ylabel("Predicted")
ax.set_title('Testing set - Confusion Matrix');

Evaluating Testing predictions using evaluation metrics

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

# The metric functions expect a 1-D sequence of labels, so pass the 'label'
# column rather than the whole single-column DataFrame.
predicted_labels = test_pred['label']

recall = recall_score(y_true=test_true, y_pred=predicted_labels, average='weighted')
accuracy = accuracy_score(y_true=test_true, y_pred=predicted_labels)
f1 = f1_score(y_true=test_true, y_pred=predicted_labels, average='weighted')

In [None]:
# Report each test-set metric as "<caption> <value>".
for caption, score in (
    ('average weighted recall: ', recall),
    ('accuracy score: ', accuracy),
    ('f1 score: ', f1),
):
    print(caption, score)