Libraries

In [1]:
import pandas as pd

Import file paths

In [2]:
import sys
sys.path.append('..')

import paths

Read Raw Data

In [3]:
annotations = pd.read_pickle(paths.filename_data)

#Get 3-class labels
dict3 = {}
dict3['0'] = '0'
dict3['E'] = 'IE'
dict3['S'] = 'IS'
annotations = annotations.replace({"label_3": dict3})
annotations['label'] = annotations['label_3']

Define Model

In [4]:
# ================================
#model specifics
model_specifics = {"data": 'Reddit',
    "global_embedding_tp": 'SBERT', #options: SBERT, BERT_cls , BERT_mean, BERT_max
    "dimensionality_reduction_tp": None, #options: ppapca, ppapcappa, umap
    "dimensionality_reduction_components": None, # options: any int number between 1 and embedding dimensions
    "dimensionality_reduction": False, #options: True, False
    "time_injection_post_tp": None, #options: timestamp, None
    "post_embedding_tp": None, #options: sentence, reduced, None
    "history_len": 29, #options: greater than 1
    "pad_with": 123, #options: any integer
    "loss_function": 'focal', #options: focal, cbfocal
    "classifier_name": 'BiLSTM-SBERT-hist', #any string name
    }

Post Embeddings

In [5]:
from utils.embeddings import Representations
rep = Representations(type = model_specifics['global_embedding_tp'], filename =paths.filename_sbert)
embeddings_sentence = rep.get_embeddings()

print(embeddings_sentence.shape)

(6195, 384)


Concatenation with dataset

In [6]:
from utils.dataset import get_modeling_dataframe
df = get_modeling_dataframe(annotations, embeddings_sentence, embeddings_reduced=[])

Data Preparation

In [7]:
from utils.preparedata import PrepareData

getdata = PrepareData(model_specifics, time_column = None, zero_padding=True, w_last=True)
df, df_padded = getdata.pad(df)
x_data = getdata.unit_input(df, df_padded, embeddings_lastdim=True)
print(x_data.shape)

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([6195, 29, 384])


Labels

In [8]:
from utils.classification_utils import Splits

NUM_folds = 1
splits = Splits(num_folds=NUM_folds)
y_data = splits.get_labels(df)

Model Parameters

In [9]:
# ================================
#TRAINING PARAMETERS
num_epochs = 100
learning_rate =  [0.001] 
gamma = [2] 
beta = None
BATCH_SIZE = [32]
NUM_folds = 1
patience = 3
loss = 'focal'
RANDOM_SEED_list = [0, 1, 12, 123, 1234]
# ================================
# MODEL PARAMETERS
embedding_dim = embeddings_sentence.shape[1] #384
hidden_dim_lstm1 = [128]  
hidden_dim_lstm2 = 124 
output_dim = y_data.unique().size()[0] #3
dropout_rate= [0.5]
pad_with = model_specifics['pad_with']
# ================================
# MODEL OPTIONS
save_results = False

if (model_specifics['dimensionality_reduction'] == True):
    model_code_name = model_specifics["data"] + "_" + model_specifics["global_embedding_tp"]  \
    + "_" + str(model_specifics['dimensionality_reduction_tp']) + str(model_specifics['dimensionality_reduction_components']) \
    + "_" + str(model_specifics['time_injection_post_tp']) \
    + "_" + str(model_specifics['post_embedding_tp'])  \
    + "_" + str(model_specifics['classifier_name'])
else:
    model_code_name = model_specifics["data"] + "_" + model_specifics["global_embedding_tp"]  \
    + "_" + str(model_specifics['time_injection_post_tp']) \
    + "_" + str(model_specifics['post_embedding_tp'])  \
    + "_" + str(model_specifics['classifier_name'])

print(model_code_name)

Reddit_SBERT_None_None_BiLSTM-SBERT-hist


Training and Hyperparameter Tuning

In [10]:
import math
import torch
from datetime import date
from utils.classification_utils import set_seed, validation, training, testing
from utils.loss_functions import FocalLoss, ClassBalanced_FocalLoss
from models.bilstm_stacked import BiLSTM_stacked

#Tuning over folds and random seeds
ft_i = 0 #run number
for lr in learning_rate:
    for hid_d_lstm1 in hidden_dim_lstm1:
        for dp in dropout_rate:
            for b_size in BATCH_SIZE:
                for g in gamma:
                        #tuning parameters number
                        str_version = 'tuning' + str(ft_i)
                        ft_i+=1

                        #dictionary of model parameters
                        classifier_params = {
                            "num_epochs": num_epochs,
                            "learning_rate": lr,
                            "gamma": g,
                            "beta": beta,
                            "BATCH_SIZE": b_size,
                            "NUM_folds": NUM_folds,
                            "patience": patience,
                            "loss": loss,
                            "RANDOM_SEED_list": RANDOM_SEED_list,
                            "embedding_dimensions": embedding_dim,
                            "hidden_dim_lstm1": hid_d_lstm1,
                            "hidden_dim_lstm2" : hidden_dim_lstm2,
                            "output_dim": output_dim,
                            "dropout_rate": dp,
                            "pad_with": pad_with
                        }
                                        
                        for my_ran_seed in RANDOM_SEED_list:
                            set_seed(my_ran_seed)
                            myGenerator = torch.Generator()
                            myGenerator.manual_seed(my_ran_seed)    
                            for test_fold in range(NUM_folds):

                                print('Starting random seed #',my_ran_seed)
                                #get ith-fold data
                                x_test, y_test, x_valid, y_valid, x_train , y_train, test_tl_ids, test_pids = Splits.get_reddit_splits(df, x_data, y_data)

                                #data loaders with batches
                                train = torch.utils.data.TensorDataset( x_train, y_train)
                                valid = torch.utils.data.TensorDataset( x_valid, y_valid)
                                test = torch.utils.data.TensorDataset( x_test, y_test)

                                train_loader = torch.utils.data.DataLoader(dataset=train, batch_size = b_size, shuffle = True)
                                valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size = b_size, shuffle = True)
                                test_loader = torch.utils.data.DataLoader(dataset=test, batch_size = b_size, shuffle = True)

                                #early stopping params
                                last_metric = 0
                                trigger_times = 0
                                best_metric = 0

                                #model
                                model = BiLSTM_stacked(embedding_dim,
                                        hid_d_lstm1,
                                        hidden_dim_lstm2, 
                                        output_dim, 
                                        dp,
                                        pad_with = pad_with)

                                #loss function
                                if (loss=='focal') :
                                    alpha_values = torch.Tensor([math.sqrt(1/(y_train[y_train==0].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==1].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==2].shape[0]/y_train.shape[0]))])
                                    criterion = FocalLoss(gamma = g, alpha = alpha_values)
                                elif (loss == 'cbfocal'):
                                    classifier_params["beta"] = beta
                                    samples_count = torch.Tensor([y_train[y_train==0].shape[0], y_train[y_train==1].shape[0], y_train[y_train==2].shape[0]])
                                    criterion = ClassBalanced_FocalLoss(gamma = g, beta = beta, no_of_classes=3, samples_per_cls=samples_count)   
                                elif (loss == 'cross_entropy'):
                                    criterion = nn.CrossEntropyLoss()                            
                                optimizer = torch.optim.Adam(model.parameters(), lr=lr)

                                #model train/validation per epoch
                                for epoch in range(num_epochs):

                                    training(model, train_loader, criterion, optimizer, epoch, num_epochs)

                                    # Early stopping
                                    _ , f1_v, labels_val, predicted_val = validation(model, valid_loader, criterion, loss)

                                    print('Current Macro F1:', f1_v)

                                    if f1_v > best_metric :
                                        best_metric = f1_v

                                        #test and save so far best model
                                        predicted_test, labels_test = testing(model, test_loader, loss)

                                        results = {
                                            "model_code_name": model_code_name, 
                                            "model_specifics": model_specifics, 
                                            "classifier_params": classifier_params, 
                                            "date_run": date.today().strftime("%d/%m/%Y"),
                                            "test_tl_ids": test_tl_ids,
                                            "test_pids": test_pids,
                                            "labels": labels_test,
                                            "predictions": predicted_test,
                                            "labels_val": labels_val,
                                            "predicted_val": predicted_val,
                                            "test_fold": test_fold,
                                            "random_seed": my_ran_seed,
                                            "epoch": epoch,
                                        }

                                        if (save_results==True):
                                            file_name_results = paths.FOLDER_results + model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str_version + '.pkl'
                                            file_name_model = paths.FOLDER_models + model_code_name + "_" + str(my_ran_seed) + "seed"  + "_" + str_version +'.pkl'
                                            pickle.dump(results, open(file_name_results, 'wb'))
                                            torch.save(model.state_dict(), file_name_model)

                                    if f1_v < last_metric:
                                        trigger_times += 1
                                        print('Trigger Times:', trigger_times)

                                        if trigger_times >= patience:
                                            print('Early stopping!')
                                            break

                                    else:
                                        print('Trigger Times: 0')
                                        trigger_times = 0

                                    last_metric = f1_v
        

Starting random seed # 0
[0/100, 0/122] loss: 0.78892422
[0/100, 100/122] loss: 0.42969629
Current Macro F1: 67.78657789184103
Trigger Times: 0
[1/100, 0/122] loss: 0.28486356
[1/100, 100/122] loss: 0.33344704
Current Macro F1: 70.48932386696163
Trigger Times: 0
[2/100, 0/122] loss: 0.24133493
[2/100, 100/122] loss: 0.29525751
Current Macro F1: 66.2118247049754
Trigger Times: 1
[3/100, 0/122] loss: 0.44636589
[3/100, 100/122] loss: 0.29658687
Current Macro F1: 67.96781361433807
Trigger Times: 0
[4/100, 0/122] loss: 0.32716483
[4/100, 100/122] loss: 0.15448542
Current Macro F1: 63.92250855543299
Trigger Times: 1
[5/100, 0/122] loss: 0.28753424
[5/100, 100/122] loss: 0.33693406
Current Macro F1: 65.89493245501716
Trigger Times: 0
[6/100, 0/122] loss: 0.19573054
[6/100, 100/122] loss: 0.14748468
Current Macro F1: 64.72607851918197
Trigger Times: 1
[7/100, 0/122] loss: 0.2420492
[7/100, 100/122] loss: 0.058346462
Current Macro F1: 64.27346842254518
Trigger Times: 2
[8/100, 0/122] loss: 0.1