Libraries

In [1]:
import pandas as pd

Import file paths

In [2]:
import sys
sys.path.append('..')

import paths

Read raw data

In [3]:
annotations = pd.read_pickle(paths.filename_data)

#Get 3-class labels
dict3 = {}
dict3['0'] = '0'
dict3['E'] = 'IE'
dict3['S'] = 'IS'
annotations = annotations.replace({"label_3": dict3})
annotations['label'] = annotations['label_3']

Define Model

In [4]:
# ================================
#model specifics
model_specifics = {"data": 'Reddit',
    "global_embedding_tp": 'SBERT', #options: SBERT, BERT_cls , BERT_mean, BERT_max
    "dimensionality_reduction_tp": 'umap', #options: ppapca, ppapcappa, umap
    "dimensionality_reduction_components": 15, # options: any int number between 1 and embedding dimensions
    "dimensionality_reduction": True, #options: True, False
    "time_injection_post_tp": 'timestamp', #options: timestamp, None
    "signature_dimensions": 3, #options: any int number larger than 1
    "post_embedding_tp": 'sentence', #options: sentence, reduced, None
    "w":5, #integer greater or equal to 2
    "pad_with": 0, #options: any integer
    "loss_function": 'focal', #options: focal, cbfocal
    "classifier_name": 'SWNU-Network', #any string name
    }

Post Embeddings

In [5]:
from utils.embeddings import Representations
rep = Representations(type = model_specifics['global_embedding_tp'], filename =paths.filename_sbert)
embeddings_sentence = rep.get_embeddings()

print(embeddings_sentence.shape)

(6195, 384)


Dimensionality Reduction

In [6]:
from utils.dimensionality_reduction import DimensionalityReduction

reduction = DimensionalityReduction(method= model_specifics['dimensionality_reduction_tp'], components=model_specifics['dimensionality_reduction_components'])
embeddings_reduced = reduction.fit_transform(embeddings_sentence)

print(embeddings_reduced.shape)

  from .autonotebook import tqdm as notebook_tqdm


(6195, 15)


Concatenation with dataset

In [7]:
from utils.dataset import get_modeling_dataframe
df = get_modeling_dataframe(annotations, embeddings_sentence, embeddings_reduced)

Time features

In [8]:
from utils.timeinjection import TimeFeatures
tf = TimeFeatures()
df = tf.get_time_features(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timeline_index'][first_index:last_index] = np.arange(t_id_len)


Data Preparation

In [9]:
from utils.preparedata import PrepareData

getdata = PrepareData(model_specifics, time_column = 'time_encoding', zero_padding=True, w_last=True)
df, df_padded = getdata.pad(df)
x_data = getdata.unit_input(df, df_padded)
print(x_data.shape)

torch.Size([6195, 400, 5])


Labels

In [10]:
from utils.classification_utils import Splits

NUM_folds = 1
splits = Splits(num_folds=NUM_folds)
y_data = splits.get_labels(df)

Model Parameters

In [11]:
# ================================
#TRAINING PARAMETERS
num_epochs = 100
learning_rate =  [0.0003] 
gamma = [3] 
beta = None
BATCH_SIZE = 64
NUM_folds = 1
patience = 3
loss = 'focal'
weight_decay_adam = 0.0001
RANDOM_SEED_list = [0, 1, 12, 123, 1234]
# ================================
# MODEL PARAMETERS
input_channels = model_specifics['dimensionality_reduction_components'] #15
output_channels = [10]
sig_d = 3
hidden_dim_lstm = [[10]]
embedding_dim = embeddings_sentence.shape[1] #384
hidden_dim= [64]
output_dim = y_data.unique().size()[0] #3
dropout_rate= [0.1]
augmentation_tp =  'Conv1d'
augmentation_layers = ()
comb_method ='concatenation'
attention = False 
# ================================
# MODEL OPTIONS
save_results = False

if (model_specifics['dimensionality_reduction'] == True):
    model_code_name = model_specifics["data"] + "_" + model_specifics["global_embedding_tp"]  \
    + "_" + str(model_specifics['dimensionality_reduction_tp']) + str(model_specifics['dimensionality_reduction_components']) \
    + "_" + str(model_specifics['time_injection_post_tp']) \
    + "_" + str(model_specifics['post_embedding_tp'])  \
    + "_" + str(model_specifics['classifier_name'])
else:
    model_code_name = model_specifics["data"] + "_" + model_specifics["global_embedding_tp"]  \
    + "_" + str(model_specifics['time_injection_post_tp']) \
    + "_" + str(model_specifics['post_embedding_tp'])  \
    + "_" + str(model_specifics['classifier_name'])

print(model_code_name)

Reddit_SBERT_umap15_timestamp_sentence_SWNU-Network


In [12]:
import math
import torch
from datetime import date
from utils.classification_utils import set_seed, validation, training, testing
from utils.loss_functions import FocalLoss, ClassBalanced_FocalLoss
from models.swnunet import SWNUNet

#Tuning over folds and random seeds
ft_i = 0 #run number
for out_ch in output_channels:
    for lr in learning_rate:
        for g in gamma:
            for dp in dropout_rate:
                for h_dim in hidden_dim:
                    for lstm_dim in hidden_dim_lstm:
                        #tuning parameters number
                        str_version = 'tuning' + str(ft_i)
                        ft_i+=1

                        #dictionary of model parameters
                        classifier_params = {
                            "num_epochs": num_epochs,
                            "learning_rate": lr,
                            "gamma": g,
                            "beta": beta,
                            "BATCH_SIZE": BATCH_SIZE,
                            "NUM_folds": NUM_folds,
                            "patience": patience,
                            "loss": loss,
                            "weight_decay_adam": weight_decay_adam,
                            "RANDOM_SEED_list": RANDOM_SEED_list,
                            "input_channels": input_channels,
                            "output_channels": out_ch,
                            "sig_d": sig_d,
                            "hidden_dim_lstm": lstm_dim,
                            "embedding_dimensions": embedding_dim,
                            "hidden_dim": h_dim,
                            "output_dim": output_dim,
                            "dropout_rate": dp,
                            "augmentation_tp": augmentation_tp,
                            "augmentation_layers": augmentation_layers,
                            "combination_method": comb_method,
                        }
                                        
                        for my_ran_seed in RANDOM_SEED_list:
                            set_seed(my_ran_seed)
                            myGenerator = torch.Generator()
                            myGenerator.manual_seed(my_ran_seed)    
                            for test_fold in range(NUM_folds):

                                print('Starting random seed #',my_ran_seed)
                                #get ith-fold data
                                x_test, y_test, x_valid, y_valid, x_train , y_train, test_tl_ids, test_pids = Splits.get_reddit_splits(df, x_data, y_data)

                                #data loaders with batches
                                train = torch.utils.data.TensorDataset( x_train, y_train)
                                valid = torch.utils.data.TensorDataset( x_valid, y_valid)
                                test = torch.utils.data.TensorDataset( x_test, y_test)

                                train_loader = torch.utils.data.DataLoader(dataset=train, batch_size = BATCH_SIZE, shuffle = True)
                                valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size = BATCH_SIZE, shuffle = True)
                                test_loader = torch.utils.data.DataLoader(dataset=test, batch_size = BATCH_SIZE, shuffle = True)

                                #early stopping params
                                last_metric = 0
                                trigger_times = 0
                                best_metric = 0

                                #model
                                model = SWNUNet(input_channels, 
                                            out_ch, 
                                            sig_d, 
                                            lstm_dim,
                                            embedding_dim,
                                            h_dim,
                                            output_dim,
                                            dp, 
                                            augmentation_tp,
                                            augmentation_layers,
                                            comb_method)

                                #loss function
                                if (loss=='focal') :
                                    alpha_values = torch.Tensor([math.sqrt(1/(y_train[y_train==0].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==1].shape[0]/y_train.shape[0])), math.sqrt(1/(y_train[y_train==2].shape[0]/y_train.shape[0]))])
                                    criterion = FocalLoss(gamma = g, alpha = alpha_values)
                                elif (loss == 'cbfocal'):
                                    classifier_params["beta"] = beta
                                    samples_count = torch.Tensor([y_train[y_train==0].shape[0], y_train[y_train==1].shape[0], y_train[y_train==2].shape[0]])
                                    criterion = ClassBalanced_FocalLoss(gamma = g, beta = beta, no_of_classes=3, samples_per_cls=samples_count)   
                                elif (loss == 'cross_entropy'):
                                    criterion = nn.CrossEntropyLoss()                            
                                optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay= weight_decay_adam)

                                #model train/validation per epoch
                                for epoch in range(num_epochs):

                                    training(model, train_loader, criterion, optimizer, epoch, num_epochs)

                                    # Early stopping
                                    _ , f1_v, labels_val, predicted_val = validation(model, valid_loader, criterion, loss)

                                    print('Current Macro F1:', f1_v)

                                    if f1_v > best_metric :
                                        best_metric = f1_v

                                        #test and save so far best model
                                        predicted_test, labels_test = testing(model, test_loader, loss)

                                        results = {
                                            "model_code_name": model_code_name, 
                                            "model_specifics": model_specifics, 
                                            "classifier_params": classifier_params, 
                                            "date_run": date.today().strftime("%d/%m/%Y"),
                                            "test_tl_ids": test_tl_ids,
                                            "test_pids": test_pids,
                                            "labels": labels_test,
                                            "predictions": predicted_test,
                                            "labels_val": labels_val,
                                            "predicted_val": predicted_val,
                                            "test_fold": test_fold,
                                            "random_seed": my_ran_seed,
                                            "epoch": epoch,
                                        }

                                        if (save_results==True):
                                            file_name_results = paths.FOLDER_results + model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str_version + '.pkl'
                                            file_name_model = paths.FOLDER_models + model_code_name + "_" + str(my_ran_seed) + "seed"  + "_" + str_version +'.pkl'
                                            pickle.dump(results, open(file_name_results, 'wb'))
                                            torch.save(model.state_dict(), file_name_model)

                                    if f1_v < last_metric:
                                        trigger_times += 1
                                        print('Trigger Times:', trigger_times)

                                        if trigger_times >= patience:
                                            print('Early stopping!')
                                            break

                                    else:
                                        print('Trigger Times: 0')
                                        trigger_times = 0

                                    last_metric = f1_v
        

Starting random seed # 0
[0/100, 0/61] loss: 0.56981885
Current Macro F1: 38.29265684032827
Trigger Times: 0
[1/100, 0/61] loss: 0.33743268
Current Macro F1: 50.285628999458964
Trigger Times: 0
[2/100, 0/61] loss: 0.33874553
Current Macro F1: 48.85506487817117
Trigger Times: 1
[3/100, 0/61] loss: 0.27603671
Current Macro F1: 49.71341658661543
Trigger Times: 0
[4/100, 0/61] loss: 0.23250455
Current Macro F1: 50.658869395711505
Trigger Times: 0
[5/100, 0/61] loss: 0.22305843
Current Macro F1: 50.352310432231505
Trigger Times: 1
[6/100, 0/61] loss: 0.22916307
Current Macro F1: 59.86108309572592
Trigger Times: 0
[7/100, 0/61] loss: 0.23767121
Current Macro F1: 65.47869314028615
Trigger Times: 0
[8/100, 0/61] loss: 0.18360086
Current Macro F1: 65.22610484589991
Trigger Times: 1
[9/100, 0/61] loss: 0.20629813
Current Macro F1: 67.18706260596913
Trigger Times: 0
[10/100, 0/61] loss: 0.17864227
Current Macro F1: 63.83577269376951
Trigger Times: 1
[11/100, 0/61] loss: 0.20426479
Current Macro F