In [None]:
# turn internet on on kaggle before running this cell
!pip install transformers[torch]
#!pip install accelerate -U  # restart runtime if it still doesn't work
!pip install datasets
!pip install evaluate

# Notebook 

**NB:** 
- This notebook was used for the experiments during the project and was run on Kaggle for computational reasons (GPU available).
- The code for this project has also been run on a cluster with 1 GPU: see the Python files in the `run_to_cluster` directory, which contain all the functions of this notebook in a cleanly structured form.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import torch
from tqdm import tqdm
from pandas.api.types import CategoricalDtype

import evaluate
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from datasets import load_metric
from imblearn.under_sampling import RandomUnderSampler

import warnings
# Silence every Python warning for the rest of the notebook
# (switch 'ignore' to 'default' to see them again).
warnings.filterwarnings('ignore') # parameters : default or ignore

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load datasets

In [None]:
# Load the English sentence-level splits (train / val / test) from the
# `cered-dataset` input directory. The root is defined once so the three
# paths cannot drift apart.
SENTENCES_EN_DIR = '../input/cered-dataset/data/sentences/en'
sentences_en_tr = pd.read_csv(f'{SENTENCES_EN_DIR}/train/sentences.tsv', sep='\t')
sentences_en_val = pd.read_csv(f'{SENTENCES_EN_DIR}/val/sentences.tsv', sep='\t')
sentences_en_te = pd.read_csv(f'{SENTENCES_EN_DIR}/test/sentences.tsv', sep='\t')

# Report the size of each split (spelling of "Length" fixed in the messages)
print(f'In English\nLength training set : {len(sentences_en_tr)}')
print(f'Length validation set : {len(sentences_en_val)}')
print(f'Length testing set : {len(sentences_en_te)}')

sentences_en_te.head(2)

In [None]:
# Preprocessing: harmonise labels, pool the three splits, drop one class and re-split.

# The test split spells one label 'Difficulties'; rename it to 'Difficulty'
sentences_en_te['y'] = np.where(
    sentences_en_te['y'].eq('Difficulties'),
    'Difficulty',
    sentences_en_te['y'],
)

# Pool all splits, discard the 'Reflection' rows, then shuffle with a fixed seed
merged_df = (
    pd.concat([sentences_en_tr, sentences_en_val, sentences_en_te], ignore_index=True)
    .loc[lambda frame: frame['y'] != 'Reflection']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Hold out 15% of the pooled data as the test set
train_dataset, original_test_dataset = train_test_split(
    merged_df,
    test_size=0.15,
    random_state=42,
)

# Sanity check on the resulting sizes
print("Train set length:", len(train_dataset))
print("Test set length:", len(original_test_dataset))

train_dataset.head(2)

In [None]:
# Fixed, ordered label vocabulary so every histogram uses the same x-axis order
M_catType = CategoricalDtype(
    categories = ['Difficulty', 'Experience', 'Other', 'Feeling', 'Belief', 'Perspective', 'Intention', 'Learning'],
    ordered = True,
)

# One figure per group of frames: the raw splits first, then the re-split data
plot_groups = [
    ([sentences_en_tr, sentences_en_val, sentences_en_te], "Before preprocessing"),
    ([train_dataset, original_test_dataset], "After preprocessing"),
]

for frames, figure_title in plot_groups:
    fig, axs = plt.subplots(1, len(frames), figsize=(15, 5))

    # Plot distribution of labels for each DataFrame
    for i, df in enumerate(frames):
        # NOTE: the 'y' column is cast in place, so the frames keep the
        # categorical dtype after this cell has run.
        df['y'] = df['y'].astype(M_catType)
        sns.histplot(df['y'], ax=axs[i])
        axs[i].set_title(f'Distribution of Labels - DataFrame {i+1}')
        axs[i].tick_params(axis='x',labelrotation = 45)

    plt.suptitle(figure_title)
    plt.tight_layout()
    plt.show()

## Handle class imbalance of test dataset

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Every random step below is seeded so the derived test sets are identical
# from one run of the notebook to the next.
RESAMPLING_SEED = 42

# Down-sampling: shrink every class except the rarest one
under_sampler = RandomUnderSampler(sampling_strategy = 'not minority', random_state=RESAMPLING_SEED)
balanced_test_dataset_down, _ = under_sampler.fit_resample(original_test_dataset, original_test_dataset['y'])

# Up-sampling: resample all classes
over_sampler = RandomOverSampler(sampling_strategy='all', random_state=RESAMPLING_SEED)
balanced_test_dataset_up, _ = over_sampler.fit_resample(original_test_dataset, original_test_dataset['y'])

# Smaller up-sampled variant: keep half of the rows ...
small_balanced_test_dataset_up = balanced_test_dataset_up.sample(frac = 0.5, random_state=RESAMPLING_SEED)
# ... then under sample again to have perfect distribution
under_sampler = RandomUnderSampler(sampling_strategy = 'not minority', random_state=RESAMPLING_SEED)
small_balanced_test_dataset_up, _ = under_sampler.fit_resample(small_balanced_test_dataset_up, small_balanced_test_dataset_up['y'])

# Label distribution of the four variants, side by side on a shared y-axis
test_set_panels = [
    (original_test_dataset, 'Original Test Dataset'),
    (balanced_test_dataset_down, 'Balanced Test Dataset with downsampling'),
    (balanced_test_dataset_up, 'Balanced Test Dataset with upsampling'),
    (small_balanced_test_dataset_up, 'Small Balanced Test Dataset with upsampling'),
]
fig, axs = plt.subplots(1, 4, figsize=(15, 4), sharey = True)
for ax, (frame, panel_title) in zip(axs, test_set_panels):
    sns.histplot(frame['y'], ax=ax)
    ax.set_title(panel_title)
    ax.tick_params(axis='x',labelrotation = 45)
plt.tight_layout()
plt.show()

# define a dictionary for the different test datasets
list_test_datasets = {
    "original_test_dataset": original_test_dataset,
    "balanced_test_dataset_down": balanced_test_dataset_down,
    "balanced_test_dataset_up": balanced_test_dataset_up,
    "small_balanced_test_dataset_up": small_balanced_test_dataset_up,
}

In [None]:
# Report how many rows each candidate test set contains
for dataset_name, dataset in list_test_datasets.items():
    n_rows = len(dataset)
    print(f"Length of {dataset_name}: {n_rows}")

# Define functions

- preprocess_data
- prepare_model
- train
- evaluate
- train_test
- prepare_test_dataset
- preprocess_data_train_test

In [None]:
def preprocess_data(df_sentences_train, df_sentences_test):
    """Tokenise sentences, encode labels and build train/val/test torch datasets.

    Args:
        df_sentences_train: DataFrame with a 'sentence' (text) and a 'y' (label)
            column. It is split 80/20 into training and validation data.
        df_sentences_test: DataFrame with the same two columns, used as test set.

    Returns:
        Tuple ``(train_dataset_pp, val_dataset_pp, test_dataset_pp, label_encoder)``.
        Each dataset yields ``{'text': LongTensor, 'label': LongTensor}`` items.
        ``label_encoder`` is the single fitted ``LabelEncoder`` shared by all
        three datasets; use it to decode predicted class ids.
    """
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    # Preprocess data and labels
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # NOTE(review): the padding length is the largest *whitespace word* count,
    # while `encode` counts tokens (special tokens included). Sentences with
    # more tokens than that are cut by `truncation=True` -- confirm this is intended.
    max_length_train = max(df_sentences_train['sentence'].apply(lambda sentence: len(sentence.split())))
    max_length_test = max(df_sentences_test['sentence'].apply(lambda sentence: len(sentence.split())))

    df_train['text'] = df_sentences_train['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_train))
    df_test['text'] = df_sentences_test['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_test))

    # Fit ONE encoder on the labels of both frames. Two independently fitted
    # encoders only agree when both frames contain exactly the same classes;
    # otherwise the same integer id would mean different labels in train and test.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([df_sentences_train['y'], df_sentences_test['y']], ignore_index=True))
    df_train['label'] = label_encoder.transform(df_sentences_train['y'])
    df_test['label'] = label_encoder.transform(df_sentences_test['y'])

    # Split the data into training and validation sets
    train_data, val_data = train_test_split(df_train, test_size=0.2, random_state=42)
    test_data = df_test
    print(f"Train data : {len(train_data)}")
    print(f"Val data : {len(val_data)}")
    print(f"Test data : {len(test_data)}")

    # Create a custom dataset
    class CustomDataset(Dataset):
        """Map-style dataset over parallel arrays of token ids and label ids."""
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
              'text': torch.tensor(self.text[idx], dtype=torch.long),
              'label': torch.tensor(self.label[idx], dtype=torch.long)
          }

    # `.values` drops the (shuffled) pandas index so items are addressed by position
    train_dataset_pp = CustomDataset(train_data['text'].values, train_data['label'].values)
    val_dataset_pp = CustomDataset(val_data['text'].values, val_data['label'].values)
    test_dataset_pp = CustomDataset(test_data['text'].values, test_data['label'].values)

    return train_dataset_pp, val_dataset_pp, test_dataset_pp, label_encoder

def prepare_model(model, train_dataset_pp, val_dataset_pp, test_dataset_pp, freeze_weights, batch_size, epochs, learning_rate):
    """Build the data loaders, optimizer and LR scheduler for one training run.

    Args:
        model: model to optimise; must expose ``parameters()`` and, when
            ``freeze_weights`` is true, a ``classifier`` sub-module.
        train_dataset_pp, val_dataset_pp, test_dataset_pp: datasets to wrap.
        freeze_weights: when true, only ``model.classifier`` stays trainable.
        batch_size: batch size shared by the three loaders.
        epochs: number of epochs, used to size the scheduler.
        learning_rate: AdamW learning rate.

    Returns:
        ``(train_loader, val_loader, test_loader, optimizer, scheduler)``.
    """
    if freeze_weights:
        # Switch off gradients everywhere, then re-enable them for the classification head
        for weight in model.parameters():
            weight.requires_grad = False
        for head_weight in model.classifier.parameters():
            head_weight.requires_grad = True

    # Only the training loader is shuffled
    train_loader = DataLoader(train_dataset_pp, batch_size=batch_size, shuffle=True)
    val_loader, test_loader = (
        DataLoader(split, batch_size=batch_size)
        for split in (val_dataset_pp, test_dataset_pp)
    )

    # Linear decay without warm-up over all optimisation steps
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_training_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_training_steps,
    )

    return train_loader, val_loader, test_loader, optimizer, scheduler

def train(model, train_loader, epochs, optimizer, scheduler, device, plot_visualization):
    """Fine-tune ``model`` for ``epochs`` epochs and track loss and accuracy.

    Args:
        model: model called as ``model(inputs, labels=labels)``; the result
            must expose ``.loss`` and ``.logits``.
        train_loader: iterable of batches with 'text' and 'label' tensors.
        epochs: number of passes over ``train_loader``.
        optimizer, scheduler: both are stepped once per batch.
        device: device the batches are moved to.
        plot_visualization: when equal to ``True``, plot both curves.

    Returns:
        ``(train_losses, avg_acc_per_epoch)``: mean batch loss and training
        accuracy, one entry per epoch.
    """
    # NOTE(review): no attention_mask is given to the model although the inputs
    # look padded by the preprocessing helpers -- confirm whether one is needed.
    accuracy_metric = load_metric("accuracy")
    train_losses, avg_acc_per_epoch = [], []

    for epoch in range(epochs):
        model.train()
        batch_losses, epoch_preds, epoch_labels = [], [], []

        progress = tqdm(train_loader, position = 0, desc= f"epoch {epoch} running...")
        for batch in progress:
            optimizer.zero_grad()
            inputs = batch['text'].to(device)
            labels = batch['label'].to(device)

            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            batch_losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Keep predictions and references to compute the epoch accuracy
            batch_preds = torch.argmax(outputs.logits, axis=1)
            epoch_preds.extend(batch_preds.cpu().numpy().tolist())
            epoch_labels.extend(labels.tolist())

        train_losses.append(sum(batch_losses) / len(batch_losses))
        epoch_score = accuracy_metric.compute(predictions=epoch_preds, references=epoch_labels)
        avg_acc_per_epoch.append(epoch_score["accuracy"])

    if plot_visualization == True:
        fig, axs = plt.subplots(1,2, figsize = (12,5))
        # (values, curve label, y-axis label, panel title) for each panel
        panels = [
            (train_losses, 'Training Loss', 'Loss', 'Training Loss over Epochs'),
            (avg_acc_per_epoch, 'Training Accuracy', 'Accuracy', 'Training Accuracy over Epochs'),
        ]
        for ax, (values, curve_label, y_label, panel_title) in zip(axs, panels):
            ax.plot(range(epochs), values, label=curve_label)
            ax.set_xlabel('Epoch')
            ax.set_ylabel(y_label)
            ax.set_title(panel_title)
            ax.set_xticks(np.arange(epochs))
            ax.legend()
        plt.show()

    return train_losses, avg_acc_per_epoch

def evaluate(model, test_loader, label_encoder, device, plot_results, accuracy_metric):
    """Run ``model`` over ``test_loader`` and print (optionally plot) the results.

    Args:
        model: model called as ``model(inputs)``; the result must expose ``.logits``.
        test_loader: iterable of batches with 'text' and 'label' tensors.
        label_encoder: object whose ``inverse_transform`` maps class ids to names.
        device: device the batches are moved to.
        plot_results: when truthy, print the classification report and draw the
            confusion matrix and the confidence histograms.
        accuracy_metric: object whose ``compute(predictions=, references=)``
            returns a dict with an "accuracy" entry.

    Returns:
        None. Accuracy, decoded labels, confidences and class names are printed.
    """
    # NOTE: this function shares its name with the `evaluate` module imported at
    # the top of the notebook and hides it once this cell has run.
    model.eval()
    all_preds, all_labels, pred_confidence = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['text'].to(device)
            labels = batch['label'].to(device)
            logits = model(inputs).logits

            all_preds.extend(torch.argmax(logits, axis=1).cpu().numpy().tolist())
            all_labels.extend(labels.tolist())

            # Confidence of a prediction = its highest softmax probability
            top_probabilities = torch.softmax(logits, dim=1).max(dim=1).values
            pred_confidence.extend(top_probabilities.cpu().detach().numpy())

    # compute accuracy
    accuracy = accuracy_metric.compute(predictions=all_preds, references=all_labels)["accuracy"]
    print(f"Accuracy: {np.round(accuracy,3)}")

    # Turn class ids back into label names
    predicted_labels = label_encoder.inverse_transform(all_preds)
    true_labels = label_encoder.inverse_transform(all_labels)

    # Every label seen among the predictions or the references
    unique_labels_union = set(predicted_labels) | set(true_labels)
    print(unique_labels_union)
    # Alphabetical order keeps report and confusion matrix consistent
    class_labels = sorted(unique_labels_union)

    if plot_results:
        print(classification_report(true_labels, predicted_labels, zero_division = 1, target_names=class_labels))

        cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

        fig, (matrix_ax, confidence_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: confusion matrix
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels, ax=matrix_ax)
        matrix_ax.set_xlabel("Predicted")
        matrix_ax.set_ylabel("True")
        matrix_ax.set_title("Confusion Matrix")

        # Right panel: confidence of correct vs. incorrect predictions
        correct_confidence, incorrect_confidence = [], []
        for score, predicted, actual in zip(pred_confidence, predicted_labels, true_labels):
            if predicted == actual:
                correct_confidence.append(score)
            else:
                incorrect_confidence.append(score)
        confidence_ax.hist(correct_confidence, bins=50, color='green', alpha=0.7, label='Correct Predictions')
        confidence_ax.hist(incorrect_confidence, bins=50, color='red', alpha=0.7, label='Incorrect Predictions')
        confidence_ax.set_xlabel('Confidence score')
        confidence_ax.set_ylabel('Number of predictions')
        confidence_ax.set_title('Confidence score of predictions')
        confidence_ax.legend()

        plt.tight_layout()
        plt.show()

    # Raw dump of the results (used to collect them from the cluster logs)
    print(f"\npredicted_labels = {predicted_labels}")
    print(f"\ntrue_labels = {true_labels}")
    print(f"\npred_confidence = {pred_confidence}")
    print(f"\nclass_labels = {class_labels}")
    

def train_test(model, train_loader, val_loader, epochs, optimizer, scheduler, device):
    """Fine-tune ``model`` and score it on ``val_loader`` after every epoch.

    Args:
        model: model called as ``model(inputs, labels=labels)``; the result
            must expose ``.loss`` and ``.logits``.
        train_loader: batches used for optimisation ('text' and 'label' tensors).
        val_loader: batches scored after each epoch, without gradient updates.
        epochs: number of epochs.
        optimizer, scheduler: both are stepped once per training batch.
        device: device the batches are moved to.

    Returns:
        A 9-tuple, in this order: train losses, validation losses, train
        accuracies, validation accuracies, mean train confidences, mean
        validation confidences (all with one entry per epoch), the per-sample
        train confidences of the LAST epoch only, balanced train accuracies and
        balanced validation accuracies (one entry per epoch).
    """
    # NOTE(review): no attention_mask is given to the model although the inputs
    # look padded by the preprocessing helpers -- confirm whether one is needed.
    accuracy_metric = load_metric("accuracy")

    def _top_confidence(logits):
        # Highest softmax probability of each sample, as a NumPy array
        return torch.softmax(logits, dim=1).max(dim=1).values.cpu().detach().numpy()

    def _accuracy(predictions, references):
        return accuracy_metric.compute(predictions=predictions, references=references)["accuracy"]

    train_losses, val_losses = [], []
    avg_train_acc_per_epoch, avg_val_acc_per_epoch = [], []
    avg_balanced_train_acc_per_epoch, avg_balanced_val_acc_per_epoch = [], []
    train_confidence_scores_avg_per_epoch = []  # mean confidence on the train set
    val_confidence_scores = []                  # mean confidence on the validation set

    for epoch in range(epochs):
        # ---------- training pass ----------
        model.train()
        train_loss, all_preds_train, all_labels_train = [], [], []
        all_train_confidence = []

        for batch in tqdm(train_loader, position = 0, desc= f"epoch {epoch} running..."):
            optimizer.zero_grad()
            inputs = batch['text'].to(device)
            labels = batch['label'].to(device)

            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            train_loss.append(loss.item())

            all_preds_train.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy().tolist())
            all_labels_train.extend(labels.tolist())
            all_train_confidence.extend(_top_confidence(outputs.logits))

            loss.backward()
            optimizer.step()
            scheduler.step()

        train_losses.append(sum(train_loss) / len(train_loss))
        avg_train_acc_per_epoch.append(_accuracy(all_preds_train, all_labels_train))
        avg_balanced_train_acc_per_epoch.append(balanced_accuracy_score(all_labels_train, all_preds_train))
        train_confidence_scores_avg_per_epoch.append(np.mean(all_train_confidence))

        # ---------- validation pass ----------
        model.eval()
        val_loss, all_preds_val, all_labels_val = [], [], []
        val_confidence = []

        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['text'].to(device)
                labels = batch['label'].to(device)

                outputs = model(inputs, labels=labels)
                val_loss.append(outputs.loss.item())

                all_preds_val.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy().tolist())
                all_labels_val.extend(labels.tolist())
                val_confidence.extend(_top_confidence(outputs.logits))

        val_losses.append(sum(val_loss) / len(val_loss))
        avg_val_acc_per_epoch.append(_accuracy(all_preds_val, all_labels_val))
        avg_balanced_val_acc_per_epoch.append(balanced_accuracy_score(all_labels_val, all_preds_val))
        val_confidence_scores.append(np.mean(val_confidence))

    return (
        train_losses,
        val_losses,
        avg_train_acc_per_epoch,
        avg_val_acc_per_epoch,
        train_confidence_scores_avg_per_epoch,
        val_confidence_scores,
        all_train_confidence,
        avg_balanced_train_acc_per_epoch,
        avg_balanced_val_acc_per_epoch,
    )

def prepare_test_dataset(df_sentences_test, batch_size, label_encoder=None):
    """Tokenise a test frame and wrap it in a ``DataLoader``.

    Args:
        df_sentences_test: DataFrame with a 'sentence' (text) and a 'y' (label) column.
        batch_size: batch size of the returned loader.
        label_encoder: optional, already fitted ``LabelEncoder`` (typically the
            one used for training). When given, labels are encoded with it so
            the class ids match the ones the model was trained on. When omitted,
            a new encoder is fitted on this frame alone (previous behaviour),
            which only yields matching ids if the frame contains every class.

    Returns:
        ``(test_loader, label_encoder_test)``: the loader and the encoder that
        produced its label ids.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when ``label_encoder``
            is given and the frame contains a label it has not seen.
    """
    df_test = pd.DataFrame()
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # NOTE(review): the padding length is the largest *whitespace word* count,
    # while `encode` counts tokens (special tokens included). Sentences with
    # more tokens than that are cut by `truncation=True` -- confirm this is intended.
    max_length_test = max(df_sentences_test['sentence'].apply(lambda sentence: len(sentence.split())))
    df_test['text'] = df_sentences_test['sentence'].apply(
            lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_test))

    if label_encoder is None:
        label_encoder_test = LabelEncoder()
        df_test['label'] = label_encoder_test.fit_transform(df_sentences_test['y'])
    else:
        # Reuse the caller's mapping instead of learning a new one
        label_encoder_test = label_encoder
        df_test['label'] = label_encoder_test.transform(df_sentences_test['y'])

    # Create a custom dataset
    class CustomDataset(Dataset):
        """Map-style dataset over parallel arrays of token ids and label ids."""
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
                'text': torch.tensor(self.text[idx], dtype=torch.long),
                'label': torch.tensor(self.label[idx], dtype=torch.long)
            }

    test_dataset = CustomDataset(df_test['text'].values, df_test['label'].values)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return test_loader, label_encoder_test

def preprocess_data_train_test(df_sentences_train, df_sentences_test):
    """Tokenise sentences, encode labels and build train/test torch datasets.

    Unlike ``preprocess_data`` no validation split is carved out: the whole
    training frame becomes the training dataset.

    Args:
        df_sentences_train: DataFrame with a 'sentence' (text) and a 'y' (label) column.
        df_sentences_test: DataFrame with the same two columns.

    Returns:
        Tuple ``(train_dataset_pp, test_dataset_pp, label_encoder)``. Both
        datasets yield ``{'text': LongTensor, 'label': LongTensor}`` items.
        ``label_encoder`` is the single fitted ``LabelEncoder`` shared by both.
    """
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    # Preprocess data and labels
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # NOTE(review): the padding length is the largest *whitespace word* count,
    # while `encode` counts tokens (special tokens included). Sentences with
    # more tokens than that are cut by `truncation=True` -- confirm this is intended.
    max_length_train = max(df_sentences_train['sentence'].apply(lambda sentence: len(sentence.split())))
    max_length_test = max(df_sentences_test['sentence'].apply(lambda sentence: len(sentence.split())))

    df_train['text'] = df_sentences_train['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_train))
    df_test['text'] = df_sentences_test['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_test))

    # Fit ONE encoder on the labels of both frames. Two independently fitted
    # encoders only agree when both frames contain exactly the same classes;
    # otherwise the same integer id would mean different labels in train and test.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([df_sentences_train['y'], df_sentences_test['y']], ignore_index=True))
    df_train['label'] = label_encoder.transform(df_sentences_train['y'])
    df_test['label'] = label_encoder.transform(df_sentences_test['y'])

    # No validation split here: all training rows are used for training
    train_data = df_train
    test_data = df_test
    print(f"Train data : {len(train_data)}")
    print(f"Test data : {len(test_data)}")

    # Create a custom dataset
    class CustomDataset(Dataset):
        """Map-style dataset over parallel arrays of token ids and label ids."""
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
              'text': torch.tensor(self.text[idx], dtype=torch.long),
              'label': torch.tensor(self.label[idx], dtype=torch.long)
          }
    train_dataset_pp = CustomDataset(train_data['text'].values, train_data['label'].values)
    test_dataset_pp = CustomDataset(test_data['text'].values, test_data['label'].values)

    return train_dataset_pp, test_dataset_pp, label_encoder


---

# Multiclass CLF

#### After HP search with CV

In [None]:
# Final run with the hyper-parameters retained after the search.
# Any entry of `list_test_datasets` can be substituted for the test set below.
test_dataset = original_test_dataset

# Tokenise both frames and encode their labels
train_dataset_pp, test_dataset_pp, label_encoder_test = preprocess_data_train_test(
    train_dataset,
    test_dataset,
)

# Pre-trained BERT with an 8-class classification head
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model = model.to(device)

# Training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5

# Data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset_pp, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset_pp, batch_size=batch_size)

# AdamW with a linear decay and no warm-up
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * epochs,
)

# Train for 3 epochs with batch size 8; the test loader is scored after every epoch
(
    train_losses,
    test_losses,
    avg_train_acc_per_epoch,
    avg_test_acc_per_epoch,
    train_confidence_scores,
    test_confidence_scores,
    all_train_confidence,
    avg_balanced_train_acc_per_epoch,
    avg_balanced_val_acc_per_epoch,
) = train_test(
    model,
    train_loader,
    test_loader,
    epochs=epochs,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
)


In [None]:
# Train vs. test curves, one point per epoch: loss on the left, accuracy on the right
fig, axs = plt.subplots(1,2, figsize = (12,5))
loss_ax, acc_ax = axs
epoch_axis = range(epochs)
epoch_ticks = np.arange(epochs)

loss_ax.plot(epoch_axis, train_losses, label='Training Loss')
loss_ax.plot(epoch_axis, test_losses, label='Test Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Train and Test Loss over Epochs')
loss_ax.set_xticks(epoch_ticks)
loss_ax.legend()

acc_ax.plot(epoch_axis, avg_train_acc_per_epoch, label='Training Accuracy')
acc_ax.plot(epoch_axis, avg_test_acc_per_epoch, label='Test Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Train and Test Accuracy over Epochs')
acc_ax.set_xticks(epoch_ticks)
acc_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Train only (no per-epoch scoring); the test loader is kept for the evaluation cell.
# Any entry of `list_test_datasets` can be substituted for the test set below.
test_dataset = original_test_dataset

# Tokenise both frames and encode their labels
train_dataset_pp, test_dataset_pp, label_encoder_test = preprocess_data_train_test(
    train_dataset,
    test_dataset,
)

# Pre-trained BERT with an 8-class classification head
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model = model.to(device)

# Training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5

# Data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset_pp, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset_pp, batch_size=batch_size)

# AdamW with a linear decay and no warm-up
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * epochs,
)

train_losses, avg_acc_per_epoch = train(
    model,
    train_loader,
    epochs,
    optimizer,
    scheduler,
    device,
    plot_visualization=False,
)

In [None]:
# Score the model trained in the previous cell on `test_loader`.
# `plot_results = True` also prints the classification report and draws the
# confusion matrix and the confidence histograms.
evaluate(model = model,
         test_loader = test_loader,
         label_encoder = label_encoder_test,
         device = device,
         plot_results = True,
         accuracy_metric = load_metric("accuracy"))

#### Old code

In [None]:
# Quick experiment on the first 1000 training rows, with a validation split.
# Any entry of `list_test_datasets` can be substituted for the test set below.
test_dataset = original_test_dataset

# Tokenise the frames and encode their labels
train_dataset_pp, val_dataset_pp, test_dataset_pp, label_encoder_test = preprocess_data(
    train_dataset.iloc[:1000],
    test_dataset,
)

# Pre-trained BERT with an 8-class classification head
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model = model.to(device)

# Training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5
freeze_weights = False

train_loader, val_loader, test_loader, optimizer, scheduler = prepare_model(
    model,
    train_dataset_pp,
    val_dataset_pp,
    test_dataset_pp,
    freeze_weights,
    batch_size,
    epochs,
    learning_rate,
)

In [None]:
%%time

train_losses, val_losses, avg_train_acc_per_epoch, avg_val_acc_per_epoch, train_confidence_scores, val_confidence_scores  = train_test(
                                                                                                                                model,
                                                                                                                                train_loader,
                                                                                                                                val_loader,
                                                                                                                                epochs=epochs,
                                                                                                                                optimizer=optimizer,
                                                                                                                                scheduler=scheduler,
                                                                                                                                device=device)

In [None]:
# Train vs. validation curves, one point per epoch: loss on the left, accuracy on the right
fig, axs = plt.subplots(1,2, figsize = (12,5))
loss_ax, acc_ax = axs
epoch_axis = range(epochs)
epoch_ticks = np.arange(epochs)

loss_ax.plot(epoch_axis, train_losses, label='Training Loss')
loss_ax.plot(epoch_axis, val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Train and Val Loss over Epochs')
loss_ax.set_xticks(epoch_ticks)
loss_ax.legend()

acc_ax.plot(epoch_axis, avg_train_acc_per_epoch, label='Training Accuracy')
acc_ax.plot(epoch_axis, avg_val_acc_per_epoch, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Train and Val Accuracy over Epochs')
acc_ax.set_xticks(epoch_ticks)
acc_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# evaluate model on test dataset
# The loader and encoder come from the `prepare_model` / `preprocess_data`
# cells above; the commented line below is the alternative that rebuilds them.
#test_loader, label_encoder_test = prepare_test_dataset(test_dataset, batch_size)

# `plot_results = True` also prints the classification report and draws the
# confusion matrix and the confidence histograms.
evaluate(model = model,
         test_loader = test_loader,
         label_encoder = label_encoder_test,
         device = device,
         plot_results = True,
         accuracy_metric = load_metric("accuracy"))

In [None]:
# Evaluate the current model on every test-set variant (original and resampled).
# NOTE: each iteration rebinds the notebook-level `test_loader` and
# `label_encoder_test`, so after this cell they refer to the LAST dataset.
# NOTE(review): `prepare_test_dataset` fits a new LabelEncoder on each dataset;
# the ids match the training ones only if every dataset contains all classes -- confirm.
for name, test_data in list_test_datasets.items():
    print(f"Test dataset: {name}")
    test_loader, label_encoder_test = prepare_test_dataset(test_data, batch_size)

    evaluate(model = model,
             test_loader = test_loader,
             label_encoder = label_encoder_test,
             device = device,
             plot_results = True,
             accuracy_metric = load_metric("accuracy"))

---

# Multiclass CLF with CV

Functions:
- preprocess_data_for_CV
- cross_validate

In [None]:
def preprocess_data_for_CV(df_sentences_train, df_sentences_test, train_index, val_index):
    """Tokenise sentences, encode labels and build the datasets of one CV fold.

    Args:
        df_sentences_train: DataFrame with a 'sentence' (text) and a 'y' (label)
            column; the fold's train and validation rows are taken from it.
        df_sentences_test: DataFrame with the same two columns, used as test set.
        train_index: positional row indices of the fold's training rows.
        val_index: positional row indices of the fold's validation rows.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset, label_encoder)``.
        Each dataset yields ``{'text': LongTensor, 'label': LongTensor}`` items.
        ``label_encoder`` is the single fitted ``LabelEncoder`` shared by all three.
    """
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()

    # Preprocess data and labels
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # NOTE(review): the padding length is the largest *whitespace word* count,
    # while `encode` counts tokens (special tokens included). Sentences with
    # more tokens than that are cut by `truncation=True` -- confirm this is intended.
    max_length_train = max(df_sentences_train['sentence'].apply(lambda sentence: len(sentence.split())))
    max_length_test = max(df_sentences_test['sentence'].apply(lambda sentence: len(sentence.split())))

    df_train['text'] = df_sentences_train['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_train))
    df_test['text'] = df_sentences_test['sentence'].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, padding='max_length', truncation = True, max_length=max_length_test))

    # Fit ONE encoder on the labels of both frames. Two independently fitted
    # encoders only agree when both frames contain exactly the same classes;
    # otherwise the same integer id would mean different labels in train and test.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([df_sentences_train['y'], df_sentences_test['y']], ignore_index=True))
    df_train['label'] = label_encoder.transform(df_sentences_train['y'])
    df_test['label'] = label_encoder.transform(df_sentences_test['y'])

    # Split the data into training and validation sets with CV splits
    train_data = df_train.iloc[train_index]
    val_data = df_train.iloc[val_index]

    test_data = df_test
    print(f"Train data : {len(train_data)}")
    print(f"Val data : {len(val_data)}")
    print(f"Test data : {len(test_data)}")

    # Create a custom dataset
    class CustomDataset(Dataset):
        """Map-style dataset over parallel arrays of token ids and label ids."""
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
              'text': torch.tensor(self.text[idx], dtype=torch.long),
              'label': torch.tensor(self.label[idx], dtype=torch.long)
            }
    train_dataset = CustomDataset(train_data['text'].values, train_data['label'].values)
    val_dataset = CustomDataset(val_data['text'].values, val_data['label'].values)
    test_dataset = CustomDataset(test_data['text'].values, test_data['label'].values)

    return train_dataset, val_dataset, test_dataset, label_encoder

def cross_validate(df_sentences_train, df_sentences_test, freeze_weights, batch_size, epochs, learning_rate, n_splits, train_loss_list, train_acc_list, train_balanced_acc_list, val_loss_list, val_acc_list, val_balanced_acc_list, device, num_labels=8):
    """K-fold cross-validation of a `bert-base-uncased` sequence classifier.

    For every fold a fresh pre-trained model is created, prepared with
    `prepare_model` and trained with `train_test`. The per-epoch histories
    returned by `train_test` are appended (in place) to the `*_list` arguments,
    fold after fold.

    Parameters
    ----------
    df_sentences_train, df_sentences_test : DataFrame
        Forwarded to `preprocess_data_for_CV` together with the fold indices.
    freeze_weights, batch_size, epochs, learning_rate :
        Forwarded to `prepare_model` (`epochs` is also passed to `train_test`).
    n_splits : int
        Number of folds of the shuffled `KFold` (fixed `random_state=42`).
    train_loss_list, train_acc_list, train_balanced_acc_list,
    val_loss_list, val_acc_list, val_balanced_acc_list : list
        Accumulators that are extended in place and returned.
    device : torch.device
        Device the model is moved to.
    num_labels : int, default 8
        Size of the classification head.

    Returns
    -------
    tuple
        `(train_loss_list, val_loss_list, train_acc_list, train_balanced_acc_list,
        val_acc_list, val_balanced_acc_list, label_encoder_test, test_loader,
        best_model)`. `label_encoder_test` and `test_loader` are the ones built
        during the last fold; `best_model` is a state dict (CPU tensors) of the
        fold with the highest last-epoch validation accuracy.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    best_val_acc = None
    best_model = None

    for fold, (train_index, val_index) in enumerate(kf.split(df_sentences_train)):
        print(f"Fold {fold + 1}:")

        train_dataset, val_dataset, test_dataset, label_encoder_test = preprocess_data_for_CV(df_sentences_train, df_sentences_test, train_index, val_index)

        # Every fold starts again from the pre-trained checkpoint
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
        train_loader, val_loader, test_loader, optimizer, scheduler = prepare_model(model, train_dataset, val_dataset, test_dataset, freeze_weights, batch_size, epochs, learning_rate)

        train_loss_fold, val_loss_fold, train_acc_fold, val_acc_fold, _, _, _,  train_balanced_acc_fold, val_balanced_acc_fold = train_test(model, train_loader, val_loader, epochs, optimizer, scheduler, device)

        train_loss_list.extend(train_loss_fold)
        train_acc_list.extend(train_acc_fold)
        val_loss_list.extend(val_loss_fold)
        val_acc_list.extend(val_acc_fold)
        train_balanced_acc_list.extend(train_balanced_acc_fold)
        val_balanced_acc_list.extend(val_balanced_acc_fold)

        # Model selection uses the validation accuracy of the last epoch of the fold
        val_accuracy = val_acc_fold[-1]

        # `best_model is None` guarantees that a state dict is returned even when
        # every fold reaches an accuracy of 0 (the previous `> 0.0` test returned None).
        if best_model is None or val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            # Keep a detached CPU copy: state_dict() only references the live
            # parameters, which would otherwise stay on the device while the
            # next folds allocate their own models.
            best_model = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

    return train_loss_list, val_loss_list, train_acc_list, train_balanced_acc_list, val_acc_list, val_balanced_acc_list, label_encoder_test, test_loader, best_model

### Train CV

In [None]:
%%time

# Hyperparameters of the multi-class cross-validation run
batch_size, epochs, learning_rate, n_splits = 8, 3, 2e-5, 3

# Metric histories; cross_validate extends them with the values of every fold
train_loss_list, train_acc_list, train_balanced_acc_list = [], [], []
val_loss_list, val_acc_list, val_balanced_acc_list = [], [], []

# Only the first 1000 training sentences are used for this run
(
    train_loss_list,
    val_loss_list,
    train_acc_list,
    train_balanced_acc_list,
    val_acc_list,
    val_balanced_acc_list,
    label_encoder_test,
    test_loader,
    best_model,
) = cross_validate(
    df_sentences_train=train_dataset.iloc[:1000],
    df_sentences_test=list_test_datasets['original_test_dataset'],
    freeze_weights=False,
    batch_size=batch_size,
    epochs=epochs,
    learning_rate=learning_rate,
    n_splits=n_splits,
    train_loss_list=train_loss_list,
    train_acc_list=train_acc_list,
    train_balanced_acc_list=train_balanced_acc_list,
    val_loss_list=val_loss_list,
    val_acc_list=val_acc_list,
    val_balanced_acc_list=val_balanced_acc_list,
    device=device,
)

Print results (loss and accuracy)

In [None]:
# Run configuration
print(
    f"bs = {batch_size}",
    f"n_splits = {n_splits}",
    f"epochs = {epochs}",
    sep="\n",
)

# Raw metric histories (all folds concatenated), each preceded by a blank line
print(
    f"\ntrain_loss_list = {train_loss_list}",
    f"\nval_loss_list = {val_loss_list}",
    f"\ntrain_acc_list = {train_acc_list}",
    f"\nval_acc_list = {val_acc_list}",
    f"\ntrain_balanced_acc_list = {train_balanced_acc_list}",
    f"\nval_balanced_acc_list = {val_balanced_acc_list}",
    sep="\n",
)

In [None]:
# Summarise the cross-validation histories and plot them.
# Every *_list is reshaped to (n_splits, epochs), i.e. one row per fold and one
# column per epoch, so the lists must contain exactly n_splits * epochs values.
# The plotted band is the 2.5th-97.5th percentile range across folds for each epoch.

# --- Loss ---
train_loss_array = np.array(train_loss_list).reshape(n_splits,epochs)
mean_train_loss = np.mean(train_loss_array, axis=0)
ci_train_loss = np.percentile(train_loss_array, [2.5, 97.5], axis=0)

val_loss_array = np.array(val_loss_list).reshape(n_splits,epochs)
mean_val_loss = np.mean(val_loss_array, axis=0)
ci_val_loss = np.percentile(val_loss_array, [2.5, 97.5], axis=0)

# One table per curve: epoch index, mean across folds, lower/upper band
df_train_loss = pd.DataFrame({
    'Epochs': np.arange(epochs),
    'Mean_Train_Loss': mean_train_loss,
    'Lower_CI': ci_train_loss[0],
    'Upper_CI': ci_train_loss[1]})

df_val_loss = pd.DataFrame({
    'Epochs': np.arange(epochs),
    'Mean_Val_Loss': mean_val_loss,
    'Lower_CI': ci_val_loss[0],
    'Upper_CI': ci_val_loss[1]})

# --- Accuracy (second subplot) ---
train_acc_array = np.array(train_acc_list).reshape(n_splits, epochs)
val_acc_array = np.array(val_acc_list).reshape(n_splits, epochs)
ci_train_acc = np.percentile(train_acc_array, [2.5, 97.5], axis=0)

mean_train_acc = np.mean(train_acc_array, axis=0)
mean_val_acc = np.mean(val_acc_array, axis=0)
ci_val_acc = np.percentile(val_acc_array, [2.5, 97.5], axis=0)

df_train_acc = pd.DataFrame({
    'Epochs': np.arange(epochs), 
    'Mean_Train_Accuracy': mean_train_acc,
    'Lower_CI': ci_train_acc[0],
    'Upper_CI': ci_train_acc[1]})

df_val_acc = pd.DataFrame({
    'Epochs': np.arange(epochs),
    'Mean_Val_Accuracy': mean_val_acc,
    'Lower_CI': ci_val_acc[0],
    'Upper_CI': ci_val_acc[1]})

# --- Balanced accuracy (third subplot) ---
train_balanced_acc_array = np.array(train_balanced_acc_list).reshape(n_splits, epochs)
val_balanced_acc_array = np.array(val_balanced_acc_list).reshape(n_splits, epochs)
ci_train_balanced_acc = np.percentile(train_balanced_acc_array, [2.5, 97.5], axis=0)

mean_train_balanced_acc = np.mean(train_balanced_acc_array, axis=0)
mean_val_balanced_acc = np.mean(val_balanced_acc_array, axis=0)
ci_val_balanced_acc = np.percentile(val_balanced_acc_array, [2.5, 97.5], axis=0)

# The band must come from the balanced-accuracy percentiles; the plain-accuracy
# percentiles were used here before, which drew the wrong band on the third panel.
df_train_balanced_acc = pd.DataFrame({
    'Epochs': np.arange(epochs), 
    'Mean_Train_Balanced_Accuracy': mean_train_balanced_acc,
    'Lower_CI': ci_train_balanced_acc[0],
    'Upper_CI': ci_train_balanced_acc[1]})

df_val_balanced_acc = pd.DataFrame({
    'Epochs': np.arange(epochs),
    'Mean_Val_Balanced_Accuracy': mean_val_balanced_acc,
    'Lower_CI': ci_val_balanced_acc[0],
    'Upper_CI': ci_val_balanced_acc[1]})

fig, axs = plt.subplots(1,3, figsize=(12,5))
# Mean curves (blue = train, orange = validation), one panel per metric
sns.pointplot(data=df_train_loss, x='Epochs', y='Mean_Train_Loss', color='blue', ax = axs[0])
sns.pointplot(data=df_val_loss, x='Epochs', y='Mean_Val_Loss', color='orange', ax = axs[0])

sns.pointplot(data=df_train_acc, x='Epochs', y='Mean_Train_Accuracy', color='blue', ax = axs[1])
sns.pointplot(data=df_val_acc, x='Epochs', y='Mean_Val_Accuracy', color='orange', ax = axs[1])

sns.pointplot(data=df_train_balanced_acc, x='Epochs', y='Mean_Train_Balanced_Accuracy', color='blue', ax = axs[2])
sns.pointplot(data=df_val_balanced_acc, x='Epochs', y='Mean_Val_Balanced_Accuracy', color='orange', ax = axs[2])

# Shade the percentile band around each mean curve
axs[0].fill_between(df_train_loss['Epochs'], df_train_loss['Lower_CI'], df_train_loss['Upper_CI'], color='blue', alpha=0.3, label = 'train')
axs[0].fill_between(df_val_loss['Epochs'], df_val_loss['Lower_CI'], df_val_loss['Upper_CI'], color='orange', alpha=0.3, label = 'val')
axs[0].legend()
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].set_title('CV Loss with 95% CI')

axs[1].fill_between(df_train_acc['Epochs'], df_train_acc['Lower_CI'], df_train_acc['Upper_CI'], color='blue', alpha=0.3, label = 'train')
axs[1].fill_between(df_val_acc['Epochs'], df_val_acc['Lower_CI'], df_val_acc['Upper_CI'], color='orange', alpha=0.3, label = 'val')
axs[1].legend()
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Accuracy')
axs[1].set_title('CV Accuracy with 95% CI')

axs[2].fill_between(df_train_balanced_acc['Epochs'], df_train_balanced_acc['Lower_CI'], df_train_balanced_acc['Upper_CI'], color='blue', alpha=0.3, label = 'train')
axs[2].fill_between(df_val_balanced_acc['Epochs'], df_val_balanced_acc['Lower_CI'], df_val_balanced_acc['Upper_CI'], color='orange', alpha=0.3, label = 'val')
# Explicit per-panel legends: plt.legend() only acts on the current axes
axs[2].legend()
axs[2].set_xlabel('Epochs')
axs[2].set_ylabel('Balanced Accuracy')
axs[2].set_title('CV Balanced Accuracy with 95% CI')

plt.show()

Evaluation of the best model

In [None]:
# Rebuild the 8-label architecture, then restore the weights selected by the cross-validation
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model = model.to(device)
model.load_state_dict(best_model)

# Score the restored model on the test loader.
# NOTE(review): `evaluate` is called as a function here, so it has to be the
# notebook's own evaluate() from another cell, shadowing the `evaluate`
# package imported at the top — confirm the cell order.
evaluate(
    model=model,
    test_loader=test_loader,
    label_encoder=label_encoder_test,
    device=device,
    plot_results=True,
    accuracy_metric=load_metric("accuracy"),
)

---

# Multiple binary classifiers (one per reflective category vs. 'Other') with CV

In [None]:
def cross_validate_bin(df_sentences_train, df_sentences_test, freeze_weights, batch_size, epochs, learning_rate, n_splits, train_loss_list, train_acc_list, train_balanced_acc_list,val_loss_list, val_acc_list, val_balanced_acc_list, device, num_labels=2):
    """K-fold cross-validation of a binary `bert-base-uncased` classifier.

    Same procedure as the multi-class cross-validation, with a two-label
    classification head by default: for every fold a fresh pre-trained model
    is created, prepared with `prepare_model` and trained with `train_test`,
    and the per-epoch histories are appended (in place) to the `*_list`
    arguments.

    Parameters
    ----------
    df_sentences_train, df_sentences_test : DataFrame
        Forwarded to `preprocess_data_for_CV` together with the fold indices.
    freeze_weights, batch_size, epochs, learning_rate :
        Forwarded to `prepare_model` (`epochs` is also passed to `train_test`).
    n_splits : int
        Number of folds of the shuffled `KFold` (fixed `random_state=42`).
    train_loss_list, train_acc_list, train_balanced_acc_list,
    val_loss_list, val_acc_list, val_balanced_acc_list : list
        Accumulators that are extended in place and returned.
    device : torch.device
        Device the model is moved to.
    num_labels : int, default 2
        Size of the classification head.

    Returns
    -------
    tuple
        `(train_loss_list, val_loss_list, train_acc_list, train_balanced_acc_list,
        val_acc_list, val_balanced_acc_list, label_encoder_test, test_loader,
        best_model)`. `label_encoder_test` and `test_loader` are the ones built
        during the last fold; `best_model` is a state dict (CPU tensors) of the
        fold with the highest last-epoch validation accuracy.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    best_val_acc = None
    best_model = None

    for fold, (train_index, val_index) in enumerate(kf.split(df_sentences_train)):
        print(f"Fold {fold + 1}:")

        train_dataset, val_dataset, test_dataset, label_encoder_test = preprocess_data_for_CV(df_sentences_train, df_sentences_test, train_index, val_index)

        # Every fold starts again from the pre-trained checkpoint
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
        train_loader, val_loader, test_loader, optimizer, scheduler = prepare_model(model, train_dataset, val_dataset, test_dataset, freeze_weights, batch_size, epochs, learning_rate)

        train_loss_fold, val_loss_fold, train_acc_fold, val_acc_fold, _, _, _, train_balanced_acc_fold, val_balanced_acc_fold = train_test(model, train_loader, val_loader, epochs, optimizer, scheduler, device)

        train_loss_list.extend(train_loss_fold)
        train_acc_list.extend(train_acc_fold)
        val_loss_list.extend(val_loss_fold)
        val_acc_list.extend(val_acc_fold)
        train_balanced_acc_list.extend(train_balanced_acc_fold)
        val_balanced_acc_list.extend(val_balanced_acc_fold)

        # Model selection uses the validation accuracy of the last epoch of the fold
        val_accuracy = val_acc_fold[-1]

        # `best_model is None` guarantees that a state dict is returned even when
        # every fold reaches an accuracy of 0 (the previous `> 0.0` test returned None).
        if best_model is None or val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            # Keep a detached CPU copy: state_dict() only references the live
            # parameters, which would otherwise stay on the device while the
            # next folds (and the next categories) allocate their own models.
            best_model = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

    return train_loss_list, val_loss_list, train_acc_list, train_balanced_acc_list,val_acc_list, val_balanced_acc_list, label_encoder_test, test_loader, best_model

In [None]:
# Rank the categories by how often they occur in the training labels
reflective_cat_in_order = train_dataset["y"].value_counts().sort_values(ascending=False).index.tolist()
reflective_cat_in_order_wo_other = list(filter(lambda item: item != 'Other', reflective_cat_in_order))
print(f"List of reflective categories in order : {reflective_cat_in_order}")
print(f"List of reflective categories in order without 'Other': {reflective_cat_in_order_wo_other}")

# NOTE(review): despite the "topN" name, the slice [6:] skips the six most
# frequent categories and keeps the remaining ones — confirm this is intended.
reflective_categories = topN_classes = reflective_cat_in_order_wo_other[6:]
print(f"\ntopN reflective categories: {reflective_categories}")

In [None]:
%%time
# Hyperparameters shared by the cross-validation of every category
batch_size, epochs, learning_rate, n_splits = 8, 5, 2e-5, 2

# Per-category summary tables (keyed by category name) and best weights per category
dfs_train_loss, dfs_train_acc, dfs_train_balanced_acc = {}, {}, {}
dfs_val_loss, dfs_val_acc, dfs_val_balanced_acc = {}, {}, {}
best_model = {}

print(f"\n\nLaunching {n_splits}-fold CV per class with : {reflective_categories}")

for i, cat in enumerate(reflective_categories):
    print(f"\n\nCV for {cat}")

    # Fresh metric histories for this category
    train_loss_list, train_acc_list, train_balanced_acc_list = [], [], []
    val_loss_list, val_acc_list, val_balanced_acc_list = [], [], []

    # Binary relabelling of the first 2000 training sentences: `cat` vs 'Other'
    df_sentences_bin = train_dataset.iloc[:2000].copy()
    df_sentences_bin['y'] = np.where(df_sentences_bin['y'] == cat, cat, 'Other')

    # The test loader returned by the CV is discarded (`_`); the best fold
    # weights are stored under the category name.
    (
        train_loss_list,
        val_loss_list,
        train_acc_list,
        train_balanced_acc_list,
        val_acc_list,
        val_balanced_acc_list,
        label_encoder_test,
        _,
        best_model[cat],
    ) = cross_validate_bin(
        df_sentences_train=df_sentences_bin,
        df_sentences_test=original_test_dataset.iloc[:300],
        freeze_weights=False,
        batch_size=batch_size,
        epochs=epochs,
        learning_rate=learning_rate,
        n_splits=n_splits,
        train_loss_list=train_loss_list,
        train_acc_list=train_acc_list,
        train_balanced_acc_list=train_balanced_acc_list,
        val_loss_list=val_loss_list,
        val_acc_list=val_acc_list,
        val_balanced_acc_list=val_balanced_acc_list,
        device=device,
    )

    # Histories as (fold, epoch) matrices
    train_loss_array = np.reshape(train_loss_list, (n_splits, epochs))
    val_loss_array = np.reshape(val_loss_list, (n_splits, epochs))
    train_acc_array = np.reshape(train_acc_list, (n_splits, epochs))
    val_acc_array = np.reshape(val_acc_list, (n_splits, epochs))
    train_balanced_acc_array = np.reshape(train_balanced_acc_list, (n_splits, epochs))
    val_balanced_acc_array = np.reshape(val_balanced_acc_list, (n_splits, epochs))

    # Mean across folds for every epoch
    mean_train_loss = train_loss_array.mean(axis=0)
    mean_val_loss = val_loss_array.mean(axis=0)
    mean_train_acc = train_acc_array.mean(axis=0)
    mean_val_acc = val_acc_array.mean(axis=0)
    mean_train_balanced_acc = train_balanced_acc_array.mean(axis=0)
    mean_val_balanced_acc = val_balanced_acc_array.mean(axis=0)

    # 2.5th / 97.5th percentiles across folds (row 0 = lower bound, row 1 = upper bound)
    ci_train_loss = np.percentile(train_loss_array, [2.5, 97.5], axis=0)
    ci_val_loss = np.percentile(val_loss_array, [2.5, 97.5], axis=0)
    ci_train_acc = np.percentile(train_acc_array, [2.5, 97.5], axis=0)
    ci_val_acc = np.percentile(val_acc_array, [2.5, 97.5], axis=0)
    ci_train_balanced_acc = np.percentile(train_balanced_acc_array, [2.5, 97.5], axis=0)
    ci_val_balanced_acc = np.percentile(val_balanced_acc_array, [2.5, 97.5], axis=0)

    # One summary table per curve, registered under the category name
    dfs_train_loss[f'{cat}'] = df_train_loss = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Train_Loss': mean_train_loss,
        'Lower_CI': ci_train_loss[0],
        'Upper_CI': ci_train_loss[1],
    })
    dfs_val_loss[f'{cat}'] = df_val_loss = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Val_Loss': mean_val_loss,
        'Lower_CI': ci_val_loss[0],
        'Upper_CI': ci_val_loss[1],
    })
    dfs_train_acc[f'{cat}'] = df_train_acc = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Train_Acc': mean_train_acc,
        'Lower_CI': ci_train_acc[0],
        'Upper_CI': ci_train_acc[1],
    })
    dfs_val_acc[f'{cat}'] = df_val_acc = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Val_Acc': mean_val_acc,
        'Lower_CI': ci_val_acc[0],
        'Upper_CI': ci_val_acc[1],
    })
    dfs_train_balanced_acc[f'{cat}'] = df_train_balanced_acc = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Train_Balanced_Acc': mean_train_balanced_acc,
        'Lower_CI': ci_train_balanced_acc[0],
        'Upper_CI': ci_train_balanced_acc[1],
    })
    dfs_val_balanced_acc[f'{cat}'] = df_val_balanced_acc = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Val_Balanced_Acc': mean_val_balanced_acc,
        'Lower_CI': ci_val_balanced_acc[0],
        'Upper_CI': ci_val_balanced_acc[1],
    })

In [None]:
def _plot_cv_metric_per_class(dfs_train, dfs_val, categories, train_col, val_col, ylabel, suptitle, n_cols=4):
    """Draw one train/validation panel per category for a single CV metric.

    Parameters
    ----------
    dfs_train, dfs_val : dict
        Category name -> summary table with the columns 'Epochs', 'Lower_CI',
        'Upper_CI' and the mean column named by `train_col` / `val_col`.
    categories : sequence
        Categories to plot, one panel each, in this order.
    train_col, val_col : str
        Names of the mean columns in the train / validation tables.
    ylabel : str
        Label of the y axis of every panel.
    suptitle : str
        Title of the whole figure.
    n_cols : int, default 4
        Number of panels per row.
    """
    # Size the grid from the number of categories instead of assuming seven of
    # them (2x4 grid minus one panel): 7 categories still give a 2x4, 15x6 figure.
    n_rows = max(1, -(-len(categories) // n_cols))  # ceiling division
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), sharey=True, squeeze=False)
    panels = list(axs.flat)

    for ax, cat in zip(panels, categories):
        df_train_cat = dfs_train[cat]
        df_val_cat = dfs_val[cat]

        # Mean curves
        sns.pointplot(data=df_train_cat, x='Epochs', y=train_col, color='blue', ax=ax)
        sns.pointplot(data=df_val_cat, x='Epochs', y=val_col, color='orange', ax=ax)

        # Percentile band around each mean curve
        ax.fill_between(df_train_cat['Epochs'], df_train_cat['Lower_CI'], df_train_cat['Upper_CI'], color='blue', alpha=0.3, label='train')
        ax.fill_between(df_val_cat['Epochs'], df_val_cat['Lower_CI'], df_val_cat['Upper_CI'], color='orange', alpha=0.3, label='val')

        ax.set_xlabel('Epochs')
        ax.set_ylabel(ylabel)
        ax.set_title(f"{cat}")
        ax.legend()

    # Remove every panel that received no category (not just the last one)
    for unused_ax in panels[len(categories):]:
        fig.delaxes(unused_ax)

    fig.suptitle(suptitle)
    fig.tight_layout()
    plt.show()


# Loss
_plot_cv_metric_per_class(
    dfs_train_loss, dfs_val_loss, reflective_categories,
    train_col='Mean_Train_Loss', val_col='Mean_Val_Loss', ylabel="Loss",
    suptitle=f"{n_splits}-fold CV Loss with 95% CI per class with bs : {batch_size}, lr : {learning_rate}, {epochs} epochs")

# Accuracy
_plot_cv_metric_per_class(
    dfs_train_acc, dfs_val_acc, reflective_categories,
    train_col='Mean_Train_Acc', val_col='Mean_Val_Acc', ylabel="Accuracy",
    suptitle=f"{n_splits}-fold CV Accuracy with 95% CI per class with bs : {batch_size}, lr : {learning_rate}, {epochs} epochs")

# Balanced accuracy
_plot_cv_metric_per_class(
    dfs_train_balanced_acc, dfs_val_balanced_acc, reflective_categories,
    train_col='Mean_Train_Balanced_Acc', val_col='Mean_Val_Balanced_Acc', ylabel="Balanced Accuracy",
    suptitle=f"{n_splits}-fold CV Balanced Accuracy with 95% CI per class with bs : {batch_size}, lr : {learning_rate}, {epochs} epochs")

### Evaluate for each class

Functions for the evaluation:
- prepare_test_dataset_for_binclf
- evaluate_bin



In [None]:
def prepare_test_dataset_for_binclf(df_sentences_test, batch_size):
    """Turn a test frame into a DataLoader of token ids and encoded labels.

    Parameters
    ----------
    df_sentences_test : DataFrame
        Must provide a 'sentence' column (text) and a 'y' column (labels).
    batch_size : int
        Batch size of the returned loader (no shuffling).

    Returns
    -------
    (DataLoader, LabelEncoder)
        Batches are dicts with the keys 'text' and 'label' (long tensors); the
        encoder is fitted on the labels of `df_sentences_test` itself.

    Notes
    -----
    Every sentence is padded/truncated to the largest whitespace-separated word
    count found in the frame, so a sentence whose token sequence (including the
    special tokens) is longer than that is truncated.
    NOTE(review): the label encoding is derived from the test labels only —
    confirm that it matches the encoding used when the model was trained.
    """

    class CustomDataset(Dataset):
        """Index-based view over parallel arrays of token-id lists and labels."""

        def __init__(self, text, label):
            self.text = text
            self.label = label

        def __len__(self):
            return len(self.text)

        def __getitem__(self, idx):
            token_ids = torch.tensor(self.text[idx], dtype=torch.long)
            target = torch.tensor(self.label[idx], dtype=torch.long)
            return {'text': token_ids, 'label': target}

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Common sequence length = largest number of whitespace-separated words
    max_length_test = max(len(sentence.split()) for sentence in df_sentences_test['sentence'])

    def encode_sentence(sentence):
        return tokenizer.encode(sentence, add_special_tokens=True, padding='max_length', truncation=True, max_length=max_length_test)

    encoded_sentences = df_sentences_test['sentence'].apply(encode_sentence)

    label_encoder_test = LabelEncoder()
    encoded_labels = label_encoder_test.fit_transform(df_sentences_test['y'])

    test_loader = DataLoader(CustomDataset(encoded_sentences.values, encoded_labels), batch_size=batch_size)
    return test_loader, label_encoder_test

#if freeze_weights:
#    # Freeze all layers except the last two
#    for param in model.parameters():
#        param.requires_grad = False
#    for param in model.classifier.parameters():
#        param.requires_grad = True

#####################################
def evaluate_bin(model, test_loader, label_encoder, device, accuracy_metric):
    """Run a classifier over a test loader and collect predictions and confidences.

    Parameters
    ----------
    model : callable
        Called as `model(inputs)`; the result must expose `.logits`.
    test_loader : iterable
        Yields dicts with the tensors 'text' (model inputs) and 'label'.
    label_encoder :
        Object whose `inverse_transform` maps encoded ids back to labels.
    device : torch.device
        Device the batches are moved to.
    accuracy_metric :
        Object whose `compute(predictions=..., references=...)` returns a
        mapping with an "accuracy" entry.

    Returns
    -------
    tuple
        `(accuracy, class_labels, predicted_labels, true_labels, pred_confidence)`
        where `class_labels` is the sorted union of the decoded predicted and
        true labels, and `pred_confidence` holds, per sample, the largest
        softmax probability.
    """
    model.eval()
    predicted_ids, reference_ids, pred_confidence = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch['text'].to(device)
            batch_labels = batch['label'].to(device)
            logits = model(token_ids).logits

            # Hard decision = index of the largest logit
            predicted_ids += logits.argmax(dim=1).cpu().numpy().tolist()
            reference_ids += batch_labels.tolist()

            # Confidence = probability assigned to the predicted class
            top_probability = torch.softmax(logits, dim=1).max(dim=1).values
            pred_confidence.extend(top_probability.cpu().detach().numpy())

    accuracy = accuracy_metric.compute(predictions=predicted_ids, references=reference_ids)["accuracy"]

    # Back from encoded ids to the original label names
    predicted_labels = label_encoder.inverse_transform(predicted_ids)
    true_labels = label_encoder.inverse_transform(reference_ids)

    # Every label that occurs as a prediction or as ground truth, in sorted order
    class_labels = sorted(set(predicted_labels) | set(true_labels))

    return accuracy, class_labels, predicted_labels, true_labels, pred_confidence

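Evaluation on the test set of the best cross-validated binary model of each class (used for the hyperparameter search)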

In [None]:
# Label distribution of the test set
print(f"Length of test data : {len(original_test_dataset)}")
sns.histplot(original_test_dataset['y'])
plt.title('Test Dataset')
plt.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()

# Per-category evaluation results, keyed by category name
accuracy_data, class_labels_data = {}, {}
predicted_labels_data, true_labels_data = {}, {}
pred_confidence_data = {}

for i, cat in enumerate(reflective_categories):
    print(f"Evaluation for {cat}")

    # Each model was trained as a binary classifier, so the test labels are
    # reduced to `cat` vs 'Other' before encoding.
    df_sentences_test_bin = original_test_dataset.copy()
    df_sentences_test_bin['y'] = np.where(df_sentences_test_bin['y'] == cat, cat, 'Other')
    test_loader, label_encoder_test = prepare_test_dataset_for_binclf(df_sentences_test_bin, batch_size)

    # Two-label architecture, filled with the best weights found for this category
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model = model.to(device)
    model.load_state_dict(best_model[cat])

    accuracy, class_labels, predicted_labels, true_labels, pred_confidence = evaluate_bin(
        model=model,
        test_loader=test_loader,
        label_encoder=label_encoder_test,
        device=device,
        accuracy_metric=load_metric("accuracy"),
    )

    accuracy_data[cat], class_labels_data[cat] = accuracy, class_labels
    predicted_labels_data[cat], true_labels_data[cat] = predicted_labels, true_labels
    pred_confidence_data[cat] = pred_confidence

# Unweighted mean of the per-category accuracies
print(f"\nOverall accuracy of multiple binary clf : {np.mean(list(accuracy_data.values())).round(4)}")

# Mean weighted by the number of test sentences of each category
weights = [len(original_test_dataset.loc[original_test_dataset['y'] == cat]) for cat in reflective_categories]
weighted_avg = np.round(
    sum(w * v for w, v in zip(weights, accuracy_data.values())) / sum(weights),
    4,
)
print(f"\nWeights : {weights}")
print(f"Weighted accuracy of multiple binary clf : {weighted_avg}\n\n")

# One column per category, one row per test sentence
predicted_labels_df = pd.DataFrame({cat: predicted_labels_data[cat] for cat in reflective_categories})
true_labels_df = pd.DataFrame({cat: true_labels_data[cat] for cat in reflective_categories})
pred_confidence_df = pd.DataFrame({cat: pred_confidence_data[cat] for cat in reflective_categories})

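Final evaluation: one binary classifier per class, trained on the training set with 3 epochs and batch size 8, then evaluated on the test set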

In [None]:
# Rank the categories by how often they occur in the training labels
reflective_cat_in_order = train_dataset["y"].value_counts().sort_values(ascending=False).index.tolist()
reflective_cat_in_order_wo_other = list(filter(lambda item: item != 'Other', reflective_cat_in_order))
print(f"List of reflective categories in order : {reflective_cat_in_order}")
print(f"List of reflective categories in order without 'Other': {reflective_cat_in_order_wo_other}")

# Keep at most the eight most frequent categories ('Other' excluded)
reflective_categories = topN_classes = reflective_cat_in_order_wo_other[:8]
print(f"\ntopN reflective categories: {reflective_categories}")

# Label distribution of the test set
print(f"Length of test data : {len(original_test_dataset)}")
sns.histplot(original_test_dataset['y'])
plt.title('Test Dataset')
plt.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Per-category evaluation results, keyed by category name
accuracy_data, class_labels_data = {}, {}
predicted_labels_data, true_labels_data = {}, {}
pred_confidence_data = {}

# Training parameters
batch_size = 8
epochs = 3
learning_rate = 2e-5

for i, cat in enumerate(reflective_categories):
    print(f"Training for {cat}")

    # Reduce both the training and the test labels to `cat` vs 'Other'
    train_dataset_bin = train_dataset.copy()
    test_dataset_bin = original_test_dataset.copy()
    train_dataset_bin['y'] = np.where(train_dataset_bin['y'] == cat, cat, 'Other')
    test_dataset_bin['y'] = np.where(test_dataset_bin['y'] == cat, cat, 'Other')

    train_dataset_pp, test_dataset_pp, label_encoder_test = preprocess_data_train_test(train_dataset_bin, test_dataset_bin)

    # Fresh pre-trained model with a two-label head for every category
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model = model.to(device)

    # Only the training batches are shuffled
    train_loader = DataLoader(train_dataset_pp, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset_pp, batch_size=batch_size)

    # AdamW with a linear schedule (no warm-up) over all training steps
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    train_losses, avg_acc_per_epoch = train(model, train_loader, epochs, optimizer, scheduler, device, False)

    print(f"Evaluation for {cat}")
    accuracy, class_labels, predicted_labels, true_labels, pred_confidence = evaluate_bin(
        model=model,
        test_loader=test_loader,
        label_encoder=label_encoder_test,
        device=device,
        accuracy_metric=load_metric("accuracy"),
    )

    accuracy_data[cat], class_labels_data[cat] = accuracy, class_labels
    predicted_labels_data[cat], true_labels_data[cat] = predicted_labels, true_labels
    pred_confidence_data[cat] = pred_confidence

# Unweighted mean of the per-category accuracies
print(f"\nOverall accuracy of multiple binary clf : {np.mean(list(accuracy_data.values())).round(4)}")

# Mean weighted by the number of test sentences of each category
weights = [len(original_test_dataset.loc[original_test_dataset['y'] == cat]) for cat in reflective_categories]
weighted_avg = np.round(
    sum(w * v for w, v in zip(weights, accuracy_data.values())) / sum(weights),
    4,
)
print(f"\nWeights : {weights}")
print(f"Weighted accuracy of multiple binary clf : {weighted_avg}\n\n")

# One column per category, one row per test sentence
predicted_labels_df = pd.DataFrame({cat: predicted_labels_data[cat] for cat in reflective_categories})
true_labels_df = pd.DataFrame({cat: true_labels_data[cat] for cat in reflective_categories})
pred_confidence_df = pd.DataFrame({cat: pred_confidence_data[cat] for cat in reflective_categories})

In [None]:
def _class_grid(n_panels, n_cols=4, **subplot_kwargs):
    """Create a figure with exactly `n_panels` panels laid out on `n_cols` columns.

    The number of rows is derived from `n_panels` and the unused cells of the
    grid are removed, so the layout no longer assumes seven categories
    (7 panels still give the previous 2x4, 15x6 figure).

    Returns the figure and the list of the `n_panels` usable axes.
    """
    n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False, **subplot_kwargs)
    panels = list(axs.flat)
    for unused_ax in panels[n_panels:]:
        fig.delaxes(unused_ax)
    return fig, panels[:n_panels]


print(reflective_categories)

# --- Confusion matrix of every binary classifier ---
fig, axs = _class_grid(len(reflective_categories))

for ax, cat in zip(axs, reflective_categories):
    # Rows = true labels, columns = predicted labels, both ordered as class_labels_data[cat]
    cm = confusion_matrix(true_labels_df[cat].values, predicted_labels_df[cat].values, labels=class_labels_data[cat])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels_data[cat], yticklabels=class_labels_data[cat], ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")

plt.suptitle(f"Confusion matrices per class on test dataset")
plt.tight_layout()
plt.show()

# --- Confidence of correct vs incorrect predictions ---
fig, axs = _class_grid(len(reflective_categories), sharex=True, sharey=True)

for ax, cat in zip(axs, reflective_categories):
    # Boolean mask instead of a per-row Python loop over the Series
    is_correct = predicted_labels_df[cat].to_numpy() == true_labels_df[cat].to_numpy()
    confidence = pred_confidence_df[cat].to_numpy()
    correct_confidence = confidence[is_correct]
    incorrect_confidence = confidence[~is_correct]

    ax.hist(correct_confidence, bins=50, color='green', alpha=0.7, label='Correct Predictions')
    ax.hist(incorrect_confidence, bins=50, color='red', alpha=0.7, label='Incorrect Predictions')
    ax.set_xlabel('Confidence score')
    ax.set_ylabel('Number of predictions')
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")
    ax.legend()

plt.suptitle(f"Confidence scores per class on test dataset")
plt.tight_layout()
plt.show()

In [None]:
# Sanity check: distinct label values in the 'Experience' column of the
# predictions, then of the ground truth, followed by the evaluated categories
for _labels_df in (predicted_labels_df, true_labels_df):
    print(_labels_df['Experience'].unique())
print(reflective_categories)


In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score

# Per-class accuracy, balanced accuracy, F1 and AUC of the binary classifiers,
# followed by their unweighted average across classes.
sum_accuracy = 0.0
sum_weighted_accuracy = 0.0
sum_f1 = 0.0
sum_auc = 0.0
count_classes = 0
# AUC is undefined when only one class is present in the ground truth, so it
# needs its own denominator; dividing by `count_classes` would bias the average
# towards zero whenever a class is skipped.
count_auc_classes = 0

for cat in reflective_categories:
    predicted_labels = predicted_labels_df[cat].values
    true_labels = true_labels_df[cat].values

    # Fit the encoder on every label that can occur (ground truth, predictions and
    # the positive class itself). Fitting on the ground truth only makes
    # `transform` raise as soon as a prediction or `cat` is absent from it.
    label_encoder = LabelEncoder()
    label_encoder.fit(list(true_labels) + list(predicted_labels) + [cat])
    true_labels_encoded = label_encoder.transform(true_labels)
    predicted_labels_encoded = label_encoder.transform(predicted_labels)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
    f1_scores = f1_score(true_labels_encoded, predicted_labels_encoded, pos_label=label_encoder.transform([cat])[0])
    # NOTE(review): AUC is computed from hard predicted labels, not from the
    # confidence scores, so it carries no ranking information beyond the labels.
    auc = roc_auc_score(true_labels_encoded, predicted_labels_encoded) if len(set(true_labels_encoded)) > 1 else None

    # Accumulate for the averages
    sum_accuracy += accuracy
    sum_weighted_accuracy += balanced_accuracy
    sum_f1 += f1_scores
    if auc is not None:
        sum_auc += auc
        count_auc_classes += 1
    count_classes += 1

    # Print metrics for the current class
    print(f"Overall accuracy for class '{cat}' = {accuracy:.4f}")
    print(f"Balanced accuracy for class '{cat}' = {balanced_accuracy:.4f}")
    print(f"F1 score for class '{cat}' = {f1_scores:.4f}")
    if auc is not None:
        print(f"AUC for class '{cat}' = {auc:.4f}")
    else:
        print(f"AUC for class '{cat}' cannot be computed due to insufficient data")
    print("--------------------------------------")

# Calculate averages
avg_accuracy = sum_accuracy / count_classes if count_classes > 0 else 0.0
avg_weighted_accuracy = sum_weighted_accuracy / count_classes if count_classes > 0 else 0.0
avg_f1 = sum_f1 / count_classes if count_classes > 0 else 0.0
avg_auc = sum_auc / count_auc_classes if count_auc_classes > 0 else None

# Print average metrics
print("#########################################")
print("############## AVERAGE ##################")
print("#########################################")
print(f"Average accuracy across classes = {avg_accuracy:.4f}")
print(f"Average weighted accuracy across classes = {avg_weighted_accuracy:.4f}")
print(f"Average F1 score across classes = {avg_f1:.4f}")
if avg_auc is not None:
    print(f"Average AUC across classes = {avg_auc:.4f}")
else:
    print("Average AUC across classes cannot be computed due to insufficient data")


In [None]:
# Dump the raw per-category arrays as `name_<category> = [...]` lines:
# first all predictions, then all ground-truth labels, then all confidences.
frames_to_dump = (
    ("predicted_labels", predicted_labels_df),
    ("true_labels", true_labels_df),
    ("pred_confidence", pred_confidence_df),
)
for prefix, frame in frames_to_dump:
    for cat in reflective_categories:
        print(f"\n{prefix}_{cat.lower()} = {frame[cat].values}\n")

### Evaluate with Cascaded binary classifiers

In [None]:
# Order in which the binary classifiers are applied in the cascade.
cascaded_clf_classes_ordered = [
    'Intention',
    'Learning',
    'Perspective',
    'Difficulty',
    'Belief',
    'Feeling',
    'Experience',
]

# Take a private copy of the balanced test set so the cascade can consume it
# without touching the entry stored in `list_test_datasets`.
test_dataset_cascaded = list_test_datasets['balanced_test_dataset'].copy()
print(f"Length of test data : {len(test_dataset_cascaded)}")

# Class distribution of the test set used by the cascade.
ax = sns.histplot(test_dataset_cascaded['y'])
ax.set_title('Test Dataset')
ax.tick_params(axis='x', labelrotation=45)
plt.tight_layout()
plt.show()
test_dataset_cascaded.head()

In [None]:
# Cascaded evaluation: the binary classifiers are applied one after the other in
# the order of `cascaded_clf_classes_ordered`. Every sample a classifier labels
# as its own class is "claimed" (final prediction = that class) and removed; the
# next classifier only sees what is left. Samples no classifier claims end up
# as 'Other'.
print("Start of the cascaded evaluation")

accuracy_data = {}
class_labels_data = {}
predicted_labels_data = {}
true_labels_data = {}
pred_confidence_data = {}
# Cascade-level results, aligned element by element.
all_preds = []
all_true_labels = []
all_pred_confidence = []

# Work on a private copy so that `test_dataset_cascaded` is left intact and this
# cell can be re-run without first re-creating the test set.
remaining_test_df = test_dataset_cascaded.copy()
remaining_confidence = []

for cat in cascaded_clf_classes_ordered:
    if remaining_test_df.empty:
        print(f"No samples left in the cascade, skipping {cat} and the following classes")
        break

    # Binarise what is left of the test set for this classifier: `cat` vs. 'Other'.
    print(f"Evaluation for {cat}")
    df_sentences_test_bin = remaining_test_df.reset_index()
    df_sentences_test_bin['y'] = np.where(df_sentences_test_bin['y'] == cat, cat, 'Other')

    test_loader, label_encoder_test = prepare_test_dataset_for_binclf(df_sentences_test_bin, batch_size)

    # Size the classification head from the checkpoint itself instead of a
    # hard-coded `num_labels=9`, which cannot load a binary checkpoint.
    # 'classifier.weight' is the head of BertForSequenceClassification.
    num_labels = best_model[cat]['classifier.weight'].shape[0]
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)

    # Load the state dictionary of the best model
    model.load_state_dict(best_model[cat])

    accuracy, class_labels, predicted_labels, true_labels, pred_confidence = evaluate_bin(model = model,
                                                                                test_loader = test_loader,
                                                                                label_encoder = label_encoder_test,
                                                                                device = device,
                                                                                accuracy_metric = load_metric("accuracy"))

    # NOTE(review): assumes the loader is not shuffled, i.e. prediction k belongs
    # to row k of `remaining_test_df` - confirm in prepare_test_dataset_for_binclf.
    claimed = np.asarray(predicted_labels) == cat
    confidence = np.asarray(pred_confidence)

    # Record one (true label, prediction, confidence) triple per claimed sample.
    # Previously every prediction of every stage was appended to `all_preds`
    # while only the claimed rows reached `all_true_labels`, so the two lists
    # had different lengths and could not be compared.
    all_true_labels.extend(remaining_test_df['y'].to_numpy()[claimed])
    all_preds.extend([cat] * int(claimed.sum()))
    all_pred_confidence.extend(confidence[claimed])

    # Positional filtering: unlike a label-based `drop`, this removes exactly the
    # claimed rows even if the index contains duplicates.
    remaining_test_df = remaining_test_df[~claimed]
    remaining_confidence = confidence[~claimed]
    print(f"Length test dataset after eval {cat}: {len(remaining_test_df)}")

    # Store data for each category in the respective dictionaries
    accuracy_data[cat] = accuracy
    class_labels_data[cat] = class_labels
    predicted_labels_data[cat] = predicted_labels
    true_labels_data[cat] = true_labels
    pred_confidence_data[cat] = pred_confidence

# Whatever no classifier claimed is predicted as 'Other'; its confidence is the
# one reported by the last classifier that saw it.
all_true_labels.extend(remaining_test_df['y'].to_numpy())
all_preds.extend(['Other'] * len(remaining_test_df))
all_pred_confidence.extend(remaining_confidence)

# Expose the cascade-level confidences under the name read by the report cell,
# so they line up with `all_preds` / `all_true_labels`.
pred_confidence = all_pred_confidence

In [None]:
# Debug output: raw true labels and predictions collected by the cascaded evaluation.
print(all_true_labels)
print(all_preds)

In [None]:
# Report for the cascaded classifier: classification report, confusion matrix
# and confidence histogram over the cascade-level predictions.

# Build the label list without mutating `cascaded_clf_classes_ordered`: appending
# in place added another 'Other' on every re-run and polluted the class list
# that the cascade loop iterates over.
class_labels = [label for label in cascaded_clf_classes_ordered if label != 'Other'] + ['Other']

true_labels = all_true_labels
predicted_labels = all_preds

# `labels` must be passed together with `target_names`: without it scikit-learn
# orders the rows by sorted label value and applies `target_names` by position,
# which attaches the wrong name to each row.
report = classification_report(true_labels, predicted_labels, labels=class_labels, target_names=class_labels, zero_division = 1)
print(report)

# Create confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Subplot 1: Confusion Matrix
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels, ax=axs[0])
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("True")
axs[0].set_title("Confusion Matrix")

# Subplot 2: Confidence scores of correct vs. incorrect predictions.
# Only drawn when `pred_confidence` holds one score per cascade prediction;
# otherwise the scores would be paired with the wrong samples (or raise).
if len(pred_confidence) == len(predicted_labels):
    is_correct = np.asarray(predicted_labels) == np.asarray(true_labels)
    confidence = np.asarray(pred_confidence)
    correct_confidence = confidence[is_correct]
    incorrect_confidence = confidence[~is_correct]
    axs[1].hist(correct_confidence, bins=50, color='green', alpha=0.7, label='Correct Predictions')
    axs[1].hist(incorrect_confidence, bins=50, color='red', alpha=0.7, label='Incorrect Predictions')
    axs[1].set_xlabel('Confidence score')
    axs[1].set_ylabel('Number of predictions')
    axs[1].set_title('Confidence score of predictions')
    axs[1].legend()
else:
    print(f"Confidence histogram skipped: {len(pred_confidence)} confidence scores for {len(predicted_labels)} predictions")
    fig.delaxes(axs[1])
plt.tight_layout()
plt.show()

---

# Pipeline self-learning for each category
- **Step 1:** 
    - option A : downsample the train dataset and create 8 datasets, one with binary labels for each class
    - option B : create the downsampled datasets (25/75 split, with an equal distribution of the other classes) for each class

- **Step 2:** Train with CV with downsampled datasets --> with optimal number of epochs (2-3 epochs)
    - compare the accuracy with that of the initial multiclass clf
    - Look at correlation between confidence score and accuracy


- **Step 3:** Plot learning curves for each class with good number of epochs and the corresponding dataset

### 1) Downsampling
- Create balanced datasets with downsampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Select the `topN` most frequent reflective categories of the training set.
# Work on a copy: `df_sentences = train_dataset` only created an alias, so the
# categorical cast below silently rewrote `train_dataset['y']` (labels outside
# the top N became NaN) for every later cell.
df_sentences = train_dataset.copy()

topN = 8
reflective_cat_in_order = list(df_sentences["y"].value_counts().sort_values(ascending=False).index)
topN_classes = reflective_cat_in_order[:topN]
print(f"\nList of reflective categories in order : {reflective_cat_in_order}")
print(f"List of topN selected categories for downsampling : {topN_classes}")

# Ordered categorical so that plots and counts follow the frequency order.
M_catType = CategoricalDtype(categories = topN_classes, ordered = True)
df_sentences['y'] = df_sentences['y'].astype(M_catType)

dict_categories_ordered = dict(df_sentences["y"].value_counts().sort_values(ascending=False))
df_sentences_topN_classes = df_sentences[df_sentences['y'].isin(topN_classes)]

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

def downsample_dataset(df, max_values, random_state=None):
    """Cap every class of `df['y']` at `max_values` samples and shuffle the result.

    Classes with fewer than `max_values` samples are kept unchanged. A histogram
    of the resulting label distribution is displayed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a label column named 'y'.
    max_values : int
        Maximum number of samples to keep per class.
    random_state : int or None, optional
        Seed used for both the under-sampling and the final shuffle. The default
        (None) keeps the previous non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        The downsampled, shuffled dataset.
    """
    # Target size per class. Labels with zero samples (e.g. unused categories of
    # a categorical column) are left out because the sampler rejects classes
    # that are not present in the data.
    target_counts = {
        label: min(int(count), max_values)
        for label, count in df["y"].value_counts().items()
        if count > 0
    }

    # Downsample to the maximum desired labels per class
    under_sampler = RandomUnderSampler(sampling_strategy=target_counts, random_state=random_state)
    df_downsampled, _ = under_sampler.fit_resample(df, df['y'])

    # Shuffle the rows
    df_downsampled = df_downsampled.sample(frac = 1, random_state=random_state)
    print(f"Length of the downsampled dataframe: {len(df_downsampled)}")

    sns.histplot(df_downsampled['y'])
    plt.xticks(rotation = 45)
    plt.title(f"Dataset downsampled - cut {max_values} samples")
    plt.show()

    return df_downsampled

In [None]:
# Training set with every class capped at 400 samples.
train_dataset_downsampled = downsample_dataset(train_dataset, 400)

In [None]:
# Test set with every class capped at 100 samples.
# NOTE(review): the same call is repeated in a later cell, which overwrites this result.
test_dataset_downsampled = downsample_dataset(original_test_dataset, 100)

In [None]:
# `merged_df` with every class capped at 400 samples.
dataset_downsampled = downsample_dataset(merged_df, 400)

In [None]:
#fig, axs = plt.subplots(2, 4, figsize = (15,6), sharey = True)
#for i, label in enumerate(topN_classes):
#    #print(label)
#    ax = axs[i//4, i%4]
#    sns.histplot(balanced_datasets[label]['y'], ax = ax)
#    ax.patches[topN_classes.index(label)].set_facecolor('red')
#    ax.set_title(f"Dataset {label}, {len(balanced_datasets[label])}")
#    ax.tick_params(axis='x',labelrotation = 45)
#plt.tight_layout()
#plt.show()

### 2) Train multiple binary CLF with the downsampled datasets

In [None]:
# Categories of the downsampled training set, most frequent first.
class_counts_desc = train_dataset_downsampled["y"].value_counts().sort_values(ascending=False)
reflective_cat_in_order = list(class_counts_desc.index)
reflective_cat_in_order_wo_other = list(filter(lambda label: label != 'Other', reflective_cat_in_order))

# NOTE(review): only the single most frequent category is kept here, while the
# plotting cells below lay out a 2x4 grid - confirm the `[:1]` is intended.
topN_classes = reflective_cat_in_order_wo_other[:1]
reflective_categories = topN_classes
print(f"\ntopN reflective categories without other: {reflective_categories}")

# Test set with every class capped at 100 samples.
test_dataset_downsampled = downsample_dataset(original_test_dataset, 100)

In [None]:
%%time
print("After downsampling:")
print(f"Length of train dataset: {len(train_dataset_downsampled)}")
print(f"Length of test dataset: {len(test_dataset_downsampled)}")
# Set hyperparameters
batch_size = 8
epochs = 4
learning_rate = 2e-5
n_splits = 2

dfs_train_loss = {}
dfs_train_acc = {}
dfs_train_balanced_acc = {}
dfs_val_loss = {}
dfs_val_acc = {}
dfs_val_balanced_acc = {}
best_model = {}


print(f"\n\nLaunching {n_splits}-fold CV per class with : {reflective_categories}")

for i, cat in enumerate(reflective_categories):
    print(f"\n\nCV for {cat}")
    train_loss_list = []
    train_acc_list = []
    train_balanced_acc_list = []
    val_loss_list = []
    val_acc_list = []
    val_balanced_acc_list = []
    df_sentences_bin = train_dataset_downsampled.iloc[:1000].copy()
    df_sentences_bin['y'] = np.where(df_sentences_bin['y'] == cat, cat, 'Other')
    
    train_loss_list, val_loss_list, train_acc_list, train_balanced_acc_list, val_acc_list, val_balanced_acc_list, label_encoder_test, _, best_model[cat] = cross_validate_bin(df_sentences_train=df_sentences_bin,
                                                                                                df_sentences_test = test_dataset_downsampled,
                                                                                                freeze_weights=False, 
                                                                                                batch_size=batch_size, 
                                                                                                epochs=epochs, 
                                                                                                learning_rate=learning_rate,
                                                                                                n_splits=n_splits,
                                                                                                train_loss_list = train_loss_list,
                                                                                                train_acc_list = train_acc_list,
                                                                                                train_balanced_acc_list = train_balanced_acc_list,
                                                                                                val_loss_list = val_loss_list,
                                                                                                val_acc_list = val_acc_list,
                                                                                                val_balanced_acc_list = val_balanced_acc_list,
                                                                                                device = device)

    # loss on training set
    train_loss_array = np.array(train_loss_list).reshape(n_splits,epochs)
    mean_train_loss = np.mean(train_loss_array, axis=0)
    ci_train_loss = np.percentile(train_loss_array, [2.5, 97.5], axis=0)

    # loss on validation set
    val_loss_array = np.array(val_loss_list).reshape(n_splits,epochs)
    mean_val_loss = np.mean(val_loss_array, axis=0)
    ci_val_loss = np.percentile(val_loss_array, [2.5, 97.5], axis=0)
    
    # accuracy on training set
    train_acc_array = np.array(train_acc_list).reshape(n_splits,epochs)
    mean_train_acc = np.mean(train_acc_array, axis=0)
    ci_train_acc = np.percentile(train_acc_array, [2.5, 97.5], axis=0)
        
    # accuracy on validation set
    val_acc_array = np.array(val_acc_list).reshape(n_splits,epochs)
    mean_val_acc = np.mean(val_acc_array, axis=0)
    ci_val_acc = np.percentile(val_acc_array, [2.5, 97.5], axis=0)
    
    # Balanced accuracy on train set
    train_balanced_acc_array = np.array(train_balanced_acc_list).reshape(n_splits,epochs)
    mean_train_balanced_acc = np.mean(train_balanced_acc_array, axis=0)
    ci_train_balanced_acc = np.percentile(train_balanced_acc_array, [2.5, 97.5], axis=0)

    # Balanced accuracy on validation set
    val_balanced_acc_array = np.array(val_balanced_acc_list).reshape(n_splits,epochs)
    mean_val_balanced_acc = np.mean(val_balanced_acc_array, axis=0)
    ci_val_balanced_acc = np.percentile(val_balanced_acc_array, [2.5, 97.5], axis=0)

    # Create a DataFrame for Seaborn
    df_train_loss = pd.DataFrame({
            'Epochs': np.arange(epochs),
            'Mean_Train_Loss': mean_train_loss,
            'Lower_CI': ci_train_loss[0],
            'Upper_CI': ci_train_loss[1]})

    df_val_loss = pd.DataFrame({
            'Epochs': np.arange(epochs),
            'Mean_Val_Loss': mean_val_loss,
            'Lower_CI': ci_val_loss[0],
            'Upper_CI': ci_val_loss[1]})
        
    df_train_acc = pd.DataFrame({
            'Epochs': np.arange(epochs),
            'Mean_Train_Acc': mean_train_acc,
            'Lower_CI': ci_train_acc[0],
            'Upper_CI': ci_train_acc[1]})
        
    df_val_acc = pd.DataFrame({
            'Epochs': np.arange(epochs),
            'Mean_Val_Acc': mean_val_acc,
            'Lower_CI': ci_val_acc[0],
            'Upper_CI': ci_val_acc[1]})

    df_train_balanced_acc = pd.DataFrame({
        'Epochs': np.arange(epochs), 
        'Mean_Train_Balanced_Acc': mean_train_balanced_acc,
        'Lower_CI': ci_train_balanced_acc[0],
        'Upper_CI': ci_train_balanced_acc[1]})

    df_val_balanced_acc = pd.DataFrame({
        'Epochs': np.arange(epochs),
        'Mean_Val_Balanced_Acc': mean_val_balanced_acc,
        'Lower_CI': ci_val_balanced_acc[0],
        'Upper_CI': ci_val_balanced_acc[1]})
                                            
    dfs_train_loss[f'{cat}'] = df_train_loss
    dfs_val_loss[f'{cat}'] = df_val_loss
    dfs_train_acc[f'{cat}'] = df_train_acc 
    dfs_val_acc[f'{cat}'] = df_val_acc  
    dfs_train_balanced_acc[f'{cat}'] = df_train_balanced_acc 
    dfs_val_balanced_acc[f'{cat}'] = df_val_balanced_acc  

In [None]:
def _plot_cv_curves(categories, dfs_train, dfs_val, train_col, val_col, metric_name):
    """Plot train/validation curves of one CV metric, one panel per category.

    `dfs_train` / `dfs_val` map a category to a frame with the columns 'Epochs',
    'Lower_CI', 'Upper_CI' and the mean column named by `train_col` / `val_col`.
    `metric_name` is used as y-axis label and in the figure title. The title also
    reads the notebook-level `n_splits`, `batch_size`, `learning_rate` and `epochs`.
    """
    # 4 panels per row and never fewer than 2 rows: same (15, 6) layout as before
    # for up to 8 categories, and a taller figure instead of an IndexError beyond that.
    n_cols = 4
    n_rows = max(2, -(-len(categories) // n_cols))  # ceil division
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), sharey = True)

    for ax, cat in zip(axs.ravel(), categories):
        df_train = dfs_train[cat]
        df_val = dfs_val[cat]

        # Mean lineplot
        sns.pointplot(data=df_train, x='Epochs', y=train_col, color='blue', ax = ax)
        sns.pointplot(data=df_val, x='Epochs', y=val_col, color='orange', ax = ax)

        # Fill between the confidence interval
        ax.fill_between(df_train['Epochs'], df_train['Lower_CI'], df_train['Upper_CI'], color='blue', alpha=0.3, label = 'train')
        ax.fill_between(df_val['Epochs'], df_val['Lower_CI'], df_val['Upper_CI'], color='orange', alpha=0.3, label = 'val')

        ax.set_xlabel('Epochs')
        ax.set_ylabel(metric_name)
        ax.set_title(f"{cat}")
        ax.legend()

    # Remove only the panels that were left empty (the former hard-coded
    # `fig.delaxes(axs[1,3])` was only correct for exactly 7 categories).
    for unused_ax in axs.ravel()[len(categories):]:
        fig.delaxes(unused_ax)
    plt.suptitle(f"{n_splits}-fold CV {metric_name} with 95% CI per class with bs : {batch_size}, lr : {learning_rate}, {epochs} epochs")
    plt.tight_layout()
    plt.show()


# Same figure for the three tracked metrics.
_plot_cv_curves(reflective_categories, dfs_train_loss, dfs_val_loss, 'Mean_Train_Loss', 'Mean_Val_Loss', "Loss")
_plot_cv_curves(reflective_categories, dfs_train_acc, dfs_val_acc, 'Mean_Train_Acc', 'Mean_Val_Acc', "Accuracy")
_plot_cv_curves(reflective_categories, dfs_train_balanced_acc, dfs_val_balanced_acc, 'Mean_Train_Balanced_Acc', 'Mean_Val_Balanced_Acc', "Balanced Accuracy")

## Evaluate after HP search 3 epochs, bs 8

In [None]:
# Train one binary classifier (category vs. 'Other') per reflective category on
# the full downsampled training set and evaluate it on the downsampled test set.
accuracy_data = {}
class_labels_data = {}
predicted_labels_data = {}
true_labels_data = {}
pred_confidence_data = {}

# Define training parameters
batch_size, epochs, learning_rate = 8, 3, 2e-5

for i, cat in enumerate(reflective_categories):
    print(f"\nTraining for {cat}")
    # Binarise the labels of both splits for this category.
    train_dataset_bin = train_dataset_downsampled.copy()
    train_dataset_bin['y'] = np.where(train_dataset_bin['y'] == cat, cat, 'Other')
    test_dataset_bin = test_dataset_downsampled.copy()
    test_dataset_bin['y'] = np.where(test_dataset_bin['y'] == cat, cat, 'Other')

    train_dataset_pp, test_dataset_pp, label_encoder_test = preprocess_data_train_test(train_dataset_bin, test_dataset_bin)

    # Initialize the pre-trained BERT model
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

    # Create data loaders
    train_loader = DataLoader(train_dataset_pp, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset_pp, batch_size=batch_size)

    # Set up optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

    train_losses, avg_acc_per_epoch = train(model, train_loader, epochs, optimizer, scheduler, device, False)

    print(f"Evaluation for {cat}")
    accuracy, class_labels, predicted_labels, true_labels, pred_confidence = evaluate_bin(model = model,
                                                                                test_loader = test_loader,
                                                                                label_encoder = label_encoder_test,
                                                                                device = device,
                                                                                accuracy_metric = load_metric("accuracy"))

    # Store data for each category in the respective dictionaries
    accuracy_data[cat] = accuracy
    class_labels_data[cat] = class_labels
    predicted_labels_data[cat] = predicted_labels
    true_labels_data[cat] = true_labels
    pred_confidence_data[cat] = pred_confidence

print(f"\nOverall accuracy of multiple binary clf : {np.mean(list(accuracy_data.values())).round(4)}")
# Weighted average accuracy. The weights are the class sizes in the test set the
# models were actually evaluated on (`test_dataset_downsampled`); taking them
# from `original_test_dataset` weighted the accuracies by a different distribution.
weights = [len(test_dataset_downsampled[test_dataset_downsampled['y'] == cat]) for cat in reflective_categories]
# Calculate the weighted average
weighted_avg = np.round(sum(w * v for w, v in zip(weights, accuracy_data.values())) / sum(weights),4)
print(f"\nWeights : {weights}")
print(f"Weighted accuracy of multiple binary clf : {weighted_avg}\n\n")

# One column per category, one row per test sample.
predicted_labels_df = pd.DataFrame()
true_labels_df = pd.DataFrame()
pred_confidence_df = pd.DataFrame()
for cat in reflective_categories:
    predicted_labels_df[cat] = predicted_labels_data[cat]
    true_labels_df[cat] = true_labels_data[cat]
    pred_confidence_df[cat] = pred_confidence_data[cat]

In [None]:
# Debug output: class labels reported by the evaluation of the 'Feeling' classifier.
# NOTE(review): 'Feeling' is hard-coded; this raises KeyError when that category
# was not evaluated (check `reflective_categories`).
print(class_labels_data['Feeling'])

In [None]:
# Per-category diagnostics on the test set: one confusion matrix and one
# confidence histogram per binary classifier in `reflective_categories`.
print(reflective_categories)

# 4 panels per row and never fewer than 2 rows, so the figure keeps its (15, 6)
# layout for up to 8 categories and grows (instead of raising IndexError) beyond that.
n_cols = 4
n_rows = max(2, -(-len(reflective_categories) // n_cols))  # ceil division

# --- Figure 1: confusion matrices ---
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))
for ax, cat in zip(axs.ravel(), reflective_categories):
    cm = confusion_matrix(true_labels_df[cat].values, predicted_labels_df[cat].values, labels=class_labels_data[cat])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels_data[cat], yticklabels=class_labels_data[cat], ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")

# Remove exactly the panels that were not drawn on. The previous hard-coded
# `fig.delaxes(axs[1, 3])` was only right for 7 categories: with fewer it left
# blank panels behind, with 8 it deleted a populated one.
for unused_ax in axs.ravel()[len(reflective_categories):]:
    fig.delaxes(unused_ax)
plt.suptitle("Confusion matrices per class on test dataset")
plt.tight_layout()
plt.show()

# --- Figure 2: confidence of correct vs. incorrect predictions ---
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), sharex = True, sharey = True)
for ax, cat in zip(axs.ravel(), reflective_categories):
    # Element-wise comparison by position; the three frames hold one row per test sample.
    is_correct = predicted_labels_df[cat].to_numpy() == true_labels_df[cat].to_numpy()
    confidence = pred_confidence_df[cat].to_numpy()
    correct_confidence = confidence[is_correct]
    incorrect_confidence = confidence[~is_correct]

    ax.hist(correct_confidence, bins=50, color='green', alpha=0.7, label='Correct Predictions')
    ax.hist(incorrect_confidence, bins=50, color='red', alpha=0.7, label='Incorrect Predictions')
    ax.set_xlabel('Confidence score')
    ax.set_ylabel('Number of predictions')
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")
    ax.legend()

for unused_ax in axs.ravel()[len(reflective_categories):]:
    fig.delaxes(unused_ax)
plt.suptitle("Confidence scores per class on test dataset")
plt.tight_layout()
plt.show()

**Evaluate**

In [None]:
# Evaluate the best cross-validated binary model of every category on the
# downsampled test set and collect the per-category outputs.
print(f"Length of test data : {len(test_dataset_downsampled)}")
sns.histplot(test_dataset_downsampled['y'])
plt.title('Test Dataset')
plt.tick_params(axis='x',labelrotation = 45)
plt.tight_layout()
plt.show()

print(reflective_categories)
accuracy_data, class_labels_data = {}, {}
predicted_labels_data, true_labels_data = {}, {}
pred_confidence_data = {}

for i, cat in enumerate(reflective_categories):
    print(f"Evaluation for {cat}")

    # Each model was trained as a binary classifier, so the test labels are
    # collapsed to `cat` vs. 'Other' before building the loader.
    df_sentences_test_bin = test_dataset_downsampled.copy()
    is_current_category = df_sentences_test_bin['y'] == cat
    df_sentences_test_bin['y'] = np.where(is_current_category, cat, 'Other')

    test_loader, label_encoder_test = prepare_test_dataset_for_binclf(df_sentences_test_bin, batch_size)

    # Binary head (2 labels), then restore the weights of the best CV model.
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
    model.load_state_dict(best_model[cat])

    accuracy, class_labels, predicted_labels, true_labels, pred_confidence = evaluate_bin(
        model = model,
        test_loader = test_loader,
        label_encoder = label_encoder_test,
        device = device,
        accuracy_metric = load_metric("accuracy"))

    # Keep the outputs of this category.
    accuracy_data[cat] = accuracy
    class_labels_data[cat] = class_labels
    predicted_labels_data[cat] = predicted_labels
    true_labels_data[cat] = true_labels
    pred_confidence_data[cat] = pred_confidence

print(f"\nOverall accuracy of multiple binary clf : {np.mean(list(accuracy_data.values())).round(4)}")

# Accuracy averaged with the number of test samples of each category as weight.
weights = [len(test_dataset_downsampled.loc[test_dataset_downsampled['y'] == cat]) for cat in reflective_categories]
weighted_sum = sum(w * v for w, v in zip(weights, accuracy_data.values()))
weighted_avg = np.round(weighted_sum / sum(weights),4)
print(f"\nWeights : {weights}")
print(f"Weighted accuracy of multiple binary clf : {weighted_avg}\n\n")

# One column per category, one row per test sample.
predicted_labels_df = pd.DataFrame()
true_labels_df = pd.DataFrame()
pred_confidence_df = pd.DataFrame()
for cat in reflective_categories:
    predicted_labels_df[cat] = predicted_labels_data[cat]
    true_labels_df[cat] = true_labels_data[cat]
    pred_confidence_df[cat] = pred_confidence_data[cat]

In [None]:
# Per-category diagnostics on the test set: one confusion matrix and one
# confidence histogram per binary classifier in `reflective_categories`.
print(reflective_categories)

# 4 panels per row and never fewer than 2 rows, so the figure keeps its (15, 6)
# layout for up to 8 categories and grows (instead of raising IndexError) beyond that.
n_cols = 4
n_rows = max(2, -(-len(reflective_categories) // n_cols))  # ceil division

# --- Figure 1: confusion matrices ---
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))
for ax, cat in zip(axs.ravel(), reflective_categories):
    cm = confusion_matrix(true_labels_df[cat].values, predicted_labels_df[cat].values, labels=class_labels_data[cat])
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels_data[cat], yticklabels=class_labels_data[cat], ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")

# Remove exactly the panels that were not drawn on. The previous hard-coded
# `fig.delaxes(axs[1, 3])` was only right for 7 categories: with fewer it left
# blank panels behind, with 8 it deleted a populated one.
for unused_ax in axs.ravel()[len(reflective_categories):]:
    fig.delaxes(unused_ax)
plt.suptitle("Confusion matrices per class on test dataset")
plt.tight_layout()
plt.show()

# --- Figure 2: confidence of correct vs. incorrect predictions ---
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), sharex = True, sharey = True)
for ax, cat in zip(axs.ravel(), reflective_categories):
    # Element-wise comparison by position; the three frames hold one row per test sample.
    is_correct = predicted_labels_df[cat].to_numpy() == true_labels_df[cat].to_numpy()
    confidence = pred_confidence_df[cat].to_numpy()
    correct_confidence = confidence[is_correct]
    incorrect_confidence = confidence[~is_correct]

    ax.hist(correct_confidence, bins=50, color='green', alpha=0.7, label='Correct Predictions')
    ax.hist(incorrect_confidence, bins=50, color='red', alpha=0.7, label='Incorrect Predictions')
    ax.set_xlabel('Confidence score')
    ax.set_ylabel('Number of predictions')
    ax.set_title(f"{cat}, acc {np.round(accuracy_data[cat],3)}")
    ax.legend()

for unused_ax in axs.ravel()[len(reflective_categories):]:
    fig.delaxes(unused_ax)
plt.suptitle("Confidence scores per class on test dataset")
plt.tight_layout()
plt.show()

### Further evaluation : weighted accuracy, F1, AUC

In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score

# Per-class accuracy, balanced accuracy, F1 and AUC of the binary classifiers,
# followed by their unweighted average across classes.
sum_accuracy = 0.0
sum_weighted_accuracy = 0.0
sum_f1 = 0.0
sum_auc = 0.0
count_classes = 0
# AUC is undefined when only one class is present in the ground truth, so it
# needs its own denominator; dividing by `count_classes` would bias the average
# towards zero whenever a class is skipped.
count_auc_classes = 0

for cat in reflective_categories:
    predicted_labels = predicted_labels_df[cat].values
    true_labels = true_labels_df[cat].values

    # Fit the encoder on every label that can occur (ground truth, predictions and
    # the positive class itself). Fitting on the ground truth only makes
    # `transform` raise as soon as a prediction or `cat` is absent from it.
    label_encoder = LabelEncoder()
    label_encoder.fit(list(true_labels) + list(predicted_labels) + [cat])
    true_labels_encoded = label_encoder.transform(true_labels)
    predicted_labels_encoded = label_encoder.transform(predicted_labels)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
    f1_scores = f1_score(true_labels_encoded, predicted_labels_encoded, pos_label=label_encoder.transform([cat])[0])
    # NOTE(review): AUC is computed from hard predicted labels, not from the
    # confidence scores, so it carries no ranking information beyond the labels.
    auc = roc_auc_score(true_labels_encoded, predicted_labels_encoded) if len(set(true_labels_encoded)) > 1 else None

    # Accumulate for the averages
    sum_accuracy += accuracy
    sum_weighted_accuracy += balanced_accuracy
    sum_f1 += f1_scores
    if auc is not None:
        sum_auc += auc
        count_auc_classes += 1
    count_classes += 1

    # Print metrics for the current class
    print(f"Overall accuracy for class '{cat}' = {accuracy:.4f}")
    print(f"Balanced accuracy for class '{cat}' = {balanced_accuracy:.4f}")
    print(f"F1 score for class '{cat}' = {f1_scores:.4f}")
    if auc is not None:
        print(f"AUC for class '{cat}' = {auc:.4f}")
    else:
        print(f"AUC for class '{cat}' cannot be computed due to insufficient data")
    print("--------------------------------------")

# Calculate averages
avg_accuracy = sum_accuracy / count_classes if count_classes > 0 else 0.0
avg_weighted_accuracy = sum_weighted_accuracy / count_classes if count_classes > 0 else 0.0
avg_f1 = sum_f1 / count_classes if count_classes > 0 else 0.0
avg_auc = sum_auc / count_auc_classes if count_auc_classes > 0 else None

# Print average metrics
print("#########################################")
print("############## AVERAGE ##################")
print("#########################################")
print(f"Average accuracy across classes = {avg_accuracy:.4f}")
print(f"Average weighted accuracy across classes = {avg_weighted_accuracy:.4f}")
print(f"Average F1 score across classes = {avg_f1:.4f}")
if avg_auc is not None:
    print(f"Average AUC across classes = {avg_auc:.4f}")
else:
    print("Average AUC across classes cannot be computed due to insufficient data")


### 3) Learning curves for each class

Functions:
- preprocess_data_for_LC
- prepare_model_for_LC
- cv_for_LearningCurves_bin

In [None]:
def preprocess_data_for_LC(df_sentences_train, df_sentences_test):
    """Tokenise sentences and encode labels for one learning-curve run.

    Parameters
    ----------
    df_sentences_train, df_sentences_test : pandas.DataFrame
        Frames with a 'sentence' column (raw text) and a 'y' column (class label).

    Returns
    -------
    train_dataset, test_dataset : torch.utils.data.Dataset
        Each item is a dict with a 'text' tensor (token ids padded to the longest
        sequence of its split) and a 'label' tensor, both of dtype torch.long.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Encoder fitted on the labels of both splits, to decode predictions.
    """
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def _encode_and_pad(sentences):
        # Tokenise first and pad afterwards: the padded length has to be measured in
        # wordpiece tokens (special tokens included). Measuring it in whitespace-separated
        # words, as was done before, truncated the longest sentences.
        # truncation=True without max_length caps sequences at the model's maximum input size.
        token_ids = [tokenizer.encode(sentence, add_special_tokens=True, truncation=True)
                     for sentence in sentences]
        longest = max((len(ids) for ids in token_ids), default=0)
        return [ids + [tokenizer.pad_token_id] * (longest - len(ids)) for ids in token_ids]

    train_text = _encode_and_pad(df_sentences_train['sentence'])
    test_text = _encode_and_pad(df_sentences_test['sentence'])

    # A single encoder for both splits guarantees that a class gets the same integer in
    # train and test, even when a small training subset does not contain every class.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([df_sentences_train['y'], df_sentences_test['y']]))
    train_labels = label_encoder.transform(df_sentences_train['y'])
    test_labels = label_encoder.transform(df_sentences_test['y'])  # in output for evaluation

    # Minimal map-style dataset yielding the tensors expected by the training loop
    class CustomDataset(Dataset):
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
                'text': torch.tensor(self.text[idx], dtype=torch.long),
                'label': torch.tensor(self.label[idx], dtype=torch.long)
                }

    train_dataset = CustomDataset(train_text, train_labels)
    test_dataset = CustomDataset(test_text, test_labels)

    return train_dataset, test_dataset, label_encoder

def prepare_model_for_LC(model, train_dataset, test_dataset, freeze_weights, batch_size, epochs, learning_rate):
    """Build the data loaders, optimizer and LR scheduler for one training run.

    When `freeze_weights` is true, every parameter of `model` is frozen except
    those of `model.classifier`.

    Returns
    -------
    train_loader, test_loader, optimizer, scheduler
        The training loader is shuffled, the test loader is not. The scheduler
        is a linear schedule without warm-up spanning `epochs` passes over the
        training loader.
    """
    if freeze_weights:
        # Keep only the classification head trainable
        model.requires_grad_(False)
        model.classifier.requires_grad_(True)

    # Data loaders
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Optimizer and scheduler (one scheduler step per training batch)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_training_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_training_steps,
    )

    return train_loader, test_loader, optimizer, scheduler

def cv_for_LearningCurves_bin(df_sentences, freeze_weights, batch_size, epochs, learning_rate, predictions_list, true_labels_list, train_loss_list, train_acc_list, train_balanced_acc_list, val_loss_list, val_acc_list, val_balanced_acc_list, training_examples, N_shuffle, reflective_category):
    """Collect learning-curve points for one reflective category.

    For every seed in ``range(N_shuffle)`` the sentences are reshuffled, and for
    every size in `training_examples` a fresh BERT binary classifier is trained.
    The last-epoch train/validation loss and accuracy of each run are appended
    to the lists passed in, so each list gains exactly one value per
    (seed, size) pair.

    `predictions_list`, `true_labels_list`, `train_balanced_acc_list` and
    `val_balanced_acc_list` are accepted but not used here.

    Returns
    -------
    train_loss_list, train_acc_list, val_loss_list, val_acc_list
        The same list objects that were passed in, after being extended.
    """
    print(f"\n\nReflective category : {reflective_category}")
    for seed in range(N_shuffle):
        print(f"Shuffle {seed}")
        # shuffle with a different seed each time
        df_shuffled = df_sentences.sample(frac = 1, random_state = seed)
        for nb_train_ex in training_examples:
            print(f"Train with {nb_train_ex } examples:")
            train_dataset, label_encoder, val_dataset = preprocess_data_for_LearningCurves_bin(df_shuffled, nb_train_ex, reflective_category)

            # A fresh model per size, so that the points of the curve are independent
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
            train_loader, optimizer, scheduler, val_loader = prepare_model_for_LearningCurves(model, train_dataset, val_dataset, freeze_weights, batch_size, epochs, learning_rate)
            # NOTE(review): assumes train_test returns six per-epoch lists -- confirm against its definition
            train_loss_loop, val_loss_loop, train_acc_loop, val_acc_loop,  _, _ = train_test(model, train_loader, val_loader, epochs, optimizer, scheduler, device)
            print(train_loss_loop)
            # Keep only the value from the last epoch of every metric. train_acc_list used to be
            # extended with all epochs, leaving it longer than the three other lists.
            train_loss_list.append(train_loss_loop[-1])
            val_loss_list.append(val_loss_loop[-1])
            train_acc_list.append(train_acc_loop[-1])
            val_acc_list.append(val_acc_loop[-1])

    return train_loss_list, train_acc_list, val_loss_list, val_acc_list


In [None]:
def prepare_dataset_for_binclf(df_sentences, batch_size):
    """Tokenise a sentence frame and wrap it in a shuffled DataLoader.

    Parameters
    ----------
    df_sentences : pandas.DataFrame
        Frame with a 'sentence' column (raw text) and a 'y' column (class label).
    batch_size : int
        Batch size of the returned loader.

    Returns
    -------
    dataloader : torch.utils.data.DataLoader
        Shuffled loader whose batches are dicts with 'text' (padded token ids)
        and 'label' tensors of dtype torch.long.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Encoder fitted on ``df_sentences['y']``.
    """
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Tokenise first and pad afterwards: the padded length has to be measured in wordpiece
    # tokens (special tokens included). Measuring it in whitespace-separated words, as was
    # done before, truncated the longest sentences.
    # truncation=True without max_length caps sequences at the model's maximum input size.
    token_ids = [tokenizer.encode(sentence, add_special_tokens=True, truncation=True)
                 for sentence in df_sentences['sentence']]
    longest = max((len(ids) for ids in token_ids), default=0)
    padded_text = [ids + [tokenizer.pad_token_id] * (longest - len(ids)) for ids in token_ids]

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df_sentences['y'])

    # Minimal map-style dataset yielding the tensors expected by the training loop
    class CustomDataset(Dataset):
        def __init__(self, text, label):
            self.text = text
            self.label = label
        def __len__(self):
            return len(self.text)
        def __getitem__(self, idx):
            return {
                'text': torch.tensor(self.text[idx], dtype=torch.long),
                'label': torch.tensor(self.label[idx], dtype=torch.long)
            }

    dataset = CustomDataset(padded_text, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle = True)

    return dataloader, label_encoder

def train_test_for_LC(model, train_loader, val_loader, epochs, optimizer, scheduler, device):
    """Fine-tune `model` and evaluate it on `val_loader` after every epoch.

    Parameters
    ----------
    model
        Sequence-classification model called as ``model(inputs, labels=labels)``
        whose output exposes ``.loss`` and ``.logits``.
    train_loader, val_loader
        Loaders yielding dicts with 'text' and 'label' tensors.
    epochs : int
        Number of passes over `train_loader`.
    optimizer, scheduler
        Both are stepped once per training batch.
    device
        Device the batches are moved to.

    Returns
    -------
    train_losses, val_losses : list of float
        Mean batch loss per epoch.
    avg_train_acc_per_epoch, avg_val_acc_per_epoch : list of float
        Accuracy per epoch.
    train_confidence_scores, val_confidence_scores : list of float
        Mean of the highest softmax probability of each sample, per epoch.
    """
    accuracy_metric = load_metric("accuracy")
    train_losses = []
    val_losses = []
    avg_train_acc_per_epoch = []
    avg_val_acc_per_epoch = []
    train_confidence_scores = []  # Store confidence scores for train set
    val_confidence_scores = []    # Store confidence scores for validation set

    for epoch in range(epochs):
        model.train()
        train_loss = []
        all_preds_train = []
        all_labels_train = []
        train_confidence = []

        # Iterate over the loader received as argument. The loop used to read a
        # `subset_train_loader` that only exists as a notebook global, which either raised
        # a NameError or silently trained on whatever loader a previous cell had left behind.
        for batch in tqdm(train_loader, position = 0, desc= f"epoch {epoch} running..."):
            optimizer.zero_grad()
            inputs = batch['text'].to(device)
            labels = batch['label'].to(device)
            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            train_loss.append(loss.item())
            predictions_train = torch.argmax(outputs.logits, axis=1)
            all_preds_train.extend(predictions_train.cpu().numpy().tolist())
            all_labels_train.extend(labels.tolist())
            # Confidence of a prediction = probability assigned to the predicted class
            probabilities = torch.softmax(outputs.logits, dim=1)
            train_confidence.extend(probabilities.max(dim=1).values.cpu().detach().numpy())
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_epoch_loss_train = sum(train_loss) / len(train_loss)
        train_losses.append(avg_epoch_loss_train)
        avg_train_acc_per_epoch.append(accuracy_metric.compute(predictions=all_preds_train, references=all_labels_train)["accuracy"])
        train_confidence_scores.append(np.mean(train_confidence))

        # Validation loop
        model.eval()
        val_loss = []
        all_preds_val = []
        all_labels_val = []
        val_confidence = []

        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['text'].to(device)
                labels = batch['label'].to(device)
                outputs = model(inputs, labels=labels)
                loss_val = outputs.loss
                val_loss.append(loss_val.item())
                predictions_val = torch.argmax(outputs.logits, axis=1)
                all_preds_val.extend(predictions_val.cpu().numpy().tolist())
                all_labels_val.extend(labels.tolist())
                probabilities = torch.softmax(outputs.logits, dim=1)
                val_confidence.extend(probabilities.max(dim=1).values.cpu().detach().numpy())

        avg_epoch_loss_val = sum(val_loss) / len(val_loss)
        val_losses.append(avg_epoch_loss_val)
        avg_val_acc_per_epoch.append(accuracy_metric.compute(predictions=all_preds_val, references=all_labels_val)["accuracy"])
        val_confidence_scores.append(np.mean(val_confidence))

    return train_losses, val_losses, avg_train_acc_per_epoch, avg_val_acc_per_epoch, train_confidence_scores, val_confidence_scores

def downsample_dataset(df, max_values, random_state=None):
    """Cap every class of `df` at `max_values` rows and shuffle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'y' column holding the class label.
    max_values : int
        Maximum number of rows kept per class; smaller classes are kept whole.
    random_state : int or None, optional
        Seed used for both the under-sampling and the final shuffle. The default
        (None) keeps the previous, non-reproducible behaviour.

    Returns
    -------
    pandas.DataFrame
        The down-sampled, shuffled frame. A histogram of its labels is also
        displayed and its length printed.
    """
    # Target size per class: its current size, capped at max_values
    target_counts = {
        label: int(min(count, max_values))
        for label, count in df["y"].value_counts().items()
    }

    # Downsample to the maximum desired labels per class
    under_sampler = RandomUnderSampler(sampling_strategy=target_counts, random_state=random_state)
    df_downsampled, _ = under_sampler.fit_resample(df, df['y'])

    # Shuffle the rows of the resampled frame
    df_downsampled = df_downsampled.sample(frac = 1, random_state=random_state)
    print(f"Length of the downsampled dataframe: {len(df_downsampled)}")

    sns.histplot(df_downsampled['y'])
    plt.xticks(rotation = 45)
    plt.title("Dataset downsampled")
    plt.show()

    return df_downsampled


In [None]:
# Cap every class of the training data at 400 sentences.
# NOTE(review): train_dataset comes from an earlier cell; downsample_dataset reads its 'y'
# column, so it is presumably the training DataFrame -- confirm.
print(f"Length train dataset: {len(train_dataset)}")
train_dataset_downsampled = downsample_dataset(train_dataset, 400)
#print(f"Length train dataset: {len(train_dataset)}")

In [None]:
# Balanced dataset used by the learning-curve experiment below: at most 400 sentences per class.
dataset_downsampled = downsample_dataset(merged_df, 400)

### By splitting the dataframe and re-processing the sentences each time

In [None]:
%%time

# Set hyperparameters
batch_size = 8
epochs = 2
learning_rate = 2e-5
freeze_weights = False

# Learning-curve grid: training-set sizes, repetitions per size, one-vs-rest target classes
training_examples = [200, 400, 600, 800]
N_shuffle_total = 2
reflective_categories = ['Intention', 'Perspective']
print(reflective_categories)
print(f"Number of sentences in balanced dataset: {len(dataset_downsampled)}")
print(f"Length of train set: {int(len(dataset_downsampled) * 0.8)}")
print(f"Length of test set: {int(len(dataset_downsampled) * 0.2)}")

# One summary DataFrame per category, one dictionary per metric
dfs_train = {}
dfs_val = {}
dfs_train_acc = {}
dfs_train_balanced_acc = {}
dfs_val_acc = {}
dfs_val_balanced_acc = {}
dfs_test_confidence = {}


def _summarise_runs(run_values, mean_column):
    """Summarise one metric over the shuffles, for every training-set size.

    `run_values` is the flat list filled by the loops below (shuffle-major, one
    value per training size). The result has one row per training size with the
    mean over shuffles (`mean_column`) and the 2.5th / 97.5th percentiles
    ('Lower_CI', 'Upper_CI'). 'N_training_examples' holds the position of the
    size in `training_examples`, not the size itself.
    """
    runs = np.array(run_values).reshape(N_shuffle_total, len(training_examples))
    lower_ci, upper_ci = np.percentile(runs, [2.5, 97.5], axis=0)
    return pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        mean_column: np.mean(runs, axis=0),
        'Lower_CI': lower_ci,
        'Upper_CI': upper_ci})


for cat in reflective_categories:
    print(f"\n\nStarting Learning curves for category : {cat}")
    # Last-epoch value of every metric, one entry per (shuffle, training size)
    train_loss_list = []
    train_acc_list = []
    train_balanced_acc_list = []
    val_loss_list = []
    val_acc_list = []
    val_balanced_acc_list = []
    test_confidence_score_list = []

    # Prepare dataset for binary classification: current category vs. everything else
    df_sentences_bin = dataset_downsampled.copy()
    df_sentences_bin['y'] = np.where(df_sentences_bin['y'] == cat, cat, 'Other')

    for shuffle_idx in range(N_shuffle_total):
        print(f"\nShuffle {shuffle_idx+1}:")
        # A different seed per shuffle: with a constant seed every "shuffle" reproduced the
        # exact same split, so the confidence intervals did not reflect any data variation.
        train_dataset_bin, test_dataset_bin = train_test_split(
            df_sentences_bin, test_size=0.2, random_state=42 + shuffle_idx)

        for nb_train_ex in training_examples:
            print(f"Train with {nb_train_ex} sentences")
            # Start from the pre-trained weights for every size. Reusing one model across
            # sizes kept fine-tuning it, so larger sizes also benefited from earlier rounds.
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
            # preprocess sentences
            train_dataset_bin_pp, test_dataset_bin_pp, label_encoder_test = preprocess_data_for_LC(train_dataset_bin.head(nb_train_ex), test_dataset_bin)
            # prepare model
            train_loader, test_loader, optimizer, scheduler = prepare_model_for_LC(model, train_dataset_bin_pp, test_dataset_bin_pp, freeze_weights, batch_size, epochs, learning_rate)

            # NOTE(review): assumes train_test returns nine values in this order -- confirm against its definition
            train_loss_fold, val_loss_fold, train_acc_fold, val_acc_fold, _, test_confidence_scores, _, train_balanced_acc_fold, val_balanced_acc_fold = train_test(model, train_loader, test_loader, epochs, optimizer, scheduler, device)

            train_loss_list.append(train_loss_fold[-1])
            train_acc_list.append(train_acc_fold[-1])
            train_balanced_acc_list.append(train_balanced_acc_fold[-1])
            val_loss_list.append(val_loss_fold[-1])
            val_acc_list.append(val_acc_fold[-1])
            val_balanced_acc_list.append(val_balanced_acc_fold[-1])
            test_confidence_score_list.append(test_confidence_scores[-1])

    # Mean and 95% interval over the shuffles, stored per category for the plots
    dfs_train[f'{cat}'] = _summarise_runs(train_loss_list, 'Mean_Train_Loss')
    dfs_val[f'{cat}'] = _summarise_runs(val_loss_list, 'Mean_Val_Loss')
    dfs_train_acc[f'{cat}'] = _summarise_runs(train_acc_list, 'Mean_Train_Acc')
    dfs_val_acc[f'{cat}'] = _summarise_runs(val_acc_list, 'Mean_Val_Acc')
    dfs_test_confidence[f'{cat}'] = _summarise_runs(test_confidence_score_list, 'Mean_Test_Conf')
    dfs_train_balanced_acc[f'{cat}'] = _summarise_runs(train_balanced_acc_list, 'Mean_Train_Balanced_Acc')
    dfs_val_balanced_acc[f'{cat}'] = _summarise_runs(val_balanced_acc_list, 'Mean_Val_Balanced_Acc')

### Visualize results

In [None]:
def plot_learning_curves(dfs_train_metric, dfs_val_metric, train_column, val_column, y_label, max_cols=4):
    """Plot train vs. validation learning curves, one subplot per reflective category.

    Parameters
    ----------
    dfs_train_metric, dfs_val_metric : dict
        Category name -> summary DataFrame with 'N_training_examples',
        'Lower_CI', 'Upper_CI' and the mean column named below.
    train_column, val_column : str
        Name of the mean column in the train / validation frames.
    y_label : str
        Metric name, used as y-axis label and in the figure title.
    max_cols : int, optional
        Maximum number of subplots per row.
    """
    n_plots = len(reflective_categories)
    # Size the grid from the number of categories instead of a fixed 2x4 layout, so that
    # no empty axes are left over and more than eight categories still fit.
    n_cols = max(1, min(max_cols, n_plots))
    n_rows = max(1, int(np.ceil(n_plots / n_cols)))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), sharey=True, squeeze=False)
    all_axes = axs.ravel()

    for ax, cat in zip(all_axes, reflective_categories):
        df_train = dfs_train_metric[cat]
        df_val = dfs_val_metric[cat]

        # Mean lineplot
        sns.pointplot(data=df_train, x='N_training_examples', y=train_column, color='blue', ax=ax)
        sns.pointplot(data=df_val, x='N_training_examples', y=val_column, color='orange', ax=ax)

        # Fill between the confidence interval
        ax.fill_between(df_train['N_training_examples'], df_train['Lower_CI'], df_train['Upper_CI'], color='blue', alpha=0.3, label='train')
        ax.fill_between(df_val['N_training_examples'], df_val['Lower_CI'], df_val['Upper_CI'], color='orange', alpha=0.3, label='val')

        # The x values are positions 0..n-1; label them with the actual training-set sizes
        ax.set_xticks(np.arange(len(training_examples)), training_examples)
        ax.set_xlabel('Nb Training examples')
        ax.set_ylabel(y_label)
        ax.set_title(f"{cat}")
        ax.legend()

    # Remove every subplot that did not receive a category
    for unused_ax in all_axes[n_plots:]:
        fig.delaxes(unused_ax)

    plt.suptitle(f"{y_label} with 95% CI per class with bs : {batch_size}, lr : {learning_rate}, {epochs} epochs")
    plt.tight_layout()
    plt.show()


plot_learning_curves(dfs_train, dfs_val, 'Mean_Train_Loss', 'Mean_Val_Loss', "Loss")
plot_learning_curves(dfs_train_balanced_acc, dfs_val_balanced_acc, 'Mean_Train_Balanced_Acc', 'Mean_Val_Balanced_Acc', "Balanced Accuracy")

## Experiment 5 : Learning curves with the most confident predictions

In [None]:
# Rebuild the balanced dataset for experiment 5, this time with at most 600 sentences per class.
# This overwrites the dataset_downsampled created for the previous experiment.
print(f"Length dataset: {len(merged_df)}")
dataset_downsampled = downsample_dataset(merged_df, 600)

In [None]:
%%time

# Set hyperparameters
batch_size = 8
epochs = 2  ## One epoch = self-learning
learning_rate = 2e-5
freeze_weights = False

# Learning-curve grid: one training round per entry, repetitions, one-vs-rest target classes
training_examples = [200, 400]
N_shuffle_total = 3
reflective_categories = ['Experience']
print(reflective_categories)
print(f"Number of sentences in balanced dataset: {len(dataset_downsampled)}")
print(f"Length of train set: {int(len(dataset_downsampled) * 0.8)}")
print(f"Length of test set: {int(len(dataset_downsampled) * 0.2)}")

# One summary DataFrame per category, one dictionary per metric
dfs_train = {}
dfs_val = {}
dfs_train_acc = {}
dfs_val_acc = {}
dfs_test_confidence = {}


def _summarise_runs(run_values, mean_column):
    """Summarise one metric over the shuffles, for every training round.

    `run_values` is the flat list filled by the loops below (shuffle-major, one
    value per round). The result has one row per round with the mean over
    shuffles (`mean_column`) and the 2.5th / 97.5th percentiles ('Lower_CI',
    'Upper_CI'). 'N_training_examples' holds the position of the round in
    `training_examples`, not a number of sentences.
    """
    runs = np.array(run_values).reshape(N_shuffle_total, len(training_examples))
    lower_ci, upper_ci = np.percentile(runs, [2.5, 97.5], axis=0)
    return pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        mean_column: np.mean(runs, axis=0),
        'Lower_CI': lower_ci,
        'Upper_CI': upper_ci})


for cat in reflective_categories:
    print(f"\n\nStarting Learning curves for category : {cat}")
    # Last-epoch value of every metric, one entry per (shuffle, round)
    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []
    avg_test_confidence_score_per_epoch_list = []

    # Prepare dataset for binary classification: current category vs. everything else
    df_sentences_bin = dataset_downsampled.copy()
    df_sentences_bin['y'] = np.where(df_sentences_bin['y'] == cat, cat, 'Other')

    for shuffle_idx in range(N_shuffle_total):
        print(f"\nShuffle {shuffle_idx+1}:")
        # A different seed per shuffle: with a constant seed every "shuffle" reproduced the
        # exact same split, so the confidence intervals did not reflect any data variation.
        train_dataset_bin, test_dataset_bin = train_test_split(
            df_sentences_bin, test_size=0.2, random_state=42 + shuffle_idx)

        # One model per shuffle: it keeps being fine-tuned from one round to the next
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

        # Number of fresh sentences brought in at every round
        step = training_examples[0]
        combined_data = train_dataset_bin.head(step)

        for round_idx in range(len(training_examples)):
            print(f"Train with {len(combined_data)} sentences")
            # preprocess sentences
            train_dataset_bin_pp, test_dataset_bin_pp, label_encoder_test = preprocess_data_for_LC(combined_data, test_dataset_bin)
            # prepare model
            train_loader, test_loader, optimizer, scheduler = prepare_model_for_LC(model, train_dataset_bin_pp, test_dataset_bin_pp, freeze_weights, batch_size, epochs, learning_rate)

            # NOTE(review): assumes train_test returns seven values in this order -- confirm against its definition
            train_loss_fold, val_loss_fold, train_acc_fold, val_acc_fold, train_confidence_scores_avg_per_epoch, test_confidence_scores_avg_per_epoch, all_train_confidence = train_test(model, train_loader, test_loader, epochs, optimizer, scheduler, device)

            train_loss_list.append(train_loss_fold[-1])
            train_acc_list.append(train_acc_fold[-1])
            val_loss_list.append(val_loss_fold[-1])
            val_acc_list.append(val_acc_fold[-1])
            avg_test_confidence_score_per_epoch_list.append(test_confidence_scores_avg_per_epoch[-1])

            # Positions of the 20% most confident training predictions, best first
            num_values = len(all_train_confidence)
            num_top_20_percent = int(num_values * 0.2)
            ranked_positions = sorted(range(num_values), key=lambda pos: all_train_confidence[pos], reverse=True)
            # Guard in case train_test reports more scores than there are training rows
            indices_top_20_percent = [pos for pos in ranked_positions[:num_top_20_percent] if pos < len(combined_data)]

            # Take the confident rows from the data that was just trained on. They used to be
            # read from dataset_downsampled, whose rows are neither the training rows nor
            # binarised, which brought extra label values into a two-class model.
            # NOTE(review): the training loader shuffles, so score positions presumably do not
            # line up with row positions of combined_data -- confirm how train_test orders
            # all_train_confidence before relying on this selection.
            df_most_confident = combined_data.iloc[indices_top_20_percent]

            # Next unseen slice of the training split (round 0 already used the first `step` rows)
            new_data = train_dataset_bin.iloc[(round_idx + 1) * step: (round_idx + 2) * step]
            combined_data = pd.concat([df_most_confident, new_data])
            print(f"Length of combined dataset: {len(combined_data)}")

    # Mean and 95% interval over the shuffles, stored per category
    dfs_train[f'{cat}'] = _summarise_runs(train_loss_list, 'Mean_Train_Loss')
    dfs_val[f'{cat}'] = _summarise_runs(val_loss_list, 'Mean_Val_Loss')
    dfs_train_acc[f'{cat}'] = _summarise_runs(train_acc_list, 'Mean_Train_Acc')
    dfs_val_acc[f'{cat}'] = _summarise_runs(val_acc_list, 'Mean_Val_Acc')
    dfs_test_confidence[f'{cat}'] = _summarise_runs(avg_test_confidence_score_per_epoch_list, 'Mean_Test_Conf')

In [None]:
values = [0.89725614, 0.8265314, 0.8608944, 0.89096713, 0.90152985, 0.8810586, 0.8641204, 0.779013, 0.9036705, 0.8782628, 0.8470351, 0.862957, 0.87726593, 0.85752606, 0.8642521, 0.89463615, 0.89705205, 0.88434637, 0.8844721, 0.8724542, 0.8710404, 0.86373454, 0.87535864, 0.8877713, 0.8403641, 0.8795528, 0.8588633, 0.8833332, 0.85883814, 0.7513898, 0.87752813, 0.89346623, 0.86391985, 0.8336804, 0.83276635, 0.8329713, 0.8751115, 0.90970105, 0.8292461, 0.86291826, 0.8639757, 0.873903, 0.87614065, 0.8809002, 0.83957916, 0.8515999, 0.86006474, 0.83797777, 0.85472125, 0.85212344, 0.8218158, 0.80510116, 0.7178283, 0.87010324, 0.836947, 0.83996314, 0.8139497, 0.81396234, 0.849028, 0.8285842, 0.80438775, 0.8583806, 0.8627805, 0.81667066, 0.8049326, 0.84818, 0.82001126, 0.81965625, 0.87334657, 0.8313316, 0.7719984, 0.8333738, 0.7940738, 0.83995056, 0.8343295, 0.7970121, 0.7900876, 0.83439934, 0.8432067, 0.82171685, 0.8278204, 0.8148168, 0.8216123, 0.84128857, 0.74447674, 0.84744465, 0.84314924, 0.83186847, 0.70089346, 0.8448653, 0.8065221, 0.8073163, 0.69540477, 0.8533329, 0.8652067, 0.85866153, 0.8319706, 0.8198268, 0.82017654, 0.8545485, 0.8350983, 0.8119053, 0.80813396, 0.82151556, 0.80893975, 0.8763131, 0.83548516, 0.85596704, 0.8543422, 0.80797094, 0.8430984, 0.8113218, 0.84107375, 0.80528545, 0.8430511, 0.8267623, 0.83612174, 0.8538626, 0.84286916, 0.772975, 0.8017599, 0.8248337, 0.8434411, 0.8088821, 0.80819887, 0.8080551, 0.83073604, 0.8302567, 0.7885074, 0.8553188, 0.8393506, 0.8509006, 0.8480079, 0.7951159, 0.8022355, 0.82036436, 0.8583823, 0.8570997, 0.81445855, 0.8481593, 0.8223166, 0.84600216, 0.83105963, 0.8030795, 0.83624935, 0.78268456, 0.83683234, 0.8238378, 0.82838446, 0.84227175, 0.8452065, 0.80628335, 0.8684133, 0.80064934, 0.80887806, 0.81648415, 0.8277713, 0.80383104, 0.8106005, 0.8258066, 0.85887057, 0.8450083, 0.8295496, 0.81164175, 0.8439562, 0.77985466, 0.8150946, 0.82951474, 0.82633066, 0.85161924, 0.86996955, 0.8402585, 0.8397682, 0.8583724, 
0.8118849, 0.8363616, 0.8685401, 0.81430596, 0.86222714, 0.84854424, 0.8601118, 0.81300575, 0.8078579, 0.8178351, 0.8080628, 0.86383814, 0.85081226, 0.86024946, 0.8267963, 0.86264116, 0.84713376, 0.8009532, 0.83537334, 0.77203023, 0.8106392, 0.80602807, 0.8346353, 0.8292258, 0.7603937, 0.82759184]
[0.738134, 0.8352792]
# Number of confidence scores pasted in above
num_values = len(values)
print(num_values)

# Positions of the 20% highest scores, best first
num_top_20_percent = int(num_values * 0.2)
ranked_positions = sorted(range(num_values), key=values.__getitem__, reverse=True)
indices_top_20_percent = ranked_positions[:num_top_20_percent]

print(indices_top_20_percent)

# Look up the rows at those positions in the balanced dataset and inspect their labels
df_most_confident = dataset_downsampled.iloc[indices_top_20_percent]
sns.histplot(df_most_confident['y'])
print(len(df_most_confident))
df_most_confident.head()

In [None]:
# Rows at positions 500-999 of the balanced dataset, with their label distribution
other_data = dataset_downsampled.iloc[500:1000]
print(len(other_data))
sns.histplot(other_data['y'])
other_data.head()

In [None]:
# Stack the most confident rows on top of the extra slice and inspect the resulting label distribution
combined_data = pd.concat([df_most_confident, other_data])
print(len(combined_data))
sns.histplot(combined_data['y'])
combined_data.head()

### By splitting the train_dataloader

In [None]:
%%time

# Set hyperparameters
batch_size = 8
epochs = 2
learning_rate = 2e-5
freeze_weights = False

# Create a dictionary to store DataFrames
dfs_train = {}
dfs_val = {}
dfs_train_acc = {}
dfs_val_acc = {}

#training_examples = [50, 100, 150, 200, 250, 300]
training_examples = [100, 300, 500]
N_shuffle_total = 2
reflective_categories = ['Experience', 'Feeling'] #top3_classes #['Experience', 'Feeling']
print(reflective_categories)
print(f"Number of sentences in balanced dataset: {len(train_dataset_downsampled)}")

for cat in reflective_categories:
    print(f"\nStarting Learning curves for category : {cat}")
    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []
    
    # Prepare dataset for binary classification
    df_sentences_bin = train_dataset_downsampled.iloc[:600].copy()
    df_sentences_bin['y'] = np.where(df_sentences_bin['y'] == cat, cat, 'Other')
    
    for N_shuffle in range(N_shuffle_total):
        # Split between train and test
        train_dataset_bin, test_dataset_bin = train_test_split(df_sentences_bin, test_size=0.2, random_state=42)
        # preprocess sentences for different
        train_dataset_bin_pp, test_dataset_bin_pp, label_encoder_test = preprocess_data_for_LC(train_dataset_bin, test_dataset_bin)
        # Initialize the pre-trained BERT model for bin clf
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
        # prepare model
        train_loader, test_loader, optimizer, scheduler = prepare_model_for_LC(model, train_dataset_bin_pp, test_dataset_bin_pp, freeze_weights, batch_size, epochs, learning_rate)
        
        for nb_train_ex in training_examples:
            subset_data = list(islice(train_loader, size))  # Convert islice to a list
            # Create a new DataLoader from the subset data
            subset_train_loader = DataLoader(subset_data, batch_size=train_loader.batch_size)
            print(f"Number of samples in training: {len(subset_train_loader) * subset_train_loader.batch_size}")
            train_loss_fold, val_loss_fold, train_acc_fold, val_acc_fold, _, _ = train_test(model, subset_train_loader, test_loader, epochs, optimizer, scheduler, device)
        
            train_loss_list.extend(train_loss_fold[-1])
            train_acc_list.extend(train_acc_fold[-1])
            val_loss_list.extend(val_loss_fold[-1])
            val_acc_list.extend(val_acc_fold[-1]) 
            
    # Train loss
    train_loss_array = np.array(train_loss_list).reshape(N_shuffle_total, len(training_examples))
    mean_train_loss = np.mean(train_loss_array, axis=0)
    ci_train_loss = np.percentile(train_loss_array, [2.5, 97.5], axis=0)
    
    # Val loss
    val_loss_array = np.array(val_loss_list).reshape(N_shuffle,len(training_examples))
    mean_val_loss = np.mean(val_loss_array, axis=0)
    ci_val_loss = np.percentile(val_loss_array, [2.5, 97.5], axis=0)
    
    # Train acc
    train_acc_array = np.array(train_acc_list).reshape(N_shuffle,len(training_examples))
    mean_train_acc = np.mean(train_acc_array, axis=0)
    ci_train_acc = np.percentile(train_acc_array, [2.5, 97.5], axis=0)
    
    # Val acc
    val_acc_array = np.array(val_acc_list).reshape(N_shuffle,len(training_examples))
    mean_val_acc = np.mean(val_acc_array, axis=0)
    ci_val_acc = np.percentile(val_acc_array, [2.5, 97.5], axis=0)

    # Create a DataFrame for Seaborn
    df_train_loss = pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        'Mean_Train_Loss': mean_train_loss,
        'Lower_CI': ci_train_loss[0],
        'Upper_CI': ci_train_loss[1]})
    
    df_val_loss = pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        'Mean_Val_Loss': mean_val_loss,
        'Lower_CI': ci_val_loss[0],
        'Upper_CI': ci_val_loss[1]})
    
    df_train_acc = pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        'Mean_Train_Acc': mean_train_acc,
        'Lower_CI': ci_train_acc[0],
        'Upper_CI': ci_train_acc[1]})
    
    df_val_acc = pd.DataFrame({
        'N_training_examples': np.arange(len(training_examples)),
        'Mean_Val_Acc': mean_val_acc,
        'Lower_CI': ci_val_acc[0],
        'Upper_CI': ci_val_acc[1]})
    
    dfs_train[f'{cat}'] = df_train_loss
    dfs_val[f'{cat}'] = df_val_loss
    dfs_train_acc[f'{cat}'] = df_train_acc
    dfs_val_acc[f'{cat}'] = df_val_acc