In [None]:
# Report the NVIDIA GPU (if any) attached to this runtime
!nvidia-smi
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whatever `pip` the subshell resolves to
%pip install datasets

In [None]:
import torch
import pandas as pd
import numpy as np
import itertools
import re
import nltk
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Set WANDB_DISABLED before any Trainer is constructed; this is intended to switch off
# Weights & Biases reporting (behaviour lives in the transformers integration — confirm for the installed version)
os.environ["WANDB_DISABLED"] = "true"

# Fetch the NLTK 'stopwords' corpus, which stopwords.words() below reads
nltk.download('stopwords')
# English stop words held as a set; preprocess_text uses it for membership tests
stop_words = set(stopwords.words('english'))

In [None]:
# Load the Gold Data
bbkpi_df = pd.read_csv('/content/bbkpi_gold.csv')

# Every column from the third onward is sorted into one of two lists by its
# suffix: names ending in '_pred' are prediction columns, the rest are labels.
ground_truth_cols, verity_prediction_cols = [], []
for column_name in bbkpi_df.columns[2:]:
    if column_name.endswith('_pred'):
        verity_prediction_cols.append(column_name)
    else:
        ground_truth_cols.append(column_name)

bbkpi_df.head()

In [None]:
# Load the Silver Data, keeping only the metadata/text columns plus the label
# columns that were discovered in the gold file
silver_columns = ['TIMESTAMP', 'VIDEO_UUID', 'INTERVAL_ID', 'TEXT', 'LANGUAGE_CODE'] + ground_truth_cols
prod_silver_df = pd.read_excel('/content/prod_silver - fullyGPT2.0.xlsx')[silver_columns]

prod_silver_df.head()

In [None]:
# Quick sanity check: shapes first, then column indexes, silver before gold
for frame in (prod_silver_df, bbkpi_df):
    print(frame.shape)

for frame in (prod_silver_df, bbkpi_df):
    print(frame.columns)


In [None]:
def _drop_missing_or_empty(frame, column):
    """Return the rows of `frame` whose `column` is neither NaN nor a zero-length string."""
    present = frame.dropna(subset=[column])          # discard NaN entries first
    return present[present[column].str.len() > 0]    # then discard empty strings


# Remove rows with NaN or empty text values from both data sets
prod_silver_df = _drop_missing_or_empty(prod_silver_df, 'TEXT')
bbkpi_df = _drop_missing_or_empty(bbkpi_df, 'text')

print(prod_silver_df.shape)
print(bbkpi_df.shape)

In [None]:
# Preparing silver data for training
silver_texts = prod_silver_df['TEXT'].tolist()
# One float vector per example, columns ordered as in ground_truth_cols
silver_labels = list(prod_silver_df[ground_truth_cols].values.astype(float))
pre_train_df = pd.DataFrame({'text': silver_texts, 'labels': silver_labels})

# Keep only rows whose text is a str with at least one character
silver_has_text = pre_train_df['text'].apply(lambda s: isinstance(s, str) and len(s) >= 1)
pre_train_df = pre_train_df[silver_has_text]

print(len(pre_train_df))
silver_dataset = Dataset.from_pandas(pre_train_df)
pre_train_df.head()

In [None]:
# Preparing gold data for test
gold_texts = bbkpi_df['text'].tolist()
# One float vector per example, columns ordered as in ground_truth_cols
gold_labels = list(bbkpi_df[ground_truth_cols].values.astype(float))
pre_test_df = pd.DataFrame({'text': gold_texts, 'labels': gold_labels})

# Keep only rows whose text is a str with at least one character
gold_has_text = pre_test_df['text'].apply(lambda s: isinstance(s, str) and len(s) >= 1)
pre_test_df = pre_test_df[gold_has_text]

print(len(pre_test_df))
gold_dataset = Dataset.from_pandas(pre_test_df)
pre_test_df.head()

In [None]:
# Define preprocessing function
def preprocess_text(text):
    """Lower-case `text`, collapse whitespace runs, and drop stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining whitespace-delimited tokens joined by single spaces.
        Tokens are compared against the module-level `stop_words` collection
        exactly as split, so a token with attached punctuation is kept.
    """
    normalized = re.sub(r'\s+', ' ', text.lower()).strip()
    return ' '.join(token for token in normalized.split() if token not in stop_words)

In [None]:
# Models and their tokenizers.
# Each entry pairs a checkpoint name ('model') with the tokenizer name ('tokenizer')
# that the training loop loads for it. Uncomment an entry to include it in the sweep.
models = [
    {'model': 'bert-base-uncased', 'tokenizer': 'bert-base-uncased'},
    #{'model': 'roberta-base', 'tokenizer': 'roberta-base'},
    #{'model': 'distilbert-base-uncased', 'tokenizer': 'distilbert-base-uncased'},


    #{'model': 'microsoft/deberta-base', 'tokenizer': 'microsoft/deberta-base'},   # author's note: should use a 1e-5 learning rate
    #{'model': 'albert-base-v2', 'tokenizer': 'albert-base-v2'},
    #{'model': 'xlnet-base-cased', 'tokenizer': 'xlnet-base-cased'}
]

# Accumulates one dict per (model, hyperparameter combination) run.
# Re-running the training cell appends to it; re-run this cell to reset.
results = []

# Hyperparameter values to test; the training loop runs the full Cartesian product
# of these lists. Trailing comments list other values that were under consideration.
batch_sizes = [16] # candidates: 8, 16, 32
epochs = [3] # candidates: 3, 5
learning_rates = [5e-5]  # candidates: 1e-5, 5e-5
weight_decays = [0.01] # candidates: 0, 0.1, 0.01

In [None]:
# Outer loop: one pass per entry in `models`, each naming a checkpoint and its tokenizer
for model_info in models:
    model_name = model_info['model']
    tokenizer_name = model_info['tokenizer']
    print(f"Training with {model_name} and {tokenizer_name}")

    # Loaded once per model; the tokenization helper defined just below closes over it
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Tokenization function with preprocessing
    def tokenize_function_with_preprocessing(examples):
        """Clean every text in a batch with `preprocess_text`, then tokenize the batch.

        Args:
            examples: A batch mapping whose "text" entry is an iterable of strings.

        Returns:
            Whatever the enclosing `tokenizer` returns for the cleaned texts,
            called with padding="max_length" and truncation=True.
        """
        cleaned_batch = list(map(preprocess_text, examples["text"]))
        return tokenizer(cleaned_batch, padding="max_length", truncation=True)


    # def tokenize_function(examples):
    #   return tokenizer(examples["text"], padding="max_length", truncation=True)


    # Apply preprocessing and tokenization to both datasets, batch-wise.
    # ADJUST: change `silver_dataset` / `gold_dataset` here to swap the training / test source.
    tokenized_train_dataset = silver_dataset.map(
        tokenize_function_with_preprocessing,
        batched=True,
    )
    test_dataset = gold_dataset.map(
        tokenize_function_with_preprocessing,
        batched=True,
    )

    # Alternative (disabled): carve a validation split out of the training data instead
    # all_datasets = tokenized_train_dataset.train_test_split(test_size=0.2)
    # train_dataset = all_datasets["train"]
    # val_dataset = all_datasets["test"]

    # Train on Silver, test on Gold
    train_dataset, val_dataset = tokenized_train_dataset, test_dataset

    # Lookup tables between label index and label column name, in both directions
    id2label = dict(enumerate(ground_truth_cols))
    label2id = {name: index for index, name in id2label.items()}

    # Loop over hyperparameter combinations (full Cartesian product of the configured lists)
    for batch_size, num_epochs, learning_rate, weight_decay in itertools.product(batch_sizes, epochs, learning_rates, weight_decays):
        print(f"\nHyperparameters: batch_size={batch_size}, epochs={num_epochs}, learning_rate={learning_rate}, weight_decay={weight_decay}")

        # Start every combination from freshly loaded pretrained weights so runs
        # do not contaminate each other.
        # The model is deliberately NOT moved with .to('cuda'): Trainer places it on
        # the device selected by TrainingArguments, so the cell also runs on a
        # runtime without CUDA instead of raising.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            problem_type="multi_label_classification",
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
        )

        # NOTE(review): every combination writes to the same output_dir, so
        # checkpoints from different runs presumably land in the same folders —
        # confirm before growing the grid.
        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch",          # evaluate on eval_dataset after every epoch
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            logging_dir='./logs',
        )

        def compute_multilabel_metrics(predictions, labels, threshold=0.5):
            """Score raw multi-label logits against reference labels.

            Args:
                predictions: Raw model outputs (logits); per the original note,
                    shaped (batch_size, num_labels).
                labels: Reference label matrix, passed unchanged to the sklearn scorers.
                threshold: Probability at or above which a label counts as predicted.

            Returns:
                dict with the keys 'f1_micro', 'f1_macro' and 'accuracy'.
            """
            # Sigmoid maps each logit to an independent per-label probability
            probabilities = torch.sigmoid(torch.Tensor(predictions))
            # Binarise to a float 0/1 matrix of the same shape
            y_pred = (probabilities >= threshold).numpy().astype(float)

            return {
                'f1_micro': f1_score(y_true=labels, y_pred=y_pred, average='micro'),
                'f1_macro': f1_score(y_true=labels, y_pred=y_pred, average='macro'),
                'accuracy': accuracy_score(labels, y_pred),
            }

        def compute_metrics(p):
            """Adapter handed to the Trainer: unpack `p` and delegate to compute_multilabel_metrics.

            `p` must expose `.predictions` (an array, or a tuple whose first item
            is the array to score) and `.label_ids`.
            """
            logits = p.predictions
            if isinstance(logits, tuple):
                logits = logits[0]
            return compute_multilabel_metrics(predictions=logits, labels=p.label_ids)


        class MultilabelTrainer(Trainer):
            """Trainer whose loss is binary cross-entropy over the logits (one independent sigmoid per label)."""

            def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
                """Compute the multi-label BCE loss for one batch.

                Args:
                    model: The model being trained or evaluated.
                    inputs: Batch mapping; must contain "labels" alongside the model inputs.
                    return_outputs: When True, return (loss, outputs) instead of just the loss.
                    num_items_in_batch: Accepted because newer transformers releases pass
                        this keyword to compute_loss; without it the override raises
                        TypeError on those versions. Unused here — the loss is the plain
                        mean produced by BCEWithLogitsLoss.

                Returns:
                    The scalar loss, or (loss, outputs) when `return_outputs` is True.
                """
                labels = inputs.get("labels")
                outputs = model(**inputs)
                logits = outputs.get("logits")
                loss_fct = torch.nn.BCEWithLogitsLoss()
                # BCEWithLogitsLoss needs float targets
                loss = loss_fct(logits, labels.float())
                return (loss, outputs) if return_outputs else loss

        print(f"Training with {model_name}...")

        trainer = MultilabelTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        trainer.train()

        # One inference pass over the evaluation split provides BOTH the metrics and
        # the raw logits; previously evaluate() and predict() each ran the full
        # dataset through the model. metric_key_prefix="eval" keeps the metric keys
        # ('eval_loss', 'eval_accuracy', ...) that are read further down.
        prediction_output = trainer.predict(val_dataset, metric_key_prefix="eval")
        metrics = prediction_output.metrics

        # CONFUSION MATRIX (ADJUST the 0.5 threshold here if needed)
        # Mirror compute_metrics: some models return a tuple whose first item is the logits
        predictions = prediction_output.predictions
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        probs = torch.sigmoid(torch.Tensor(predictions))
        y_pred = (probs >= 0.5).numpy().astype(float)
        # label_ids come from the same pass as the predictions, so rows are aligned by construction
        y_true = prediction_output.label_ids

        # One 2x2 matrix per label
        confusion_matrices = multilabel_confusion_matrix(y_true, y_pred)

        # Visualize confusion matrix for each label
        for label_name, cm in zip(ground_truth_cols, confusion_matrices):
            fig, ax = plt.subplots(figsize=(10, 7))
            tick_labels = ['Not ' + label_name, label_name]
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                        xticklabels=tick_labels,
                        yticklabels=tick_labels,
                        ax=ax)
            ax.set_title(f'Confusion Matrix for {label_name}')
            ax.set_ylabel('True label')
            ax.set_xlabel('Predicted label')
            plt.show()
            plt.close(fig)  # release the figure; one is created per label per run

        # Save the results
        results.append({
                'model': model_name,
                'batch_size': batch_size,
                'epochs': num_epochs,
                'learning_rate': learning_rate,
                'weight_decay': weight_decay,
                'eval_accuracy': metrics['eval_accuracy'],
                'eval_f1_macro': metrics['eval_f1_macro'],
                'eval_f1_micro': metrics['eval_f1_micro'],
                'eval_loss': metrics['eval_loss'],
            })


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing visible in this notebook reads or writes under /content/drive;
# presumably this is meant for persisting results — confirm, or drop the cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print and compare results: one summary line per recorded run
print("\nAll Results:")
for result in results:
    summary_line = (
        f"Model: {result['model']}, Batch Size: {result['batch_size']}, Epochs: {result['epochs']}, "
        f"Learning Rate: {result['learning_rate']}, Weight Decay: {result['weight_decay']}, "
        f"F1 Macro: {result['eval_f1_macro']}, Accuracy: {result['eval_accuracy']}"
    )
    print(summary_line)

In [None]:
# Optionally, save the results to a CSV file
# df_results = pd.DataFrame(results)
# df_results.to_csv("hyperparameter_results_learningrate_model.csv", index=False)

In [None]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not currently using
torch.cuda.empty_cache()