# Noisy Student NLP Experiment


## 1. Setup

In [None]:
# IPython shell escape: print the GPU(s) visible to this runtime.
!nvidia-smi

### 1.1 - Dependencies

In [None]:
# IPython shell escape: install the extra packages (quiet output).
!pip install gputil transformers emoji --quiet

In [None]:
# Mount Google Drive at /content/drive (requires a Google Colab runtime).
from google.colab import drive, files
drive.mount('/content/drive')

In [None]:
import os
import random
import time
import logging
import json
import pickle
from datetime import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import RandomSampler, SequentialSampler, Dataset, DataLoader

from scipy.special import softmax
from GPUtil import showUtilization as gpu_usage
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, get_scheduler

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Module-level logger; with propagate=False its records are not passed on to
# the handlers of ancestor loggers.
logger = logging.getLogger(__name__)
logger.propagate = False

### 1.2 - Utility Functions

#### Data Handling

In [None]:
def load_mhs():
    """Load the Measuring Hate Speech corpus plus the unlabeled tweet pool.

    Rows with ``hate_speech_score >= 1`` get label 1, rows with a score
    below 1 get label 0. The labeled data is split 70/15/15 into
    train/dev/test with stratification on the label, seeded by ``CFG.seed``.

    Returns:
        tuple: ``(train_df, dev_df, test_df, unlabeled_df)``; the first three
        hold the columns ``text`` and ``label``.
    """
    english_dir = os.path.join(CFG.data_path, "english")
    labeled_csv = os.path.join(
        english_dir, "measuring_hate_speech", "measuring_hate_speech.csv"
    )
    unlabeled_csv = os.path.join(english_dir, "unlabeled", "tweets_augmented.csv")

    data_df = pd.read_csv(labeled_csv)

    # Binarize the continuous score at the threshold 1.
    is_hateful = data_df["hate_speech_score"] >= 1
    is_benign = data_df["hate_speech_score"] < 1
    data_df.loc[is_hateful, "label"] = 1
    data_df.loc[is_benign, "label"] = 0

    data_df = data_df[["text", "label"]]
    data_df["label"] = data_df["label"].astype(int)

    # 70% train, then the remaining 30% is halved into dev and test.
    train_df, holdout_df = train_test_split(
        data_df,
        train_size=0.7,
        stratify=data_df["label"],
        random_state=CFG.seed,
    )
    dev_df, test_df = train_test_split(
        holdout_df,
        train_size=0.5,
        stratify=holdout_df["label"],
        random_state=CFG.seed,
    )

    train_df, dev_df, test_df = (
        part.reset_index(drop=True) for part in (train_df, dev_df, test_df)
    )

    unlabeled_df = pd.read_csv(unlabeled_csv)

    return train_df, dev_df, test_df, unlabeled_df

In [None]:
def load_convabuse():
    """Load the ConvAbuse train/valid/test CSVs plus the unlabeled tweet pool.

    The four dialogue columns (``prev_agent``, ``prev_user``, ``agent``,
    ``user``) are joined with newlines into a single ``text`` column.

    Returns:
        tuple: ``(train_df, dev_df, test_df, unlabeled_df)``; the first three
        hold the columns ``text`` and ``is_abuse_majority``.
    """
    corpus_dir = os.path.join(CFG.data_path, "english", "ConvAbuse")
    unlabeled_csv = os.path.join(
        CFG.data_path, "english", "unlabeled", "tweets_augmented.csv"
    )

    train_df = pd.read_csv(os.path.join(corpus_dir, "ConvAbuseEMNLPtrain.csv"))
    dev_df = pd.read_csv(os.path.join(corpus_dir, "ConvAbuseEMNLPvalid.csv"))
    test_df = pd.read_csv(os.path.join(corpus_dir, "ConvAbuseEMNLPtest.csv"))
    unlabeled_df = pd.read_csv(unlabeled_csv)

    turn_columns = ("prev_agent", "prev_user", "agent", "user")

    def _join_turns(row):
        # One dialogue turn per line, oldest first.
        return "\n".join(row[column] for column in turn_columns)

    for frame in (train_df, dev_df, test_df):
        frame["text"] = frame.apply(_join_turns, axis=1)

    kept_columns = ["text", "is_abuse_majority"]
    train_df = train_df[kept_columns]
    dev_df = dev_df[kept_columns]
    test_df = test_df[kept_columns]

    return train_df, dev_df, test_df, unlabeled_df

In [None]:
def load_olid():
    """Load OLID v1.0 (subtask A) plus the unlabeled tweet pool.

    ``OFF`` is mapped to 1 and every other value to 0. OLID ships no
    validation split here, so the dev slot of the result is ``None``.

    Returns:
        tuple: ``(train_df, None, test_df, unlabeled_df)``. ``train_df`` and
        ``test_df`` hold the columns ``text`` and ``toxic``; ``unlabeled_df``
        holds ``text`` and ``text_augmented`` with duplicate texts removed.
    """
    eng_path = os.path.join(CFG.data_path, "english")

    train_path = os.path.join(eng_path, "OLIDv1.0", "olid-training-v1.0.tsv")
    test_path = os.path.join(eng_path, "OLIDv1.0", "testset-levela.tsv")
    test_labels_path = os.path.join(eng_path, "OLIDv1.0", "labels-levela.csv")
    unlabeled_path = os.path.join(eng_path, "unlabeled", "tweets_augmented.csv")

    train_df = pd.read_csv(train_path, engine="python", sep='\t')[["tweet", "subtask_a"]]
    train_df["subtask_a"] = (train_df["subtask_a"] == "OFF").astype(int)
    train_df = train_df.rename({"tweet": "text", "subtask_a": "toxic"}, axis=1)

    # Test labels live in a separate header-less CSV; column 1 holds the tag.
    # The assignment below aligns the two files on their row index.
    test_df = pd.read_csv(test_path, engine="python", sep='\t')
    test_labels = pd.read_csv(test_labels_path, header=None)
    test_df["toxic"] = (test_labels[1] == "OFF").astype(int)
    test_df = test_df[["tweet", "toxic"]].rename({"tweet": "text"}, axis=1)

    # The original re-assigned both columns to themselves here (no-ops);
    # those statements were dropped.
    unlabeled_df = pd.read_csv(unlabeled_path)[["text", "text_augmented"]]
    unlabeled_df = unlabeled_df.drop_duplicates("text")

    return train_df, None, test_df, unlabeled_df

In [None]:
def load_davidson():
    """Load the Davidson corpus plus the unlabeled tweet pool.

    ``class == 0`` becomes label 0 and every other class label 1. The data is
    split 70/15/15 into train/dev/test (stratified, seeded by ``CFG.seed``)
    and the training split is then balanced by down-sampling the larger
    label to the size of the smaller one.

    Returns:
        tuple: ``(train_df, dev_df, test_df, unlabeled_df)``; the first three
        hold the columns ``tweet`` and ``label``.
    """
    path = os.path.join(CFG.data_path, "english", "davidson", "davidson.csv")
    unlabeled_path = os.path.join(CFG.data_path, "english", "unlabeled", "tweets_augmented.csv")

    data_df = pd.read_csv(path)
    data_df.loc[data_df["class"] != 0, "label"] = 1
    data_df.loc[data_df["class"] == 0, "label"] = 0
    data_df["label"] = data_df["label"].astype(int)
    data_df = data_df[["tweet", "label"]]

    train_df, dev_df = train_test_split(data_df, train_size=0.7, stratify=data_df["label"], random_state=CFG.seed)
    dev_df, test_df = train_test_split(dev_df, train_size=0.5, stratify=dev_df["label"], random_state=CFG.seed)
    unlabeled_df = pd.read_csv(unlabeled_path)

    train_df = train_df.reset_index(drop=True)
    label_zero = train_df[train_df["label"] == 0]
    label_one = train_df[train_df["label"] == 1]

    # Balance the classes. Sampling is seeded like the splits above so the
    # result does not depend on the state of the global NumPy RNG, and it is
    # capped at the smaller class so it cannot request more rows than exist.
    balanced_size = min(len(label_zero), len(label_one))
    if len(label_zero) > balanced_size:
        label_zero = label_zero.sample(balanced_size, random_state=CFG.seed)
    if len(label_one) > balanced_size:
        label_one = label_one.sample(balanced_size, random_state=CFG.seed)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement and likewise keeps the original row index.
    train_df = pd.concat([label_zero, label_one])
    dev_df = dev_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, dev_df, test_df, unlabeled_df


In [None]:
def get_stratified_split(df, num_split):
    """Return one quarter of ``df``, stratified on its second column.

    The frame is halved and each half is halved again (always seeded with
    ``CFG.seed``), giving four parts; ``num_split`` indexes into them.
    """
    def _halve(frame):
        return train_test_split(
            frame,
            train_size=0.5,
            shuffle=True,
            stratify=frame.iloc[:, 1],
            random_state=CFG.seed,
        )

    first_half, second_half = _halve(df)
    quarters = [*_halve(first_half), *_halve(second_half)]

    return quarters[num_split]


In [None]:
def load_dataset(few_shot=False, num_split=None):
    """Load the dataset selected by ``CFG.dataset_name`` and log its sizes.

    Args:
        few_shot: When true, the training split is replaced by one stratified
            quarter of itself (see ``get_stratified_split``).
        num_split: Index of the quarter to keep when ``few_shot`` is true.

    Returns:
        tuple: ``(train_df, dev_df, test_df, unlabeled_df)``; ``dev_df`` may
        be ``None`` when the dataset has no validation split.

    Raises:
        ValueError: If ``CFG.dataset_name`` is not a supported dataset
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if CFG.dataset_name == "olidv1":
        train_df, dev_df, test_df, unlabeled_df = load_olid()
    elif CFG.dataset_name == "convabuse":
        train_df, dev_df, test_df, unlabeled_df = load_convabuse()
    elif CFG.dataset_name == "davidson":
        train_df, dev_df, test_df, unlabeled_df = load_davidson()
    elif CFG.dataset_name == "measuring_hate_speech":
        train_df, dev_df, test_df, unlabeled_df = load_mhs()
    else:
        raise ValueError(f"Unknown dataset name: {CFG.dataset_name!r}")

    if few_shot:
        train_df = get_stratified_split(train_df, num_split)

    # The label is always read from the second column, whatever its name.
    loaded_log = \
        f"""\tLoaded {CFG.dataset_name}"""

    if few_shot:
        loaded_log += f" - Split {num_split}"

    loaded_log += \
        f"""\n
        Train Size: {len(train_df)}
            Positives: {len(train_df[train_df.iloc[:, 1] == 1])}
            Negatives: {len(train_df[train_df.iloc[:, 1] == 0])}
        """

    if dev_df is not None:
        loaded_log += \
        f"""
        Dev Size: {len(dev_df)}
            Positives: {len(dev_df[dev_df.iloc[:, 1] == 1])}
            Negatives: {len(dev_df[dev_df.iloc[:, 1] == 0])}
        """
    loaded_log += \
        f"""
        Test Size: {len(test_df)}
            Positives: {len(test_df[test_df.iloc[:, 1] == 1])}
            Negatives: {len(test_df[test_df.iloc[:, 1] == 0])}
        Augmented Data: {len(unlabeled_df)}
        """
    log(loaded_log)

    return train_df, dev_df, test_df, unlabeled_df

In [None]:
class BertDataset(Dataset):
    """Map-style dataset over tokenizer encodings and their labels.

    Args:
        encodings: Mapping from field name to an indexable of per-example
            values (tensors or nested lists).
        labels: Indexable of per-example labels; also defines the length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor(existing_tensor) emits a copy-construct UserWarning on
        # every item; clone().detach() is the recommended equivalent.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
class AugmentedDataset(Dataset):
    """Map-style dataset of raw texts paired with their augmented versions.

    Args:
        df: Frame with the columns ``text`` and ``text_augmented``.
        labels: Optional indexable of per-example labels (list, NumPy array
            or tensor). When omitted or empty, items carry no ``labels`` key.
    """

    def __init__(self, df, labels=None):
        self.text = df["text"].to_list()
        self.text_augmented = df["text_augmented"].to_list()
        self.labels = labels

    def __getitem__(self, idx):
        item = {"text": self.text[idx]}
        # Test for presence explicitly: the truth value of a multi-element
        # array or tensor is ambiguous and raised in the original check.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = self.labels[idx]
        item["text_augmented"] = self.text_augmented[idx]
        return item

    def __len__(self):
        return len(self.text)

#### Model

##### Bert Helpers

In [None]:
def initialize_model(train_dataloader, attention_dropout=None, classifier_dropout=None):
    """Build a fresh classifier with its optimizer and LR scheduler.

    Args:
        train_dataloader: Only its length is used, to size the schedule.
        attention_dropout: Attention dropout probability; ``None`` keeps the
            checkpoint's value.
        classifier_dropout: Classification-head dropout probability; ``None``
            keeps the checkpoint's value.

    Returns:
        tuple: ``(model, optimizer, scheduler)`` with the model on
        ``CFG.device``.
    """
    # Dropout modules are created while the model is instantiated, so the
    # probabilities must be set on the config BEFORE loading. Assigning to
    # ``model.config`` afterwards (as the original did) changes nothing.
    config = AutoConfig.from_pretrained(CFG.pretrained_bert_name, num_labels=CFG.num_labels)

    if "distilbert" in CFG.pretrained_bert_name:
        attention_key, classifier_key = "attention_dropout", "seq_classif_dropout"
    else:
        attention_key, classifier_key = "attention_probs_dropout_prob", "classifier_dropout"

    if attention_dropout is not None:
        setattr(config, attention_key, attention_dropout)
    if classifier_dropout is not None:
        setattr(config, classifier_key, classifier_dropout)

    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.pretrained_bert_name,
        config=config,
    )
    model.to(CFG.device)

    if CFG.weight_decay:
        # Biases and LayerNorm weights are excluded from weight decay. The
        # original built these groups but never handed them to the optimizer.
        no_decay = ('bias', 'LayerNorm.weight')
        decayed, not_decayed = [], []
        for name, param in model.named_parameters():
            if any(marker in name for marker in no_decay):
                not_decayed.append(param)
            else:
                decayed.append(param)
        optimizer = torch.optim.AdamW(
            [
                {'params': decayed, 'weight_decay': CFG.weight_decay},
                {'params': not_decayed, 'weight_decay': 0.0},
            ],
            lr=CFG.learning_rate,
        )
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate)

    total_steps = len(train_dataloader) * CFG.num_train_epochs
    num_warmup_steps = int(total_steps * CFG.warmup_ratio)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps
    )

    return model, optimizer, scheduler

In [None]:
def train(
    model,
    train_dataloader,
    optimizer,
    scheduler,
    val_dataloader=None,
    evaluate_during_training=False,
    is_student=False,
    unlabeled_dataloader=None,
    unl_to_label_batch_ratio=None,
):
    """Train ``model`` for ``CFG.num_train_epochs`` epochs.

    In student mode every labeled batch is paired with
    ``unl_to_label_batch_ratio`` pseudo-labeled batches and the optimized
    loss is ``labeled_loss + mean(unlabeled batch losses)``.

    Args:
        model: Sequence-classification model returning ``.loss``/``.logits``.
        train_dataloader: Batches of tokenized inputs including ``labels``.
        optimizer: Optimizer stepped once per labeled batch.
        scheduler: LR scheduler stepped once per labeled batch.
        val_dataloader: Validation batches; required when
            ``evaluate_during_training`` is true.
        evaluate_during_training: Run ``evaluate`` after every epoch.
        is_student: Also train on the pseudo-labeled data.
        unlabeled_dataloader: Batches with ``text``/``text_augmented``
            strings and ``labels``; required in student mode.
        unl_to_label_batch_ratio: Pseudo-labeled batches per labeled batch;
            required in student mode.

    Returns:
        dict: Per-epoch lists of running-average losses and step indices
        under ``loss``, ``labeled_loss``, ``unlabeled_loss``, ``steps`` and
        ``unl_steps``.

    Raises:
        ValueError: If a dataloader or ratio required by the selected mode
            is missing.
    """
    if is_student and (unlabeled_dataloader is None or not unl_to_label_batch_ratio):
        raise ValueError(
            "Student training needs unlabeled_dataloader and a positive unl_to_label_batch_ratio"
        )
    if evaluate_during_training and val_dataloader is None:
        raise ValueError("evaluate_during_training=True needs val_dataloader")

    progress_bar = tqdm(range(CFG.num_train_epochs * len(train_dataloader)))
    # At least 1, so that `step % print_each_n_steps` cannot divide by zero
    # when there are fewer than four batches.
    print_each_n_steps = max(1, len(train_dataloader) // 4)
    log("Start training...\n")

    loss_fn = nn.CrossEntropyLoss()
    text_col = "text_augmented" if CFG.augmented_data else "text"
    # One persistent iterator, so consecutive draws walk through the
    # pseudo-labeled set instead of re-creating the loader for every batch.
    unl_iterator = iter(unlabeled_dataloader) if is_student else None

    historic_loss = {"loss": [], "labeled_loss": [], "unlabeled_loss": [], "steps": [], "unl_steps": []}
    for epoch_i in range(CFG.num_train_epochs):
        if is_student:
            log(
                f"{'Epoch':^7} | {'Labeled Batch':^14} | {'Unlabeled Batch':^16} | "
                f"{'Train Loss':^11} | {'Labeled Loss':^13} | "
                f"{'Unlabeled Loss':^15} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
            )
            log("-"*130)
        else:
            log(
                f"{'Epoch':^7} | {'Train Batch':^12} | "
                f"{'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
            )
            log("-"*80)

        # measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_unl_loss, batch_lab_loss, batch_counts = 0, 0, 0, 0, 0

        loss_list = []
        unl_loss_list = []
        lab_loss_list = []
        step_list = []
        unl_step_list = []

        # evaluate() leaves the model in eval mode, so switch back each epoch
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            batch_inputs = {k: v.to(CFG.device) for k, v in batch.items()}

            optimizer.zero_grad()
            output = model(**batch_inputs)

            if is_student:
                lab_loss = output.loss
                # Backpropagate per batch (gradient accumulation) so only one
                # computation graph is alive at a time.
                lab_loss.backward()
                lab_loss_value = lab_loss.item()

                unl_loss_value = 0.0
                for _ in range(unl_to_label_batch_ratio):
                    try:
                        unl_batch = next(unl_iterator)
                    except StopIteration:
                        # pseudo-labeled set exhausted: start another pass
                        unl_iterator = iter(unlabeled_dataloader)
                        unl_batch = next(unl_iterator)

                    unl_inputs = tokenizer(
                        list(unl_batch[text_col]),
                        padding="max_length",
                        truncation=True,
                        max_length=CFG.max_seq_len,
                        return_tensors="pt"
                    )
                    unl_inputs = {k: v.to(CFG.device) for k, v in unl_inputs.items()}
                    unl_targets = unl_batch["labels"].to(CFG.device)

                    # The logits stay attached to the graph here. The
                    # original converted them to NumPy first, so the
                    # unlabeled loss never produced any gradient.
                    unl_logits = model(**unl_inputs).logits
                    unl_part = loss_fn(unl_logits, unl_targets) / unl_to_label_batch_ratio
                    unl_part.backward()
                    unl_loss_value += unl_part.item()

                    del unl_inputs, unl_logits, unl_part

                loss_value = lab_loss_value + unl_loss_value
                batch_lab_loss += lab_loss_value
                batch_unl_loss += unl_loss_value

            else:
                loss = output.loss
                loss.backward()
                loss_value = loss.item()

            batch_loss += loss_value
            total_loss += loss_value

            # historic data (running averages since the last printed row)
            loss_list.append(batch_loss/batch_counts)
            step_list.append(step)
            if is_student:
                unl_loss_list.append(batch_unl_loss/batch_counts)
                lab_loss_list.append(batch_lab_loss/batch_counts)
                unl_step_list.append(unl_to_label_batch_ratio*step)

            if CFG.clip_grad:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            progress_bar.update(1)

            if (step % print_each_n_steps == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                # Print training results (column widths match the header)
                if is_student:
                    log(
                        f"{epoch_i + 1:^7} | {step:^14} | {(step*unl_to_label_batch_ratio):^16} | "
                        f"{batch_loss / batch_counts:^11.6f} | "
                        f"{batch_lab_loss / batch_counts:^13.6f} | "
                        f"{batch_unl_loss / batch_counts:^15.6f} | "
                        f"{'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
                    )

                else:
                    log(
                        f"{epoch_i + 1:^7} | {step:^12} | {batch_loss / batch_counts:^12.6f} | "
                        f"{'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
                    )

                batch_loss, batch_lab_loss, batch_unl_loss, batch_counts = 0, 0, 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        if evaluate_during_training:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch

            if is_student:
                log("-"*130)
                log(
                    f"{epoch_i + 1:^7} | {'-':^14} | {'-':^16} | {avg_train_loss:^11.6f} | "
                    f"{'-':^13} | {'-':^15} | {val_loss:^10.6f} | "
                    f"{val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
                )
                log("-"*130)
            else:
                log("-"*80)
                log(
                    f"{epoch_i + 1:^7} | {'-':^12} | {avg_train_loss:^12.6f} | "
                    f"{val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
                )
                log("-"*80)
        log("\n")

        historic_loss["loss"].append(loss_list)
        historic_loss["labeled_loss"].append(lab_loss_list)
        historic_loss["unlabeled_loss"].append(unl_loss_list)
        historic_loss["unl_steps"].append(unl_step_list)
        historic_loss["steps"].append(step_list)

    return historic_loss

In [None]:
def evaluate(model, val_dataloader):
    """Compute the mean loss and accuracy (in percent) over a dataloader.

    Both metrics are weighted by batch size, so a shorter final batch does
    not count as much as a full one (the original averaged per-batch means).
    The model is left in eval mode.

    Args:
        model: Sequence-classification model returning ``.loss``/``.logits``.
        val_dataloader: Batches of tokenized inputs including ``labels``.

    Returns:
        tuple: ``(val_loss, val_accuracy)``; both are NaN when the
        dataloader yields no examples.
    """
    model.eval()

    loss_sum = 0.0
    num_correct = 0
    num_examples = 0

    for batch in val_dataloader:
        batch_inputs = {k: v.to(CFG.device) for k, v in batch.items()}

        with torch.no_grad():
            output = model(**batch_inputs)

        labels = batch_inputs["labels"]
        batch_size = labels.size(0)
        preds = torch.argmax(output.logits, dim=1).flatten()

        # output.loss is the batch mean; scale it back to a per-example sum
        loss_sum += output.loss.item() * batch_size
        num_correct += (preds == labels).sum().item()
        num_examples += batch_size

    if num_examples == 0:
        return float("nan"), float("nan")

    val_loss = loss_sum / num_examples
    val_accuracy = 100.0 * num_correct / num_examples

    return val_loss, val_accuracy

In [None]:
def bert_predict(model, dataloader):
    """Run ``model`` over ``dataloader`` and return probabilities and labels.

    Returns:
        tuple: ``(probs, labels)`` as NumPy arrays, where ``probs`` is the
        softmax over the logits and ``labels`` its argmax per row.
    """
    model.eval()

    collected = []
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: tensor.to(CFG.device) for name, tensor in batch.items()}
            collected.append(model(**on_device).logits)

    stacked_logits = torch.cat(collected, dim=0)
    probs = F.softmax(stacked_logits, dim=1).cpu().numpy()
    labels = probs.argmax(axis=1)

    return probs, labels

In [None]:
def get_metrics(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader``.

    Returns:
        tuple: ``(clf_report, f1, history)``: the sklearn classification
        report, the macro-averaged F1 score, and a dict with the true
        labels, the predicted labels and the two logit columns.
    """
    model.eval()

    gold = []
    logit_chunks = []
    for batch in test_dataloader:
        gold.extend(batch["labels"].detach().cpu().numpy())
        on_device = {name: tensor.to(CFG.device) for name, tensor in batch.items()}

        with torch.no_grad():
            logit_chunks.append(model(**on_device).logits)

    all_logits = torch.cat(logit_chunks, dim=0)
    preds = np.argmax(F.softmax(all_logits, dim=1).cpu().numpy(), axis=1)

    clf_report = classification_report(gold, preds)
    f1 = f1_score(gold, preds, average="macro")

    logits_np = all_logits.detach().cpu().numpy()
    history = {
        "y_true": gold,
        "y_pred": preds.tolist(),
        "logits_0": logits_np[:, 0],
        "logits_1": logits_np[:, 1],
    }

    return clf_report, f1, history

##### Noisy Student Helpers

In [None]:
def noisy_loop(
    train_dataloader,
    dev_dataloader,
    test_dataloader,
    unlabeled_dataloader,
    ):
    """Run the teacher/student loop configured in ``CFG``.

    A teacher is trained on the labeled data. Then, for
    ``CFG.noisy_student_iter`` rounds, the current teacher pseudo-labels the
    unlabeled pool, a new student with increased dropout is trained on
    labeled plus pseudo-labeled data, and the student becomes the teacher.

    Args:
        train_dataloader: Labeled training batches.
        dev_dataloader: Validation batches used after every epoch.
        test_dataloader: Test batches used for the reported metrics.
        unlabeled_dataloader: Batches of ``text``/``text_augmented`` strings.

    Returns:
        dict: ``base_model`` and ``student_<i>`` entries, each holding a
        ``train_history`` and an ``eval_history``.

    Raises:
        RuntimeError: If a round selects no pseudo-labeled samples.
    """
    attention_dropout = CFG.attention_dropout_proba
    classifier_dropout = CFG.classifier_dropout_proba
    confidence_threshold = CFG.min_pred_confidence

    history = {}

    teacher_model, teacher_optimizer, teacher_scheduler = initialize_model(
      train_dataloader=train_dataloader,
      attention_dropout=attention_dropout,
      classifier_dropout=classifier_dropout
    )

    print("before training the teacher:")
    gpu_usage()
    # train teacher (base classifier)
    train_history = train(
        model=teacher_model,
        train_dataloader=train_dataloader,
        optimizer=teacher_optimizer,
        scheduler=teacher_scheduler,
        val_dataloader=dev_dataloader,
        evaluate_during_training=True,
        is_student=False
    )
    # The optimizer references every parameter (plus its own state), so it
    # must go too, otherwise deleting the model later frees nothing.
    del teacher_optimizer, teacher_scheduler
    print("after training the teacher:")
    gpu_usage()

    # eval teacher
    log("Base Classifier Metrics:")
    clf_report, f1, eval_history = get_metrics(teacher_model, test_dataloader)
    log(clf_report)
    log(f"F1 Score: {f1:.4}")

    history["base_model"] = {"train_history": train_history, "eval_history": eval_history}

    for i in range(CFG.noisy_student_iter):
        # get new high confidence samples from augmented data
        augmented_dataloader, num_new_examples_pos, num_new_examples_neg = get_high_confidence_augmented(
            teacher_model,
            unlabeled_dataloader,
            min_confidence = confidence_threshold
        )
        print("after inference")
        gpu_usage()
        train_steps = int(np.ceil(len(train_dataloader.dataset)/CFG.batch_size))
        augmented_steps = int(np.ceil(len(augmented_dataloader.dataset)/CFG.batch_size))
        unl_to_label_batch_ratio = int(np.ceil(augmented_steps/train_steps))
        if unl_to_label_batch_ratio < 1:
            raise RuntimeError("Not enough new samples to train")

        # free teacher model from gpu (no other reference is left by now)
        del teacher_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # add model noise
        attention_dropout += CFG.increase_attention_dropout
        classifier_dropout += CFG.increase_classifier_dropout
        confidence_threshold += CFG.increase_confidence_threshold

        # define student
        student_model, student_optimizer, student_scheduler = initialize_model(
            train_dataloader=train_dataloader,
            attention_dropout=attention_dropout,
            classifier_dropout=classifier_dropout
        )

        print("before training the student")
        gpu_usage()
        # train student
        train_history = train(
            model = student_model,
            train_dataloader = train_dataloader,
            optimizer = student_optimizer,
            scheduler = student_scheduler,
            val_dataloader = dev_dataloader,
            evaluate_during_training = True,
            is_student = True,
            unl_to_label_batch_ratio=unl_to_label_batch_ratio,
            unlabeled_dataloader=augmented_dataloader
        )
        del student_optimizer, student_scheduler
        print("after training the student")
        gpu_usage()
        # eval student
        log("Classifier Metrics:")
        clf_report, f1, eval_history = get_metrics(student_model, test_dataloader)
        log(clf_report)
        log(f"F1 Score: {f1:.4}")

        # The student becomes the next teacher. Drop the second name so that
        # `del teacher_model` in the next round really releases the model.
        teacher_model = student_model
        del student_model

        history[f"student_{i}"] = {
            "train_history": train_history,
            "eval_history": eval_history
        }
        history[f"student_{i}"]["train_history"]["num_new_examples_pos"] = num_new_examples_pos
        history[f"student_{i}"]["train_history"]["num_new_examples_neg"] = num_new_examples_neg

    return history


In [None]:
def get_high_confidence_augmented(model, unlabeled_dataloader, min_confidence):
    """Pseudo-label the unlabeled pool and keep a class-balanced confident subset.

    Args:
        model: Classifier used as the teacher.
        unlabeled_dataloader: Batches with ``text`` and ``text_augmented``
            string lists.
        min_confidence: Minimum softmax probability for a prediction to be
            selected.

    Returns:
        tuple: ``(augmented_dataloader, amnt_new_samples_pos,
        amnt_new_samples_neg)``, where the dataloader randomly samples the
        selected examples together with their pseudo-labels.
    """
    texts = []
    augmented = []
    logits = []

    model.to(CFG.device)
    # Predict with dropout disabled, then restore the caller's mode.
    was_training = model.training
    model.eval()
    for unl_batch in tqdm(unlabeled_dataloader):
        # tokenize unlabeled batch
        unl_inputs = tokenizer(
            list(unl_batch["text"]),
            padding="max_length",
            truncation=True,
            max_length=CFG.max_seq_len,
            return_tensors='pt'
        )

        # get model predictions
        batch_inputs = {k: v.to(CFG.device) for k, v in unl_inputs.items()}
        with torch.no_grad():
            unl_logits = model(**batch_inputs).logits

        logits.append(unl_logits.cpu().numpy())
        texts.extend(unl_batch["text"])
        augmented.extend(unl_batch["text_augmented"])
    model.train(was_training)

    logits = np.concatenate(logits)
    # get all examples with high confidence
    unl_softmax = softmax(logits, axis=1)
    positive_pool = np.where(unl_softmax[:, 1] >= min_confidence)[0]
    negative_pool = np.where(unl_softmax[:, 0] >= min_confidence)[0]

    # select same amount of positives and negatives (limited by the class with least examples)
    size = min(len(positive_pool), len(negative_pool))

    if size == 0:
        # At least one class has no confident prediction: fall back to up to
        # 10.000 random samples per predicted class.
        size = 10000
        positive_pool = np.where(unl_softmax[:, 1] > 0.5)[0]
        negative_pool = np.where(unl_softmax[:, 0] > 0.5)[0]

    def _sample(pool):
        # Never request more indices than the pool holds; the original
        # fallback asked for 10.000 without replacement and raised on
        # smaller (or empty) pools.
        count = min(size, len(pool))
        if count == 0:
            return pool
        return np.random.choice(pool, size=count, replace=False)

    # negatives are drawn first, as in the original, to keep the RNG order
    high_confidence_negative_idxs = _sample(negative_pool)
    high_confidence_positive_idxs = _sample(positive_pool)
    high_confidence_idxs = np.append(
        high_confidence_positive_idxs,
        high_confidence_negative_idxs
    )

    # get selected elements from each data field by their idxs
    selected_text_augmented = list(
        map(augmented.__getitem__, high_confidence_idxs.tolist())
    )
    selected_text = list(map(texts.__getitem__, high_confidence_idxs.tolist()))
    selected_label = np.argmax(unl_softmax[high_confidence_idxs], axis=1)
    selected_confidence = np.max(unl_softmax[high_confidence_idxs], axis=1)

    augmented_df = pd.DataFrame({"text": selected_text, "text_augmented": selected_text_augmented, "label": selected_label, "confidence": selected_confidence})

    # The labels must be the ones of the SELECTED rows, in the same order.
    # The original passed the predictions for the whole pool, which paired
    # the selected texts with unrelated labels.
    augmentedset = AugmentedDataset(augmented_df, labels=selected_label.tolist())

    augmented_sampler = RandomSampler(augmentedset)
    augmented_dataloader = DataLoader(
        augmentedset,
        sampler=augmented_sampler,
        batch_size=CFG.batch_size
    )
    amnt_new_samples_pos = len(augmented_df[augmented_df["label"] == 1])
    amnt_new_samples_neg = len(augmented_df[augmented_df["label"] == 0])
    log(
        "Added to train set:\n"
        f"\tNew + samples: {amnt_new_samples_pos}\n"
        f"\tNew - Samples: {amnt_new_samples_neg}"
    )

    return augmented_dataloader, amnt_new_samples_pos, amnt_new_samples_neg


#### General 

In [None]:
def log(text):
    """Print ``text`` and append it, newline-terminated, to the global ``logfile``."""
    print(text)
    line = "".join((text, "\n"))
    logfile.write(line)

In [None]:
def set_seed(seed_value):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

## 2. Experiment 

### 2.1 - Setup

#### Configuration

In [None]:
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # experiment parameters
    dataset_name = "measuring_hate_speech"
    experiment_id = "01"
    # evaluated once, when the class body is executed
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    seed = 42
    data_path = "drive/MyDrive/NoisyToxic/data/"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    num_labels = 2
    few_shot = True
    num_split = 0
    # bert parameters
    pretrained_bert_name = "bert-base-cased"
    num_train_epochs = 2
    batch_size = 32
    max_seq_len = 128
    learning_rate = 5e-5
    warmup_ratio = 0.15
    weight_decay = 1e-2
    classifier_dropout_proba = 0.1
    attention_dropout_proba = 0.1
    clip_grad = True
    # noisystudent parameters
    increase_classifier_dropout = 0.15
    increase_attention_dropout = 0.15
    increase_confidence_threshold = 0.0
    noisy_student_iter = 3
    min_pred_confidence = 0.9
    augmented_data = True

    @staticmethod
    def to_json():
        """Return the configuration as a JSON-serializable dict.

        Declared as a staticmethod so it works on the class
        (``CFG.to_json()``) and on instances alike; the original only worked
        on the class.
        """
        return {
            "dataset_name": CFG.dataset_name,
            "experiment_id": CFG.experiment_id,
            "timestamp": CFG.timestamp,
            "seed": CFG.seed,
            "data_path": CFG.data_path,
            "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu',
            "num_labels" : CFG.num_labels,
            "few_shot": CFG.few_shot,
            # NOTE(review): key is spelled "num_splt"; kept unchanged because
            # already-saved logs/pickles may be read back with this spelling.
            "num_splt": CFG.num_split,
            "pretrained_bert_name": CFG.pretrained_bert_name,
            "num_train_epochs": CFG.num_train_epochs,
            "batch_size": CFG.batch_size,
            "max_seq_len": CFG.max_seq_len,
            "learning_rate": CFG.learning_rate,
            "warmup_ratio": CFG.warmup_ratio,
            "weight_decay": CFG.weight_decay,
            "classifier_dropout_proba": CFG.classifier_dropout_proba,
            "attention_dropout_proba": CFG.attention_dropout_proba,
            "clip_grad": CFG.clip_grad,
            "increase_classifier_dropout": CFG.increase_classifier_dropout,
            "increase_attention_dropout": CFG.increase_attention_dropout,
            "increase_confidence_threshold": CFG.increase_confidence_threshold,
            "noisy_student_iter": CFG.noisy_student_iter,
            "min_pred_confidence": CFG.min_pred_confidence,
            "augmented_data" : CFG.augmented_data,
        }

In [None]:
# Open the experiment log under <data_path>/logs/<dataset>[/few_shot]/<experiment_id>/.
fname = "experiment.log"
exp_path = os.path.join(CFG.data_path, "logs", CFG.dataset_name)
if CFG.few_shot:
    exp_path = os.path.join(exp_path, "few_shot", CFG.experiment_id)
else:
    exp_path = os.path.join(exp_path, CFG.experiment_id)

# makedirs also creates missing parent directories (os.mkdir fails on the
# nested few_shot/<id> path) and tolerates an already existing directory.
os.makedirs(exp_path, exist_ok=True)
# Left open on purpose: log() writes to this handle until it is closed
# explicitly later in the notebook.
logfile = open(os.path.join(exp_path, fname), "w", encoding="utf-8")
log(json.dumps(CFG.to_json(), indent=4)[1:-1])

In [None]:
# Seed all RNGs before any data is loaded or any model is created.
set_seed(CFG.seed)

#### Data

##### Loading Datasets

In [None]:
# Use the configured few-shot split instead of a hard-coded 0, so the loaded
# data always matches what CFG.to_json() records.
train_df, dev_df, test_df, unlabeled_df = load_dataset(CFG.few_shot, CFG.num_split)

##### Tokenizing

In [None]:
# BERTweet is loaded with normalize=True; every other checkpoint uses the
# tokenizer defaults.
tokenizer = (
    AutoTokenizer.from_pretrained(CFG.pretrained_bert_name, normalize=True)
    if CFG.pretrained_bert_name == "vinai/bertweet-base"
    else AutoTokenizer.from_pretrained(CFG.pretrained_bert_name)
)

In [None]:
# Column 0 holds the text and column 1 the label, whatever their names are.
tokenized_train = tokenizer(
    train_df.iloc[:, 0].astype("str").tolist(),
    max_length=CFG.max_seq_len,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
trainset = BertDataset(tokenized_train, labels=train_df.iloc[:, 1].tolist())

In [None]:
if dev_df is not None:
    tokenized_dev = tokenizer(
        dev_df.iloc[:, 0].astype("str").to_list(),
        truncation=True,
        padding="max_length",
        max_length=CFG.max_seq_len,
        return_tensors="pt"
    )
    devset = BertDataset(tokenized_dev, labels=dev_df.iloc[:, 1].to_list())

In [None]:
# Column 0 holds the text and column 1 the label, whatever their names are.
tokenized_test = tokenizer(
    test_df.iloc[:, 0].astype("str").tolist(),
    max_length=CFG.max_seq_len,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
testset = BertDataset(tokenized_test, labels=test_df.iloc[:, 1].tolist())

In [None]:
# Raw (untokenized) unlabeled pool; no labels are passed at this point.
unlabeledset = AugmentedDataset(unlabeled_df)

##### Dataloaders

In [None]:
# Training and unlabeled data are drawn in random order; dev/test are
# iterated in their stored order.
train_sampler = RandomSampler(trainset)
train_dataloader = DataLoader(
    trainset,
    batch_size=CFG.batch_size,
    sampler=train_sampler,
)

test_sampler = SequentialSampler(testset)
test_dataloader = DataLoader(
    testset,
    batch_size=CFG.batch_size,
    sampler=test_sampler,
)

if dev_df is None:
    # no validation split available: validate on the test loader instead
    dev_dataloader = test_dataloader
else:
    dev_sampler = SequentialSampler(devset)
    dev_dataloader = DataLoader(
        devset,
        batch_size=CFG.batch_size,
        sampler=dev_sampler,
    )

unlabeled_sampler = RandomSampler(unlabeledset)
unlabeled_dataloader = DataLoader(
    unlabeledset,
    batch_size=CFG.batch_size,
    sampler=unlabeled_sampler,
)

### 2.2 - Classification

In [None]:
# Empty frame that the per-dataset loop at the end of the notebook fills
# with text columns plus one label/confidence column pair per dataset.
labeled_df = pd.DataFrame(columns=["text", "text_augmented"])

In [None]:
# Run the full teacher/student experiment and keep its history for saving.
history = noisy_loop(
    train_dataloader=train_dataloader,
    dev_dataloader=dev_dataloader,
    test_dataloader=test_dataloader,
    unlabeled_dataloader=unlabeled_dataloader,
)

In [None]:
# Close the experiment log; any later log() call writes to a closed file.
logfile.close()

In [None]:
# Store the run history next to the experiment log. Reusing exp_path keeps
# the few_shot subdirectory; the original rebuilt the path without it and so
# pointed at a directory that is never created for few-shot runs.
history["configs"] = CFG.to_json()
fname = "experiment.pkl"
# "ab" appends: re-running this cell adds another pickle record to the file.
with open(os.path.join(exp_path, fname), "ab") as pickle_file:
    pickle.dump(history, pickle_file)

# Train a base classifier on each dataset and pseudo-label the unlabeled pool.
# NOTE(review): load_dataset()/log() write to `logfile`; if the cell that
# closes it has already been run, it must be reopened before this loop.
for dataset in ["olidv1", "measuring_hate_speech", "davidson", "convabuse"]:
    print(dataset)
    CFG.dataset_name = dataset
    train_df, dev_df, test_df, unlabeled_df = load_dataset()

    if CFG.pretrained_bert_name == "vinai/bertweet-base":
        tokenizer = AutoTokenizer.from_pretrained(
            CFG.pretrained_bert_name,
            normalize=True
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_bert_name)

    tokenized_train = tokenizer(
        train_df.iloc[:, 0].astype("str").to_list(),
        truncation=True,
        padding="max_length",
        max_length=CFG.max_seq_len,
        return_tensors="pt"
    )
    trainset = BertDataset(tokenized_train, labels=train_df.iloc[:, 1].to_list())

    if dev_df is not None:
        tokenized_dev = tokenizer(
            dev_df.iloc[:, 0].astype("str").to_list(),
            truncation=True,
            padding="max_length",
            max_length=CFG.max_seq_len,
            return_tensors="pt"
        )
        devset = BertDataset(tokenized_dev, labels=dev_df.iloc[:, 1].to_list())

    tokenized_test = tokenizer(
        test_df.iloc[:, 0].astype("str").to_list(),
        truncation=True,
        padding="max_length",
        max_length=CFG.max_seq_len,
        return_tensors="pt"
    )
    testset = BertDataset(tokenized_test, labels=test_df.iloc[:, 1].to_list())
    unlabeledset = AugmentedDataset(unlabeled_df)

    train_sampler = RandomSampler(trainset)
    train_dataloader = DataLoader(trainset, sampler=train_sampler, batch_size=CFG.batch_size)

    test_sampler = SequentialSampler(testset)
    test_dataloader = DataLoader(testset, sampler=test_sampler, batch_size=CFG.batch_size)

    if dev_df is not None:
        dev_sampler = SequentialSampler(devset)
        dev_dataloader = DataLoader(devset, sampler=dev_sampler, batch_size=CFG.batch_size)
    else:
        dev_dataloader = test_dataloader

    # The unlabeled pool is read in its stored order here (not shuffled).
    unlabeled_sampler = SequentialSampler(unlabeledset)
    unlabeled_dataloader = DataLoader(unlabeledset, sampler=unlabeled_sampler, batch_size=CFG.batch_size)

    # Create the model only now: the scheduler length is derived from
    # len(train_dataloader). The original called this before rebuilding the
    # loader and therefore used the PREVIOUS dataset's step count.
    teacher_model, teacher_optimizer, teacher_scheduler = initialize_model(
        train_dataloader=train_dataloader,
        attention_dropout=0.1,
        classifier_dropout=0.1
    )

    train_history = train(
        model=teacher_model,
        train_dataloader=train_dataloader,
        optimizer=teacher_optimizer,
        scheduler=teacher_scheduler,
        val_dataloader=dev_dataloader,
        evaluate_during_training=True,
        is_student=False
    )

    # eval teacher
    log("Base Classifier Metrics:")
    clf_report, f1, eval_history = get_metrics(teacher_model, test_dataloader)
    log(clf_report)
    log(f"F1 Score: {f1:.4}")

    # get new high confidence samples from augmented data
    # NOTE(review): get_high_confidence_augmented returns a tuple
    # (dataloader, n_pos, n_neg), not a DataFrame, so the column lookups
    # below raise a TypeError as written. They look like leftovers from an
    # earlier version that returned the prediction frame -- confirm the
    # intended output before relying on this cell.
    augmented_df = get_high_confidence_augmented(
        teacher_model,
        unlabeled_dataloader,
        min_confidence = 0.8
    )

    labeled_df["text"] = augmented_df["text"]
    labeled_df["text_augmented"] = augmented_df["text_augmented"]
    labeled_df[f"{dataset}_label"] = augmented_df["label"]
    labeled_df[f"{dataset}_confidence"] = augmented_df["confidence"]
