<a href="https://colab.research.google.com/github/JAugusto97/noisystudentNLP/blob/main/Torch_NoisyToxic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

In [None]:
# Shell escape: print the runtime's GPU status before anything is loaded.
!nvidia-smi

In [None]:
# Install the extra third-party packages into the runtime (quiet output).
# NOTE(review): versions are unpinned, so results may drift between runs; consider
# `%pip install` with pinned versions so the install targets the kernel's environment.
!pip install transformers emoji --quiet

In [None]:
from google.colab import drive, files
# Mount Google Drive at /content/drive (Colab only). CFG.datasets_path is a relative
# "drive/MyDrive/..." path, so it presumably relies on the working directory being
# /content -- TODO confirm.
drive.mount('/content/drive')

In [None]:
import os
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AdamW, get_scheduler
import torch
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from sklearn.metrics import classification_report, f1_score, accuracy_score
import numpy as np
import logging
from datetime import datetime
import json

# Module-level logger. Propagation is switched off so records are not also handed to
# the handlers of ancestor loggers (e.g. the root logger).
logger = logging.getLogger(__name__)
logger.propagate = False

In [None]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, the CPU otherwise.
# Every later cell moves models and tensors with `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
class CFG:
    """Experiment configuration, stored as class-level constants.

    Cells read the values straight off the class (``CFG.seed``,
    ``CFG.batch_size``, ...); ``to_json`` snapshots them for logging.
    """

    # Pretrained checkpoint name handed to the tokenizer and to initialize_model.
    # pretrained_bert_name = "vinai/bertweet-base"
    pretrained_bert_name = "distilbert-base-cased"
    seed = 7  # passed to set_seed before the models are built
    datasets_path = "drive/MyDrive/NoisyToxic/data/"  # passed to load_olid
    steps = 3  # only referenced by a commented-out loop in the visible cells
    # Minimum teacher softmax probability for an unlabeled example to be pseudo-labeled.
    min_confidence = 0.8
    hidden_dim = 128  # forwarded to initialize_model
    train_epochs = 1  # forwarded to initialize_model and train
    do_lower_case = False  # not read by the visible cells -- presumably a tokenization option, TODO confirm
    batch_size = 8  # batch size of the labeled train/test loaders
    max_seq_len = 32  # not read by the visible cells -- presumably used by batch_tokenize, TODO confirm
    dropout_proba = 0.1  # dropout used for the teacher; the student is built with a higher value
    increase_dropout_step = 0.1  # presumably the extra dropout meant for the student -- TODO confirm
    # Evaluated once, when the class body runs; also used to name the experiment log file.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    @staticmethod
    def to_json():
        """Return the configuration as a plain dict suitable for ``json.dumps``.

        Declared as a staticmethod so that both ``CFG.to_json()`` and
        ``CFG().to_json()`` work; without the decorator the instance call fails
        because the function takes no ``self`` argument.

        Returns:
            dict: one entry per configuration field, keyed by the field name.
        """
        return {
            "pretrained_bert_name": CFG.pretrained_bert_name,
            "seed": CFG.seed,
            "datasets_path": CFG.datasets_path,
            "steps": CFG.steps,
            "hidden_dim": CFG.hidden_dim,
            "min_confidence": CFG.min_confidence,
            "train_epochs": CFG.train_epochs,
            "do_lower_case": CFG.do_lower_case,
            "batch_size": CFG.batch_size,
            "max_seq_len": CFG.max_seq_len,
            "dropout_proba": CFG.dropout_proba,
            "increase_dropout_step": CFG.increase_dropout_step,
            "timestamp": CFG.timestamp
        }

In [None]:
# Open this run's log file, named after the config timestamp, in the working
# directory, then emit the full configuration through plog (defined outside this
# view -- presumably it writes to `log`, which is why the handle stays open).
fname = "EXP_" + CFG.timestamp + ".log"
log = open(os.path.join(fname), "w")
plog(json.dumps(CFG.to_json(), indent=4))

In [None]:
# Load the labeled train/test frames and the unlabeled pool from CFG.datasets_path.
# load_olid is defined outside this view; later cells read the "text" and "toxic"
# columns of the two labeled frames.
train_df, test_df, unlabeled_df = load_olid(CFG.datasets_path)

In [None]:
# Tokenizer belonging to the pretrained checkpoint named in CFG. The commented-out
# keyword arguments are left disabled.
tokenizer = AutoTokenizer.from_pretrained(
    CFG.pretrained_bert_name,
    # use_fast=True,
    # normalize=True,
)

In [None]:
# Labeled splits: tokenize the raw text up front and pair it with the "toxic" labels.
tokenized_train = batch_tokenize(train_df["text"].to_list())
trainset = OlidDataset(
    tokenized_train,
    labels=train_df["toxic"].to_list(),
)

tokenized_test = batch_tokenize(test_df["text"].to_list())
testset = OlidDataset(
    tokenized_test,
    labels=test_df["toxic"].to_list(),
)

# The unlabeled pool is wrapped as-is; every row gets a placeholder label of 0.
unlabeledset = UnlabeledDataset(unlabeled_df, labels=[0] * len(unlabeled_df))

In [None]:
# Labeled loaders: shuffle for training; the test loader keeps dataset order so its
# predictions can be compared row-by-row against test_df["toxic"].
train_dataloader = DataLoader(trainset, shuffle=True, batch_size=CFG.batch_size)
test_dataloader = DataLoader(testset, batch_size=CFG.batch_size)

# Size the unlabeled batches so the unlabeled loader yields about as many batches as
# there are full labeled batches. Both floor divisions can reach zero (fewer labeled
# rows than one batch, or a small unlabeled pool), which would raise
# ZeroDivisionError or hand DataLoader an invalid batch size, so each is clamped to
# at least 1 -- the same guard the pseudo-labeling cell applies to its own loader.
unl_batch_size = max(1, len(unlabeled_df) // max(1, len(train_df) // CFG.batch_size))
unlabeled_dataloader = DataLoader(unlabeledset, shuffle=True, batch_size=unl_batch_size)

In [None]:
# Loss object and seeding. loss_fn is not passed to train() below -- presumably
# train() reads it as a global (TODO confirm). The seed is set before either model
# is built, and the build order (student, then teacher) is kept because it
# determines how the seeded RNG is consumed.
loss_fn = nn.CrossEntropyLoss()
set_seed(CFG.seed)

# The student is built with more dropout than the teacher. The extra amount comes
# from CFG.increase_dropout_step instead of a hard-coded 0.1, so changing that
# setting (which is also written to the experiment log) actually takes effect.
student_model, student_optimizer, student_scheduler = initialize_model(
    pretrained_bert_name=CFG.pretrained_bert_name,
    hidden_dim=CFG.hidden_dim,
    n_labels=2,
    dropout_proba=CFG.dropout_proba + CFG.increase_dropout_step,
    epochs=CFG.train_epochs,
)

teacher_model, teacher_optimizer, teacher_scheduler = initialize_model(
    pretrained_bert_name=CFG.pretrained_bert_name,
    hidden_dim=CFG.hidden_dim,
    n_labels=2,
    dropout_proba=CFG.dropout_proba,
    epochs=CFG.train_epochs,
)

# Train the teacher on the labeled data only, validating on the test loader.
# for i in range(CFG.steps):
teacher_model.to(device)
teacher_model.train()
train(
    model=teacher_model,
    train_dataloader=train_dataloader,
    epochs=CFG.train_epochs,
    optimizer=teacher_optimizer,
    scheduler=teacher_scheduler,
    val_dataloader=test_dataloader,
    evaluate_during_training=True,
    is_student=False,
)

In [None]:
# Score the trained teacher on the test split. eval() puts the model in inference
# mode before predicting; the predictions are compared against test_df["toxic"],
# which relies on the test loader preserving row order.
teacher_model.eval()
probas, preds = bert_predict(teacher_model, test_dataloader)
print(classification_report(test_df["toxic"], preds))

In [None]:
# Pseudo-label the unlabeled pool with the trained teacher. For every unlabeled
# batch, keep the examples the teacher is confident about (softmax probability
# >= CFG.min_confidence), down-sampling confident negatives to the number of
# confident positives, and collect their text, augmented text and predicted label.
texts = []
labels = []
augmented = []

# Inference only: make sure the teacher is in eval mode and skip autograd
# bookkeeping, which would otherwise keep activations alive for every batch.
teacher_model.eval()
with torch.no_grad():
    for unl_batch in unlabeled_dataloader:
        unl_inputs = batch_tokenize(unl_batch["text"])

        unl_input_ids = torch.LongTensor(unl_inputs['input_ids']).to(device)
        unl_attention_mask = torch.LongTensor(unl_inputs['attention_mask']).to(device)
        unl_logits = teacher_model(unl_input_ids, unl_attention_mask)
        # Explicit class axis; the result is indexed as [row, class] below, so this
        # is the axis the deprecated implicit-dim call resolved to for 2-D logits.
        unl_softmax = F.softmax(unl_logits, dim=1).cpu().detach().numpy()

        high_confidence_positive_idxs = np.where(unl_softmax[:, 1] >= CFG.min_confidence)[0]  # high confidence positive preds
        high_confidence_negative_idxs = np.where(unl_softmax[:, 0] >= CFG.min_confidence)[0]

        # Sampling without replacement cannot draw more items than exist, so cap the
        # request at the number of confident negatives. When negatives outnumber
        # positives (the balanced case) this is the same draw as before; otherwise
        # all confident negatives and all confident positives are kept.
        n_negatives = min(len(high_confidence_positive_idxs), len(high_confidence_negative_idxs))
        high_confidence_negative_idxs = np.random.choice(
            high_confidence_negative_idxs, size=n_negatives, replace=False
        )

        high_confidence_idxs = np.append(high_confidence_positive_idxs, high_confidence_negative_idxs)
        selected = high_confidence_idxs.tolist()

        high_confidence_augmented = [unl_batch["text_augmented"][idx] for idx in selected]
        high_confidence_text = [unl_batch["text"][idx] for idx in selected]
        unl_labels = np.argmax(unl_softmax[high_confidence_idxs], axis=1)

        texts.extend(high_confidence_text)
        labels.extend(unl_labels)
        augmented.extend(high_confidence_augmented)

df = pd.DataFrame({"text": texts, "text_augmented": augmented})
processed_dataset = UnlabeledDataset(df, labels=labels)

# Aim for about one pseudo-labeled batch per full labeled batch; clamp both floor
# divisions to at least 1 so neither a tiny labeled set nor a small selection can
# produce a zero divisor or a zero batch size.
unl_batch_size = max(1, len(df) // max(1, len(train_df) // CFG.batch_size))

augmented_dataloader = DataLoader(processed_dataset, shuffle=True, batch_size=unl_batch_size)

In [None]:
# Release the teacher before the student is trained. The optimizer returned alongside
# the model presumably references the teacher's parameters (and the scheduler wraps
# that optimizer), so deleting only `teacher_model` would leave the weights reachable
# and free nothing; drop all three teacher names together.
# NOTE(review): confirm no later cell still needs teacher_optimizer / teacher_scheduler.
del teacher_model, teacher_optimizer, teacher_scheduler

In [None]:
# Train the student: same labeled loader and validation loader as the teacher, plus
# the pseudo-labeled loader built from the teacher's confident predictions.
student_model.to(device)
student_model.train()
train(
    model=student_model,
    optimizer=student_optimizer,
    scheduler=student_scheduler,
    epochs=CFG.train_epochs,
    train_dataloader=train_dataloader,
    unlabeled_dataloader=augmented_dataloader,
    val_dataloader=test_dataloader,
    evaluate_during_training=True,
    is_student=True,
)

In [None]:
# Predict on the test split with the trained student (eval mode first). The cell's
# displayed output is the raw predictions returned by bert_predict.
student_model.eval()
probas, pred = bert_predict(student_model, test_dataloader)
pred

In [None]:
# Per-class precision/recall/F1 of the student's predictions against test_df["toxic"].
print(classification_report(test_df["toxic"], pred))