In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import os
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the English training file and the Task A English test file; both paths
# are relative to the notebook's working directory.
data_en = pd.read_csv(filepath_or_buffer='train.En.csv')
test_A_en = pd.read_csv(filepath_or_buffer='task_A_En_test.csv')

In [None]:
# Remove rows with a missing text field. Note the column is named 'tweet' in the
# training frame but 'text' in the test frame.
data_en = data_en.dropna(axis='index', subset=['tweet'])
test_A_en = test_A_en.dropna(axis='index', subset=['text'])

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.optim import Adam
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification,AutoTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, WeightedRandomSampler
from transformers import  get_linear_schedule_with_warmup
import torch.nn.functional as F

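Reference for handling imbalanced classes by weighting each sample with the inverse of its class frequency (the approach implemented by `generate_sampler` below): https://discuss.pytorch.org/t/how-to-handle-imbalanced-classes/11264/2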

In [None]:
# Tokenizer and classification model are loaded from the same Hugging Face
# checkpoint so the vocabulary matches the model's embeddings.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-irony")

# Pull the raw tweets and their 'sarcastic' labels out of the frame as plain
# Python lists for the tokenizer / tensor construction in the next cell.
texts = data_en['tweet'].to_list()
labels = data_en['sarcastic'].to_list()

def generate_sampler(labels):
  """Build a sampler that draws each class with (approximately) equal probability.

  Every sample is weighted by the inverse of the number of samples sharing its
  label, so rare classes are over-sampled and frequent ones under-sampled.

  Args:
    labels: 1-D collection of class labels (list, NumPy array or torch tensor).
      Labels may be arbitrary values that ``np.unique`` can sort; they do NOT
      have to be the contiguous integers ``0..K-1``.

  Returns:
    A ``WeightedRandomSampler`` that yields ``len(labels)`` indices per epoch.

  Raises:
    ValueError: if ``labels`` is empty.
  """
  if isinstance(labels, torch.Tensor):
    # np.asarray cannot read CUDA / grad-tracking tensors directly.
    labels = labels.detach().cpu().numpy()
  labels = np.asarray(labels).ravel()
  if labels.size == 0:
    raise ValueError("labels must contain at least one sample")

  # `inverse` maps each sample to the position of its label in the sorted
  # unique-label array, which is also its position in `class_sample_count`.
  # Indexing the weights by `inverse` (rather than by the raw label value) is
  # what makes non-contiguous / non-integer labels work.
  _, inverse, class_sample_count = np.unique(
      labels, return_inverse=True, return_counts=True)
  weight = 1. / class_sample_count
  samples_weight = torch.from_numpy(weight[inverse])
  sampler = WeightedRandomSampler(samples_weight.double(), len(samples_weight))
  return sampler



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Seed the RNG so the train/validation split below is identical on every
# Restart & Run All. The global seed also drives the training shuffle; exact
# run-to-run equality of GPU results is not guaranteed.
SEED = 42
torch.manual_seed(SEED)

# Tokenize every tweet in a single call. padding=True pads to the longest
# sequence in this call; truncation caps sequences at max_length tokens.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']

# as_tensor (instead of torch.tensor) keeps this cell re-runnable: on a second
# run `labels` is already a tensor and is reused without a copy-construct warning.
labels = torch.as_tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)

# 80 % train / 20 % validation, split with a dedicated seeded generator so the
# partition does not depend on how much of the global RNG was consumed earlier.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED))

batch_size = 64

# Plain shuffling for training, fixed order for validation.
# Class-balanced alternative for training (not used here):
#   DataLoader(train_dataset, sampler=generate_sampler(train_dataset[:][2]), batch_size=batch_size)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


optimizer =  Adam(model.parameters(), lr=1e-5)


# The scheduler is stepped once per batch, so the total number of steps is
# batches-per-epoch times epochs; the learning rate decays linearly with no warm-up.
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Helper Function for Accuracy Calculation
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max prediction equals its label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of true class ids; it is flattened before comparing.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

# Training Loop
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # ---- Training pass ----
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # Batches follow the TensorDataset column order:
        # (input_ids, attention_mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update; the
        # learning-rate scheduler advances once per batch.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # ---- Validation pass ----
    # Collect logits and labels for the WHOLE validation set and score them
    # once. Averaging per-batch accuracies would over-weight the (usually
    # smaller) final batch and so misreport the true accuracy.
    model.eval()
    val_logits = []
    val_labels = []

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        val_logits.append(outputs.logits.detach().cpu().numpy())
        # Labels are only compared on the CPU, so they never need to visit the device.
        val_labels.append(batch[2].cpu().numpy())

    if val_labels:
        eval_accuracy = flat_accuracy(np.concatenate(val_logits), np.concatenate(val_labels))
        print("  Validation Accuracy: {0:.2f}".format(eval_accuracy))
    else:
        # An empty validation loader previously crashed with ZeroDivisionError.
        print("  Validation Accuracy: n/a (validation set is empty)")

print("Training complete!")


  Batch     0  of     44.
  Batch    40  of     44.
  Average training loss: 0.56
  Validation Accuracy: 0.78
  Batch     0  of     44.
  Batch    40  of     44.
  Average training loss: 0.46
  Validation Accuracy: 0.79
  Batch     0  of     44.
  Batch    40  of     44.
  Average training loss: 0.42
  Validation Accuracy: 0.79
  Batch     0  of     44.
  Batch    40  of     44.
  Average training loss: 0.39
  Validation Accuracy: 0.78
Training complete!


In [None]:
import torch
from sklearn.metrics import classification_report, accuracy_score
from torch.nn.functional import softmax

# Evaluation only: put the model in eval mode before scoring the test set.
model.eval()

texts_test = test_A_en['text'].tolist()
labels_test = test_A_en['sarcastic'].tolist()


inputs_test = tokenizer(texts_test, padding=True, truncation=True, return_tensors="pt", max_length=512)
input_ids_test = inputs_test['input_ids']
attention_mask_test = inputs_test['attention_mask']
labels_test = torch.tensor(labels_test)


test_dataset = TensorDataset(input_ids_test, attention_mask_test, labels_test)
batch_size = 128
# SequentialSampler keeps predictions in the same order as the rows of
# `test_A_en`, so `pred_labels_test[i]` belongs to row i. A random sampler gives
# the same metrics but scrambles that alignment on every run.
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)


true_labels_test = []
pred_labels_test = []

with torch.no_grad():
    for batch in test_dataloader:
        # Batch-local names: do not overwrite the full-dataset tensors above.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2]  # only compared on the CPU; no device transfer needed

        outputs_test = model(b_input_ids, attention_mask=b_input_mask)

        # Convert model logits to class probabilities using softmax
        probs_test = softmax(outputs_test.logits, dim=1)

        # Get the predicted labels
        preds_test = torch.argmax(probs_test, dim=1)

        true_labels_test.extend(b_labels.cpu().tolist())
        pred_labels_test.extend(preds_test.cpu().tolist())

accuracy = accuracy_score(y_true=true_labels_test, y_pred=pred_labels_test)
print(f'Accuracy: {accuracy:.3f}')

# Per-label accuracy: accuracy restricted to the examples whose true label is
# `label`. Sorted so the report order is deterministic.
unique_labels = sorted(set(true_labels_test))
true_arr = np.asarray(true_labels_test)
pred_arr = np.asarray(pred_labels_test)

for label in unique_labels:
    label_mask = true_arr == label
    # Separate name so the overall `accuracy` above is not overwritten.
    label_accuracy = accuracy_score(true_arr[label_mask], pred_arr[label_mask])
    print(f'Accuracy for label {label}: {label_accuracy:.3f}')
report = classification_report(true_labels_test, pred_labels_test, zero_division=0)
print(report)


Accuracy: 0.824
Accuracy for label 0: 0.867
Accuracy for label 1: 0.570
              precision    recall  f1-score   support

           0       0.92      0.87      0.89      1200
           1       0.42      0.57      0.48       200

    accuracy                           0.82      1400
   macro avg       0.67      0.72      0.69      1400
weighted avg       0.85      0.82      0.84      1400

