In [None]:
!pip install 'transformers[torch]' torch

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np
import random
import gc

from sklearn.metrics import accuracy_score, f1_score

def set_seeds(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seeds(393)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_metrics(preds, labels):
    acc = accuracy_score(labels, preds)
    f1_micro = f1_score(labels, preds, average='micro')
    f1_macro = f1_score(labels, preds, average='macro')
    print ('jacc acc:{}, f1 micro score:{} f1 macro score:{}'.format(acc, f1_micro, f1_macro))
    return acc, f1_micro, f1_macro

In [3]:
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/data/abusexlmr')
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/data/abusexlmr', num_labels=2)
model.cuda()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [20]:
train_args = TrainingArguments(
        'outputs',
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate = 2e-5,
        per_device_train_batch_size = 10,
        per_device_eval_batch_size = 10,
        num_train_epochs = 10,
        weight_decay = 0.01,
        load_best_model_at_end = True,
        metric_for_best_model = 'f1_macro'
    )

In [21]:
class MACDataset(Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [22]:
def prepare_dataset(df):
  return MACDataset(text=df.text.to_numpy(), labels=df.label_yn.to_numpy(), tokenizer=tokenizer, max_len=128)

In [23]:
train_df = pd.read_csv('/content/drive/MyDrive/data/telugu/final_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/data/telugu/final_test.csv')
val_df = pd.read_csv('/content/drive/MyDrive/data/telugu/final_val.csv')

In [24]:
train_ds = prepare_dataset(train_df)
test_ds = prepare_dataset(test_df)
val_ds = prepare_dataset(val_df)

In [25]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc, f1_micro, f1_macro = get_metrics(preds, labels)
    print(f"accuracy: {acc}, f1_macro: {f1_macro}, f1_micro: {f1_micro}")
    #return {'accuracy': acc, "f1_macro": f1_macro, "f1_micro": f1_micro}
    return {'f1_macro':f1_macro, 'accuracy':acc}

In [26]:
trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset = train_ds,
        eval_dataset = val_ds,
        tokenizer = tokenizer,
        compute_metrics = compute_metrics
    )

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Macro,Accuracy
1,0.1599,0.24357,0.917834,0.931335
2,0.1253,0.326668,0.923353,0.934805
3,0.0881,0.336456,0.920009,0.932387
4,0.0628,0.452032,0.917415,0.93081
5,0.0386,0.531608,0.908322,0.922608
6,0.0343,0.516352,0.917709,0.930179
7,0.0174,0.55239,0.918371,0.930494
8,0.0181,0.529292,0.920994,0.932492
9,0.0093,0.638246,0.91799,0.93081
10,0.0053,0.66662,0.921229,0.933018


jacc acc:0.931335436382755, f1 micro score:0.931335436382755 f1 macro score:0.9178340036577584
accuracy: 0.931335436382755, f1_macro: 0.9178340036577584, f1_micro: 0.931335436382755
jacc acc:0.9348054679284963, f1 micro score:0.9348054679284963 f1 macro score:0.9233534325292078
accuracy: 0.9348054679284963, f1_macro: 0.9233534325292078, f1_micro: 0.9348054679284963
jacc acc:0.9323869610935857, f1 micro score:0.9323869610935857 f1 macro score:0.9200087843525817
accuracy: 0.9323869610935857, f1_macro: 0.9200087843525817, f1_micro: 0.9323869610935857
jacc acc:0.9308096740273396, f1 micro score:0.9308096740273396 f1 macro score:0.9174147961230652
accuracy: 0.9308096740273396, f1_macro: 0.9174147961230652, f1_micro: 0.9308096740273396
jacc acc:0.9226077812828601, f1 micro score:0.9226077812828601 f1 macro score:0.9083224427900691
accuracy: 0.9226077812828601, f1_macro: 0.9083224427900691, f1_micro: 0.9226077812828601
jacc acc:0.9301787592008413, f1 micro score:0.9301787592008413 f1 macro sc

TrainOutput(global_step=42600, training_loss=0.054471786239057636, metrics={'train_runtime': 4974.6299, 'train_samples_per_second': 85.633, 'train_steps_per_second': 8.563, 'total_flos': 2.80206696182016e+16, 'train_loss': 0.054471786239057636, 'epoch': 10.0})

In [31]:
gc.collect()

79

In [30]:
torch.cuda.empty_cache()

In [28]:
test_metrics = trainer.predict(test_ds)

jacc acc:0.9444487071280595, f1 micro score:0.9444487071280595 f1 macro score:0.9191846913018573
accuracy: 0.9444487071280595, f1_macro: 0.9191846913018573, f1_micro: 0.9444487071280595


In [29]:
test_metrics

PredictionOutput(predictions=array([[ 3.7194731, -4.5912337],
       [-3.3240883,  3.675735 ],
       [-3.5344553,  3.9739187],
       ...,
       [ 2.9705765, -3.4781687],
       [ 3.247798 , -3.9461331],
       [ 3.2790737, -3.8932557]], dtype=float32), label_ids=array([0, 1, 1, ..., 0, 0, 0]), metrics={'test_loss': 0.27196773886680603, 'test_f1_macro': 0.9191846913018573, 'test_accuracy': 0.9444487071280595, 'test_runtime': 38.5401, 'test_samples_per_second': 338.167, 'test_steps_per_second': 33.835})

In [30]:
original_test_df = pd.read_csv('/content/drive/MyDrive/data/telugu/macd_tel_test.csv')
original_test_ds = prepare_dataset(original_test_df)

In [31]:
original_test_metrics = trainer.predict(original_test_ds)

jacc acc:0.9045, f1 micro score:0.9045 f1 macro score:0.9040364721696976
accuracy: 0.9045, f1_macro: 0.9040364721696976, f1_micro: 0.9045
