In [4]:
!pip install 'transformers[torch]'

# Load model directly
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset

import torch
import numpy as np
import random
from sklearn.metrics import accuracy_score, f1_score
def set_seeds(seed):
    """Seed every random number generator this notebook relies on.

    The same ``seed`` is applied, in order, to PyTorch (CPU generator, the
    current CUDA device, all CUDA devices), NumPy's global generator and
    Python's ``random`` module.

    Args:
        seed: Value handed unchanged to each seeding function.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Seed all RNGs before the model is built: the classifier head is not part of
# the checkpoint and is freshly initialised, so its weights depend on the seed.
set_seeds(42)

# Prefer the first GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("ltrctelugu/bert_ltrc_telugu")
model = AutoModelForSequenceClassification.from_pretrained("ltrctelugu/bert_ltrc_telugu")
# Move the model to the device chosen above. The previous unconditional
# `model.cuda()` ignored the CPU fallback and raised on machines without CUDA.
model.to(device)

# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the best validation `f1_macro` (the key
# returned by compute_metrics) is reloaded when training ends.
train_args = TrainingArguments(
    output_dir='outputs',
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    # Evaluation / checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Model selection
    metric_for_best_model='f1_macro',
    load_best_model_at_end=True,
)

class LTRCDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        text: Indexable collection of texts; each entry is converted with ``str``.
        labels: Indexable collection of labels, looked up with the same index
            as ``text``.
        tokenizer: Object exposing ``encode_plus``; invoked on every
            ``__getitem__`` call.
        max_len: Passed as ``max_length`` to the tokenizer together with
            ``truncation=True`` and ``padding='max_length'``.
    """

    def __init__(self, text, labels, tokenizer, max_len):
        self.text, self.labels = text, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the ``item``-th text and bundle it with its label.

        Returns:
            Dict with keys ``'text'`` (the stringified input), ``'input_ids'``
            and ``'attention_mask'`` (flattened tokenizer outputs) and
            ``'label'`` (a ``torch.long`` tensor).
        """
        raw_text = str(self.text[item])
        target = self.labels[item]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        sample = {'text': raw_text}
        # With return_tensors='pt' the outputs presumably carry a leading batch
        # axis of size 1; flatten() collapses each one to a 1-D tensor.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample


import pandas as pd
# Read the three pre-split CSV files, in the order train, test, validation.
train_df, test_df, val_df = map(
    pd.read_csv,
    (
        '/kaggle/input/telugu/final_train.csv',
        '/kaggle/input/telugu/final_test.csv',
        '/kaggle/input/telugu/final_val.csv',
    ),
)

def prepare_dataset(df, max_len=128):
    """Wrap a DataFrame in an ``LTRCDataset``.

    Args:
        df: DataFrame providing a ``text`` column (model input) and a
            ``label_yn`` column (target).
        max_len: Sequence length forwarded to ``LTRCDataset``. Defaults to
            128, the value that used to be hard-coded here, so existing
            single-argument calls behave as before.

    Returns:
        An ``LTRCDataset`` built from the two columns as NumPy arrays and the
        module-level ``tokenizer``.
    """
    return LTRCDataset(
        # Bracket access cannot be shadowed by a DataFrame attribute or method
        # of the same name, unlike `df.text`.
        text=df["text"].to_numpy(),
        labels=df["label_yn"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

# One dataset per split, built in the order train, test, validation.
train_ds, test_ds, val_ds = (
    prepare_dataset(split_df) for split_df in (train_df, test_df, val_df)
)

def get_metrics(preds, labels):
    """Compute accuracy, micro-F1 and macro-F1, print them, and return them.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids; passed to scikit-learn as the first
            (ground-truth) argument.

    Returns:
        Tuple ``(acc, f1_micro, f1_macro)``.
    """
    acc = accuracy_score(labels, preds)
    f1_micro, f1_macro = (
        f1_score(labels, preds, average=mode) for mode in ('micro', 'macro')
    )
    # The "jacc acc" label is kept verbatim; the value it reports is the plain
    # accuracy_score computed above.
    summary = 'jacc acc:{}, f1 micro score:{} f1 macro score:{}'
    print(summary.format(acc, f1_micro, f1_macro))
    return acc, f1_micro, f1_macro

def compute_metrics(pred):
    """Metric callback handed to ``Trainer``.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``. The predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        Dict with the keys ``'f1_macro'`` and ``'accuracy'``. Micro-F1 is
        computed and printed but deliberately left out of the returned dict.
    """
    references, scores = pred.label_ids, pred.predictions
    acc, f1_micro, f1_macro = get_metrics(scores.argmax(-1), references)
    print(f"accuracy: {acc}, f1_macro: {f1_macro}, f1_micro: {f1_micro}")
    results = {'f1_macro': f1_macro}
    results['accuracy'] = acc
    return results

# Fine-tune on the training split; the validation split is scored with
# compute_metrics at every evaluation.
trainer = Trainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()

# Score the held-out test split with the trainer's current model.
test_metrics = trainer.predict(test_dataset=test_ds)

# Bare expression in the middle of the cell: it is evaluated but, not being
# the cell's last expression, displays nothing.
test_metrics

# Release unreferenced Python objects, then hand cached GPU memory back.
import gc
gc.collect()
torch.cuda.empty_cache()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ltrctelugu/bert_ltrc_telugu and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 Macro,Accuracy
1,0.3658,0.498585,0.555474,0.705783
2,0.3605,0.47741,0.680512,0.733228
3,0.3526,0.474891,0.682132,0.734175
4,0.3387,0.497417,0.680185,0.737645
5,0.3286,0.51036,0.638982,0.732492
6,0.325,0.530494,0.614188,0.726498
7,0.3213,0.553909,0.652166,0.733018
8,0.3175,0.552134,0.64463,0.732913
9,0.3186,0.563762,0.645805,0.731546
10,0.31,0.573598,0.642268,0.730389


jacc acc:0.7057833859095689, f1 micro score:0.7057833859095689 f1 macro score:0.5554738452902812
accuracy: 0.7057833859095689, f1_macro: 0.5554738452902812, f1_micro: 0.7057833859095689




jacc acc:0.7332281808622503, f1 micro score:0.7332281808622502 f1 macro score:0.6805121462780512
accuracy: 0.7332281808622503, f1_macro: 0.6805121462780512, f1_micro: 0.7332281808622502




jacc acc:0.7341745531019979, f1 micro score:0.7341745531019979 f1 macro score:0.6821323857116541
accuracy: 0.7341745531019979, f1_macro: 0.6821323857116541, f1_micro: 0.7341745531019979




jacc acc:0.7376445846477392, f1 micro score:0.7376445846477392 f1 macro score:0.6801848533421222
accuracy: 0.7376445846477392, f1_macro: 0.6801848533421222, f1_micro: 0.7376445846477392




jacc acc:0.7324921135646688, f1 micro score:0.7324921135646688 f1 macro score:0.6389820113557514
accuracy: 0.7324921135646688, f1_macro: 0.6389820113557514, f1_micro: 0.7324921135646688




jacc acc:0.7264984227129337, f1 micro score:0.7264984227129337 f1 macro score:0.6141884869760845
accuracy: 0.7264984227129337, f1_macro: 0.6141884869760845, f1_micro: 0.7264984227129337




jacc acc:0.7330178759200842, f1 micro score:0.7330178759200842 f1 macro score:0.6521661027509125
accuracy: 0.7330178759200842, f1_macro: 0.6521661027509125, f1_micro: 0.7330178759200842




jacc acc:0.732912723449001, f1 micro score:0.732912723449001 f1 macro score:0.6446298135275295
accuracy: 0.732912723449001, f1_macro: 0.6446298135275295, f1_micro: 0.732912723449001




jacc acc:0.7315457413249211, f1 micro score:0.7315457413249211 f1 macro score:0.6458049352635246
accuracy: 0.7315457413249211, f1_macro: 0.6458049352635246, f1_micro: 0.7315457413249211




jacc acc:0.7303890641430074, f1 micro score:0.7303890641430074 f1 macro score:0.642267725130095
accuracy: 0.7303890641430074, f1_macro: 0.642267725130095, f1_micro: 0.7303890641430074




jacc acc:0.792373206475869, f1 micro score:0.792373206475869 f1 macro score:0.6965803960948703
accuracy: 0.792373206475869, f1_macro: 0.6965803960948703, f1_micro: 0.792373206475869


In [7]:
# Repeats the test-split prediction already issued in the training cell and
# rebinds `test_metrics` to the new result.
test_metrics = trainer.predict(test_dataset=test_ds)



jacc acc:0.792373206475869, f1 micro score:0.792373206475869 f1 macro score:0.6965803960948703
accuracy: 0.792373206475869, f1_macro: 0.6965803960948703, f1_micro: 0.792373206475869


In [9]:
# Score the fine-tuned model on the separately supplied LTRC test file.
# NOTE(review): the macro-F1 logged for this run (0.4878) is exactly half the
# F1 of a single class with recall 0.9522 and precision 1.0. That is what a
# file containing only one label value would produce -- confirm the label
# distribution before interpreting the macro-F1.
original_test_df = pd.read_csv('/kaggle/input/ltrctest/ltrc_tel_test.csv')
original_test_ds = prepare_dataset(df=original_test_df)
original_test_metrics = trainer.predict(test_dataset=original_test_ds)



jacc acc:0.9522252239442628, f1 micro score:0.9522252239442628 f1 macro score:0.48776402039329936
accuracy: 0.9522252239442628, f1_macro: 0.48776402039329936, f1_micro: 0.9522252239442628
