In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, load_metric
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from huggingface_hub import login

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home2/likhithasapu/.cache/huggingface/token
Login successful


In [3]:
# Load custom dataset
dataset = load_dataset('likhithasapu/codemix-annotated-dataset')

In [4]:
dataset

DatasetDict({
    test: Dataset({
        features: ['data.idx', 'data.L1', 'data.L2', 'data.alignments', 'data.CM_candidates', 'data.CM_candidates_transliterated_indictrans', 'average_rating', 'int_annotations', 'LID', 'PoSTags'],
        num_rows: 2145
    })
    train: Dataset({
        features: ['data.idx', 'data.L1', 'data.L2', 'data.alignments', 'data.CM_candidates', 'data.CM_candidates_transliterated_indictrans', 'average_rating', 'int_annotations', 'LID', 'PoSTags'],
        num_rows: 7507
    })
    validation: Dataset({
        features: ['data.idx', 'data.L1', 'data.L2', 'data.alignments', 'data.CM_candidates', 'data.CM_candidates_transliterated_indictrans', 'average_rating', 'int_annotations', 'LID', 'PoSTags'],
        num_rows: 1073
    })
})

In [24]:
dataset['train'][10:20]

{'data.idx': [133401,
  479708,
  1697626,
  405017,
  1966633,
  154465,
  487741,
  89372,
  434991,
  698315],
 'data.L1': ['इनकी डेंगू की जांच भी करवाई जा रही है।',
  'सवाल हिंदी और इंग्लिश दोनों में होंगे।',
  'भारत ने अपनी टीम में एक बदलाव किया।',
  'इस फ़िल्म में मुख्य भूमिका अक्षय कुमार निभा रहे हैं',
  'हमें अगले मैच पर ध्यान लगाना होगा .',
  'भारतीय लोक संस्कृति और समाज में गाय का सर्वोत्तम स्थान है।',
  'गाने को तनिष्क बागची ने रिक्रिएट किया है .',
  'कुछ देर के लिए सदन की कार्यवाही को रोकना पड़ा।',
  'संजय दत्त को देखें , उन्होंने दुनिया के सामने कबूल किया .',
  'कांग्रेस राज्य में सत्ता में है।'],
 'data.L2': ['Their background checks are also being conducted .',
  'the questions will be in English and Hindi .',
  'India has made one change in the squad .',
  'The film features Akshay Kumar in the lead role .',
  'We focus on the next match .',
  'Cows have a special place in Indian society and culture .',
  'The song has been re-composed by Tanishk Bagchi .',
  'The proce

In [6]:
# Tokenizer and Model
model_name = 'ai4bharat/indic-bert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)  # Adjust num_labels based on your labels


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenization and preprocessing
def preprocess_function(examples):
    # Use 'average_rating' directly for regression
    labels = np.array(examples['average_rating'])
    # Tokenize the 'data.L1' text
    tokenized_input = tokenizer(examples['data.CM_candidates'], truncation=True)
    return {**tokenized_input, 'labels': labels}

# Apply preprocessing and remove columns
processed_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['test'].column_names)

In [8]:
processed_datasets['test'][0]

{'input_ids': [2, 27750, 32, 18, 496, 297, 8007, 70, 524, 487, 2092, 5, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': 4.3333333333}

In [9]:
import numpy as np
import evaluate

metric = evaluate.load("mse")

In [10]:
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Define TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluation strategy
    learning_rate=2e-5,               # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,              # number of epochs
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,              # limit the total amount of checkpoints
    save_strategy='epoch',           # save the model after each epoch
    load_best_model_at_end=True, 
    metric_for_best_model='rmse'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets['train'],
    eval_dataset=processed_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlikhithasapu[0m ([33mcmacc[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Rmse
1,No log,0.784629,0.885793
2,1.348600,0.627635,0.792234
3,0.614000,0.639771,0.799857
4,0.426700,0.682592,0.826191
5,0.257200,0.711185,0.843318
6,0.156200,0.696518,0.834576
7,0.094300,0.684776,0.827512
8,0.055600,0.699496,0.836359
9,0.033500,0.692598,0.832225
10,0.019400,0.691912,0.831813




TrainOutput(global_step=4700, training_loss=0.3203004168956838, metrics={'train_runtime': 438.7694, 'train_samples_per_second': 171.092, 'train_steps_per_second': 10.712, 'total_flos': 63999744125310.0, 'train_loss': 0.3203004168956838, 'epoch': 10.0})

In [30]:
# save model to hub
tokenizer.push_to_hub("likhithasapu/indic-bert-regression-v1")
model.push_to_hub("likhithasapu/indic-bert-regression-v1")

tokenizer.json:   0%|          | 0.00/15.3M [00:00<?, ?B/s]
tokenizer.json:   0%|          | 16.4k/15.3M [00:00<02:43, 93.6kB/s]
tokenizer.json:  11%|█         | 1.61M/15.3M [00:00<00:02, 6.09MB/s]
[A
tokenizer.json:  15%|█▍        | 2.28M/15.3M [00:00<00:06, 2.08MB/s]
tokenizer.json:  18%|█▊        | 2.69M/15.3M [00:01<00:05, 2.35MB/s]
tokenizer.json: 100%|██████████| 15.3M/15.3M [00:02<00:00, 6.52MB/s]
spiece.model: 100%|██████████| 5.65M/5.65M [00:02<00:00, 2.00MB/s]
Upload 2 LFS files: 100%|██████████| 2/2 [00:03<00:00,  1.79s/it]
model.safetensors: 100%|██████████| 134M/134M [00:15<00:00, 8.80MB/s] 


CommitInfo(commit_url='https://huggingface.co/likhithasapu/indic-bert-regression-v1/commit/311c23a43e96975334b88754218a822eeed88f02', commit_message='Upload AlbertForSequenceClassification', commit_description='', oid='311c23a43e96975334b88754218a822eeed88f02', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
# Inference code
def predict(text):
    tokenized_input = tokenizer(text, truncation=True, padding=True, return_tensors='pt').to(model.device)
    return model(**tokenized_input).logits.item()

predict("यही बात is नही to the हज़म of कांग्रेस .")

2.4797494411468506