In [None]:
!pip install transformers

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive.
# NOTE(review): presumably the CSV inputs and save locations live on Drive —
# the paths used later in this notebook are placeholders, so confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score


# Load both splits. The quoted paths look like placeholders and must be
# replaced with the real CSV locations before running.
train_data = pd.read_csv('read file here')
test_data = pd.read_csv('read file here')

# Pull the model inputs (`text`, coerced to str) and the targets (`label`)
# out of each frame as plain Python lists.
train_texts, train_labels = (
    train_data['text'].astype(str).tolist(),
    train_data['label'].tolist(),
)
test_texts, test_labels = (
    test_data['text'].astype(str).tolist(),
    test_data['label'].tolist(),
)

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Collect the distinct whitespace-separated tokens listed in the `tokens`
# column of the training data. The second dropna() matters: an empty or
# whitespace-only cell splits to [], which explode() turns into NaN, and a NaN
# must not be handed to the tokenizer as if it were a token string.
telugu_tokens = (
    train_data['tokens']
    .dropna()
    .str.split()
    .explode()
    .dropna()
    .unique()
    .tolist()
)

# Vocabulary currently known to the tokenizer (token string -> id)
existing_vocab = tokenizer.get_vocab()

# Keep only tokens the tokenizer does not already know. Sorting makes the
# insertion order, and therefore the ids assigned to the added tokens,
# identical from run to run (set iteration order of strings is not).
new_tokens = sorted(tok for tok in telugu_tokens if tok not in existing_vocab)

# add_tokens returns how many tokens were actually added; report that figure
# rather than the number of candidates.
num_added = tokenizer.add_tokens(new_tokens)

print(f"Added {num_added} new tokens to the tokenizer.")

In [None]:
# Tokenize each split with identical settings: truncate to at most 128 tokens
# and pad within each call. Train is encoded first, then test.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=128)
    for texts in (train_texts, test_texts)
)

In [None]:
class HateSpeechDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable sequence
            (e.g. the tokenizer output used in this notebook).
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Pretrained XLM-RoBERTa encoder with a 2-way classification head.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Wrap each split's encodings and labels in a torch Dataset.
train_dataset, test_dataset = (
    HateSpeechDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

# The tokenizer may have gained tokens, so size the embedding matrix to match it.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old keyword was later
# removed. Use whichever name the installed version accepts so this cell works
# on both sides of the rename.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',              # Where checkpoints are written
    save_strategy="epoch",               # Checkpoint after every epoch (must match the eval strategy)
    per_device_train_batch_size=16,      # Training batch size per device
    per_device_eval_batch_size=32,       # Evaluation batch size per device
    num_train_epochs=20,                 # Train for 20 epochs
    weight_decay=0.01,                   # Weight decay
    logging_dir='./logs',                # Log directory
    logging_steps=10,                    # Log every 10 steps
    learning_rate=5e-5,                  # Peak learning rate
    lr_scheduler_type="cosine_with_restarts",  # Learning-rate schedule
    warmup_steps=500,                    # Warmup steps before the schedule takes over
    # No metric_for_best_model is set, so "best" means lowest evaluation loss,
    # not highest accuracy.
    load_best_model_at_end=True,         # Reload the best checkpoint when training ends
    save_total_limit=3,                  # Keep at most 3 checkpoints on disk
    **{eval_strategy_kwarg: "epoch"},    # Evaluate after every epoch
)

# Function to compute accuracy
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the class scores are expected
            along the last axis of ``logits``.

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    return {'accuracy': accuracy_score(labels, predicted_classes)}

# Wire the model, hyperparameters, datasets and metric callback into a Trainer.
# NOTE(review): the test split is also the per-epoch evaluation set, so any
# checkpoint selection driven by evaluation sees the test data — confirm this
# is intended, or hold out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Score the resulting model on the evaluation (test) dataset and report accuracy.
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

In [None]:
# Destinations for the fine-tuned model and the extended tokenizer.
# Both are currently the same placeholder value; replace before running.
model_save_path = tokenizer_save_path = 'path'

# Persist the model weights and config, then the tokenizer files.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {model_save_path} and {tokenizer_save_path}, respectively.")

In [None]:
# Restore the fine-tuned XLM-RoBERTa classifier and its tokenizer from disk.
model = XLMRobertaForSequenceClassification.from_pretrained(model_save_path)
tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_save_path)

# Re-encode the test texts with the reloaded tokenizer and wrap them as a dataset.
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

# Evaluation-only setup: just an output directory and an eval batch size.
training_args = TrainingArguments(output_dir='./results', per_device_eval_batch_size=16)

# Trainer around the reloaded model; no training dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Evaluate the reloaded model on the test dataset and report accuracy.
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")