In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# there can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import inspect

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


# Read the training and test splits (replace the placeholders with real CSV paths).
train_data = pd.read_csv('read file here')
test_data = pd.read_csv('read file here')

# Extract parallel text/label lists from each frame; the text column is cast
# to str before being turned into a plain Python list.
train_texts, train_labels = (
    train_data['text'].astype(str).to_list(),
    train_data['label'].to_list(),
)
test_texts, test_labels = (
    test_data['text'].astype(str).to_list(),
    test_data['label'].to_list(),
)


# Start from the pretrained IndicBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')

# Candidate vocabulary: every whitespace-separated entry of the 'tokens' column.
# The second dropna() matters: an empty or whitespace-only cell splits to [],
# which explode() turns into NaN, and NaN must never reach add_tokens().
telugu_tokens = (
    train_data['tokens']
    .dropna()
    .str.split()
    .explode()
    .dropna()
    .unique()
    .tolist()
)

# Keep only tokens the tokenizer does not know yet. get_vocab() is the
# accessor every tokenizer class provides (slow and fast alike).
new_tokens = set(telugu_tokens) - set(tokenizer.get_vocab())

# Add in sorted order so the ids assigned to the new tokens are the same on
# every run (set iteration order is not stable across interpreter sessions).
tokenizer.add_tokens(sorted(new_tokens))

print(f"Added {len(new_tokens)} new tokens to the tokenizer.")

In [None]:
# Tokenization settings shared by both splits: truncate to at most 128 tokens
# and pad within each call.
tokenize_options = {'truncation': True, 'padding': True, 'max_length': 128}

train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [None]:
class HateSpeechDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            sequence (whatever ``.items()`` yields is indexed by example).
        labels: indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field as a tensor, plus 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the tokenized splits so they can be indexed example by example.
train_dataset, test_dataset = (
    HateSpeechDataset(train_encodings, train_labels),
    HateSpeechDataset(test_encodings, test_labels),
)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'ai4bharat/indic-bert',
    num_labels=2,
)

# The tokenizer was extended with extra tokens, so the embedding matrix has to
# grow to the tokenizer's current size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
def make_model_contiguous(model):
    """Rebind every parameter's storage to a contiguous tensor, in place.

    Args:
        model: any object exposing ``named_parameters()`` (e.g. an ``nn.Module``).

    Returns:
        None. The parameters of ``model`` are modified in place.
    """
    for _, weight in model.named_parameters():
        weight.data = weight.data.contiguous()

In [None]:
# Re-store every model parameter contiguously before training starts.
# NOTE(review): presumably a workaround so checkpoint saving does not fail on
# non-contiguous tensors — confirm.
make_model_contiguous(model)

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling is eventually rejected. Because the install is
# unpinned, pick whichever keyword the installed version actually accepts.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',              # Where checkpoints are written
    save_strategy="epoch",               # Checkpoint after every epoch (must match the eval strategy)
    per_device_train_batch_size=32,      # Training batch size per device
    per_device_eval_batch_size=32,       # Evaluation batch size per device
    num_train_epochs=20,                 # Train for 20 epochs
    weight_decay=0.01,                   # Apply weight decay
    logging_dir='./logs',                # Log directory
    logging_steps=10,                    # Log every 10 steps
    learning_rate=5e-5,                  # Peak learning rate
    lr_scheduler_type="cosine_with_restarts", # Learning-rate schedule
    warmup_steps=500,                    # Warmup period to stabilize early training
    # No metric_for_best_model is given, so "best" is decided by the library
    # default (evaluation loss, per the Transformers documentation).
    load_best_model_at_end=True,         # Restore the best checkpoint when training ends
    save_total_limit=3,                  # Limit the number of saved checkpoints
    **{eval_strategy_key: "epoch"},      # Evaluate after every epoch
)

def compute_metrics(eval_pred):
    """Score one evaluation pass by plain accuracy.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with the single key ``'accuracy'``.
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=-1).numpy()
    return {'accuracy': accuracy_score(labels, predicted_classes)}

# Assemble the Trainer that fine-tunes the ai4bharat/indic-bert classifier.
# NOTE(review): the test split doubles as the per-epoch evaluation set, so it
# also drives which checkpoint `load_best_model_at_end` restores — consider a
# separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Score the model on the evaluation dataset and report its accuracy.
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

In [None]:
# Destination directories for the fine-tuned artifacts (placeholders — point
# these at real locations before running).
model_save_path, tokenizer_save_path = 'path', 'path'

# Write the model first, then the tokenizer, each to its own destination.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {model_save_path} and {tokenizer_save_path}, respectively.")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn.functional as F

def compute_metrics(eval_pred):
    """Compute accuracy plus binary and macro precision/recall/F1.

    This redefinition replaces the earlier accuracy-only ``compute_metrics``
    for any Trainer created after this cell runs.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict of scalar metrics, in this key order: ``accuracy``,
        ``precision``, ``recall``, ``f1`` (computed with ``average='binary'``),
        then ``macro_precision``, ``macro_recall``, ``macro_f1`` (computed
        with ``average='macro'``).
    """
    logits, labels = eval_pred
    # The sklearn scorers take array-likes, so hand them a NumPy array.
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()

    metrics = {'accuracy': accuracy_score(labels, predictions)}
    # Same three scores under both averaging modes; the prefix keeps the
    # binary keys unadorned and marks the macro ones.
    for prefix, average in (('', 'binary'), ('macro_', 'macro')):
        metrics[f'{prefix}precision'] = precision_score(labels, predictions, average=average)
        metrics[f'{prefix}recall'] = recall_score(labels, predictions, average=average)
        metrics[f'{prefix}f1'] = f1_score(labels, predictions, average=average)

    # Only scalar values are returned, to avoid logging issues.
    return metrics


In [None]:
# Load the fine-tuned classifier and its extended tokenizer back from disk.
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Re-encode the test texts with the reloaded tokenizer and wrap them again.
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=128)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

# Evaluation-only configuration: just an output directory and an eval batch size.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
)

# Trainer around the reloaded ai4bharat/indic-bert classifier; it uses
# whichever compute_metrics definition is current when this cell runs.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Score the reloaded model on the test dataset and report its accuracy.
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

In [None]:
# Display the full metrics dictionary returned by the last trainer.evaluate() call.
results