In [1]:
!pip install sympy>=1.13.3


In [2]:
import sympy
import mpmath
print(f"SymPy version: {sympy.__version__}")
print(f"mpmath version: {mpmath.__version__}")
print("Success - packages imported correctly!")

SymPy version: 1.13.1
mpmath version: 1.3.0
Success - packages imported correctly!


In [3]:
!pip install transformers datasets nltk scikit-learn pandas
print("Success")

Defaulting to user installation because normal site-packages is not writeable
Success


In [4]:
import numpy as np
import pandas as pd
print("✓ Both packages working!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy location: {np.__file__}")
print(f"Pandas location: {pd.__file__}")

✓ Both packages working!
NumPy version: 1.26.4
Pandas version: 2.2.2
NumPy location: C:\ProgramData\anaconda3\Lib\site-packages\numpy\__init__.py
Pandas location: C:\ProgramData\anaconda3\Lib\site-packages\pandas\__init__.py


In [5]:
# Step 1: Import necessary libraries

import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
print("Imported!")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Imported!


In [6]:
# Load Tamil training and validation datasets
tamil_train = pd.read_csv("tamil_offensive_speech_train.csv")[["comment", "label"]]
tamil_val = pd.read_csv("tamil_offensive_speech_val.csv")[["comment", "label"]]

# Rename 'comment' to 'text' (standardized)
tamil_train = tamil_train.rename(columns={"comment": "text"})
tamil_val = tamil_val.rename(columns={"comment": "text"})

# Check data shape and first few rows
print(f"Train shape: {tamil_train.shape}")
print(f"Validation shape: {tamil_val.shape}")
tamil_train.head()


Train shape: (27875, 2)
Validation shape: (6969, 2)


Unnamed: 0,text,label
0,omg that bgm make me goosebumb...,0
1,neraya neraya neraya neraya neraya neraya.,0
2,thalaivar mersal look .semma massss thalaiva ....,0
3,paaaa... repeat mode.... adra adra adraaaaa......,0
4,epaa ena panaporam... sweet sapade poram... aw...,0


In [7]:
import re

# Function to clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()               # Remove leading/trailing spaces
    text = text.lower()               # Lowercase all text
    text = re.sub(r'\s+', ' ', text) # Replace multiple spaces/newlines with single space
    # Optional: Remove special characters except Tamil letters, numbers and basic punctuation
    # text = re.sub(r"[^a-zA-Z0-9அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநபமயரலவஷஸஹாிீுூெேைொோௌ்.,!?]", " ", text)
    return text

# Apply cleaning to train and val data
tamil_train['text'] = tamil_train['text'].apply(clean_text)
tamil_val['text'] = tamil_val['text'].apply(clean_text)

# Remove empty or whitespace-only rows after cleaning
tamil_train = tamil_train[tamil_train['text'].str.strip() != ""]
tamil_val = tamil_val[tamil_val['text'].str.strip() != ""]

# Reset index after cleaning
tamil_train = tamil_train.reset_index(drop=True)
tamil_val = tamil_val.reset_index(drop=True)

print(f"Cleaned train shape: {tamil_train.shape}")
print(f"Cleaned val shape: {tamil_val.shape}")


Cleaned train shape: (27870, 2)
Cleaned val shape: (6969, 2)


In [8]:
print(tamil_train['label'].dtype)  # Should output: int64 or int32
print(tamil_val['label'].dtype)    # Should output: int64 or int32


int64
int64


In [9]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)

# Convert pandas DataFrame to Huggingface Dataset first
from datasets import Dataset

train_dataset = Dataset.from_pandas(tamil_train)
val_dataset = Dataset.from_pandas(tamil_val)

# Apply tokenization (batched=True for faster processing)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch tensors (required for training)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print("Tokenization done!")
print(f"Train dataset example keys: {train_dataset[0].keys()}")




Map:   0%|          | 0/27870 [00:00<?, ? examples/s]

Map:   0%|          | 0/6969 [00:00<?, ? examples/s]

Tokenization done!
Train dataset example keys: dict_keys(['label', 'input_ids', 'attention_mask'])


In [10]:
from transformers import AutoModelForSequenceClassification

# Number of classes (labels)
num_labels = 2  # since your labels are already integers 0 or 1

# Load the pretrained XLM-RoBERTa model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)

print("Model loaded with", num_labels, "labels.")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 2 labels.


In [11]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary', zero_division=0)
    accuracy = accuracy_score(labels, predictions)
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [12]:
!pip install transformers==4.40.0
print("Installed!")

Defaulting to user installation because normal site-packages is not writeable
Installed!


In [14]:
import transformers
print(transformers.__version__)


4.40.0


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import f1_score

# Load CSV files
tamil_train = pd.read_csv("tamil_offensive_speech_train.csv")
tamil_val = pd.read_csv("tamil_offensive_speech_val.csv")

# Replace NaN or None with empty string or drop rows
tamil_train['comment'] = tamil_train['comment'].fillna("").astype(str)
tamil_val['comment'] = tamil_val['comment'].fillna("").astype(str)
tamil_train = tamil_train.rename(columns={"comment": "text"})
tamil_val = tamil_val.rename(columns={"comment": "text"})


# Convert to HuggingFace Datasets
train_dataset = Dataset.from_pandas(tamil_train)
val_dataset = Dataset.from_pandas(tamil_val)

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    # examples["text"] should be a list of strings if batched=True,
    # or a single string if batched=False
    # So just pass it directly
    return tokenizer(
        examples["text"],  # list or str both work for tokenizer
        padding="max_length",
        truncation=True,
        max_length=128,
    ) 

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Rename label column to labels
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"f1": f1}

# Training arguments - keep batch size small to speed up training
training_args = TrainingArguments(
    output_dir="./model_checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    seed=42,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train and evaluate
trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)

# Save model and tokenizer
trainer.save_model("./saved_model")
tokenizer.save_pretrained("./saved_model")
print("Model and tokenizer saved in './saved_model'")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/27875 [00:00<?, ? examples/s]

Map:   0%|          | 0/6969 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 1: Get model predictions on the validation dataset
predictions_output = trainer.predict(val_dataset)

# Extract raw logits and true labels
logits = predictions_output.predictions  # Model raw output scores (logits)
true_labels = predictions_output.label_ids  # True labels from the dataset

# Step 2: Convert logits to predicted class indices
predicted_labels = np.argmax(logits, axis=1)

# Step 3: Generate classification report
# This report includes precision, recall, f1-score, and support for each class
print("===== Classification Report =====")
print(classification_report(true_labels, predicted_labels, digits=4))

# Step 4: Compute confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

print("===== Confusion Matrix =====")
print(conf_matrix)

# Step 5: Define the class names for labeling the plot axes
# IMPORTANT: Replace these labels with your actual class names
class_names = ["Non-Offensive", "Offensive"]

# Step 6: Plot confusion matrix with detailed labels and color map
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))  # You can change the size if needed
disp.plot(cmap=plt.cm.Blues, ax=ax)

# Step 7: Add axis labels and title for clarity
plt.xlabel("Predicted Label", fontsize=14)
plt.ylabel("True Label", fontsize=14)
plt.title("Confusion Matrix for Tamil Hate Speech Detection", fontsize=16)

# Optional: Improve layout and display plot
plt.xticks(rotation=45)  # Rotate x-axis labels for readability if needed
plt.tight_layout()
plt.savefig("confusion_matrix_tamil_hate_speech.png", dpi=300)
plt.show()
