In [2]:
#modules needed for the sentiment analysis
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import Trainer,TrainingArguments
import torch
import re
import string
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm
from datasets import Dataset, DatasetDict

In [3]:

# Monkey patch np.array for NumPy 2.0 compatibility with HuggingFace datasets.
# NumPy 2.0 raises ValueError for `copy=False` when a copy cannot be avoided;
# the wrapper below retries such calls with a copy instead.
if getattr(np.array, "_copy_fallback_patch", False):
    # The cell was re-run: np.array is already the wrapper, so recover the
    # real function instead of wrapping the wrapper again.
    _original_array = np.array._original_array
else:
    _original_array = np.array


def patched_array(obj, *args, copy=True, **kwargs):
    """Drop-in replacement for ``np.array`` that tolerates ``copy=False``.

    Args:
        obj: Anything ``np.array`` accepts.
        *args: Further positional arguments for ``np.array`` (e.g. ``dtype``);
            forwarded unchanged so ``np.array(x, float)`` keeps working.
        copy: Keyword-only, like in ``np.array``. Defaults to ``True`` so that
            plain ``np.array(a)`` still returns an independent copy.
        **kwargs: Remaining keyword arguments for ``np.array``.

    Returns:
        The array built by the original ``np.array``.

    Raises:
        ValueError: Re-raised unchanged unless it is the "unable to avoid
            copy" error triggered by an explicit ``copy=False``.
    """
    try:
        return _original_array(obj, *args, copy=copy, **kwargs)
    except ValueError as e:
        message = str(e)
        # Only recover from the copy-avoidance failure of an explicit
        # copy=False request; the wording may vary between libraries
        # ("avoid copy" / "avoid a copy"), so match on both words.
        if copy is False and 'avoid' in message and 'copy' in message:
            return _original_array(obj, *args, copy=True, **kwargs)
        raise


# Markers that let a re-run of this cell detect the existing patch.
patched_array._copy_fallback_patch = True
patched_array._original_array = _original_array
np.array = patched_array


In [5]:
# Read the tab-separated train and test files into pandas DataFrames
train_df = pd.read_csv('/content/train.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Wrap each DataFrame in a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle both splits under their split names
df = DatasetDict({"train": train_dataset, "test": test_dataset})

In [6]:
# Show the first raw example of each split
for split_name in ("train", "test"):
    print(df[split_name][0])

{'text': ' Trailer late ah parthavanga like podunga', 'category': 'Positive '}
{'text': ' Daily likes & views pakka vanthavaga ellarukum vanakkam', 'category': 'Positive '}


In [7]:
import re
import string

# Optional: Add your own stopwords (extend this set as needed)
stopwords = {
    "da", "dei", "la", "machan", "enna", "epdi", "ipo", "illa", "vanthu",
    "ah", "na", "irukku", "vera", "apdi", "pa", "poda", "pannitu", "seri",
}

# Emoji pattern.
# Every range below is a symbol/pictograph block. The previous catch-all
# range U+24C2..U+1F251 also covered CJK, Hangul, Kana and many other
# scripts, so it erased real text written in them; it is replaced here by
# the individual symbol blocks it was meant to cover.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"   # emoticons
    "\U0001F300-\U0001F5FF"   # symbols & pictographs
    "\U0001F680-\U0001F6FF"   # transport & map symbols
    "\U0001F700-\U0001F77F"   # alchemical symbols
    "\U0001F780-\U0001F7FF"   # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"   # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"   # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"   # Chess Symbols
    "\U0001FA70-\U0001FAFF"   # Symbols and Pictographs Extended-A
    "\U00002600-\U000026FF"   # Miscellaneous Symbols
    "\U00002702-\U000027B0"   # Dingbats
    "\U00002B00-\U00002BFF"   # Miscellaneous Symbols and Arrows
    "\U0000FE00-\U0000FE0F"   # Variation Selectors (emoji presentation)
    "\U000024C2"              # Circled Latin capital letter M
    "\U0001F000-\U0001F251"   # Tiles, cards, enclosed alphanumerics/ideographs, flags
    "]+", flags=re.UNICODE)

def dataclean(text):
    """Normalise one raw comment for tokenization.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags; remove
    emojis using the module-level ``emoji_pattern``; remove ASCII
    punctuation; collapse runs of whitespace; drop every token found in the
    module-level ``stopwords`` set.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or NaN coming from a missing cell) is treated as empty.

    Returns:
        The cleaned text. It is the empty string when nothing survives the
        cleaning or when ``text`` was not a string.
    """
    # Missing values would otherwise fail on .lower(); returning "" lets the
    # caller discard them together with comments that clean down to nothing.
    if not isinstance(text, str):
        return ""

    # Lowercase the text
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove mentions and hashtags
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # Remove emojis using the emoji pattern
    text = emoji_pattern.sub(r"", text)

    # Remove punctuation (deleted, not replaced by a space)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Remove stopwords
    return " ".join(word for word in text.split() if word not in stopwords)


In [8]:

# Clean the "text" field of every example in both splits
def preprocess(batch):
    """Overwrite the example's "text" with its cleaned form and return it.

    Despite the parameter name, this receives a single example: the map call
    below does not pass ``batched=True``.
    """
    cleaned = dataclean(batch["text"])
    batch["text"] = cleaned
    return batch

df = df.map(function=preprocess)

Map:   0%|          | 0/11335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

In [9]:
# Re-read the raw training file so the untouched text can be compared
train_df = pd.read_csv('/content/train.tsv', sep='\t')
raw_dataset = Dataset.from_pandas(train_df)

# Print the first 50 comments before and after cleaning
divider = "-" * 50
for row_idx in range(50):
    original = raw_dataset[row_idx]['text']
    cleaned = dataclean(original)
    print(f"Original: {original}", f"Cleaned : {cleaned}", divider, sep="\n")


Original:  Trailer late ah parthavanga like podunga
Cleaned : trailer late parthavanga like podunga
--------------------------------------------------
Original:  Move pathutu vanthu trailer pakurvnga yaru
Cleaned : move pathutu trailer pakurvnga yaru
--------------------------------------------------
Original:  Puthupetai dhanush  ah yarellam pathinga
Cleaned : puthupetai dhanush yarellam pathinga
--------------------------------------------------
Original:  Dhanush oda character ,puthu sa erukay , mass ta
Cleaned : dhanush oda character puthu sa erukay mass ta
--------------------------------------------------
Original:  vera level ippa pesungada mokka nu thalaivaaaaaa
Cleaned : level ippa pesungada mokka nu thalaivaaaaaa
--------------------------------------------------
Original:  Thala mass . U1 bgm.     Vera level
Cleaned : thala mass u1 bgm level
--------------------------------------------------
Original:  ivara pathta death vadi madiri irukku
Cleaned : ivara pathta death vadi m

In [10]:
# Drop examples whose text is the empty string after cleaning
def _has_text(example):
    """Return True when the example's text is not the empty string."""
    return example['text'] != ""

df = df.filter(_has_text)

Filter:   0%|          | 0/11335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1260 [00:00<?, ? examples/s]

In [11]:
#Tokenize the datasets
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# The "category" column holds raw strings (e.g. 'Positive '), but a sequence
# classification loss needs integer class ids. Sorting the unique training
# categories gives a deterministic name -> id mapping.
LABEL_NAMES = sorted(set(df["train"]["category"]))
LABEL2ID = {name: idx for idx, name in enumerate(LABEL_NAMES)}

def token_func(example):
    """Tokenize one example and attach its integer class id.

    Args:
        example: A mapping with a "text" string and a "category" string.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        "labels" entry holding the integer id of the example's category.

    Raises:
        KeyError: If the category does not occur in the training split.
    """
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    # Integer id, not the raw string: string labels cannot be turned into
    # tensors or fed to the loss.
    tokens["labels"] = LABEL2ID[example["category"]]
    return tokens


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [13]:
# Tokenize every example of both splits with token_func
tokenized_datasets = df.map(function=token_func)

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

In [14]:
# Return only the model inputs and labels, formatted as torch tensors
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [15]:
# Encode the string 'category' column of the train split as a ClassLabel;
# the class names are inferred from the column's unique values
train_dataset = train_dataset.class_encode_column("category")

# Cast the test split to the same feature schema as the train split
train_features = train_dataset.features
test_dataset = test_dataset.cast(train_features)

# Bundle the encoded splits under their split names
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# Show the class names in id order, then one encoded example
label_names = dataset["train"].features["category"].names
print("Label Mapping:", label_names)
print(dataset["train"][0])

Casting to class labels:   0%|          | 0/11335 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1260 [00:00<?, ? examples/s]

Label Mapping: ['Mixed_feelings ', 'Negative ', 'Positive ', 'not-Tamil ', 'unknown_state ']
{'text': ' Trailer late ah parthavanga like podunga', 'category': 2}


In [16]:
# Spot-check one example from each split
for split_name, row_idx in (("train", 10), ("test", 23)):
    print(df[split_name][row_idx])

{'text': 'when pandey gets up vandumurugan im speaking no cross speaking sittt downnn', 'category': 'Negative '}
{'text': 'yenapa hollywood movie madiri trailer iruku chiyan rocks kalakitaru getup level', 'category': 'Positive '}


In [17]:
# Size the classification head from the encoded dataset instead of
# hard-coding it: the label mapping of `dataset` lists five categories, so a
# 3-way head cannot represent every class id.
label_names = dataset["train"].features["category"].names
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_names),
    # Human-readable names (trailing whitespace stripped) stored in the config
    id2label={idx: name.strip() for idx, name in enumerate(label_names)},
    label2id={name.strip(): idx for idx, name in enumerate(label_names)},
    ignore_mismatched_sizes=True
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Training configuration for fine-tuning the XLM-RoBERTa model
_training_kwargs = dict(
    # Optimisation
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Restore the best checkpoint once training finishes
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**_training_kwargs)

In [19]:

# Build the Trainer from the model, the training configuration and the
# tokenized splits. The tokenizer is passed as `processing_class`: the
# `tokenizer=` argument of Trainer is deprecated in recent transformers
# releases and triggers a warning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above
trainer.train()

In [24]:
import numpy as np
from datasets.formatting.formatting import ArrowFormatter

def patched_arrow_array_to_numpy(self, pa_array):
    """Convert ``pa_array`` to a NumPy array, copying only if it has to.

    A no-copy conversion is attempted first; if that raises ``ValueError``
    the conversion is repeated with ``np.asarray``. ``self`` is unused and is
    only present so the function can be installed as a method.
    """
    try:
        converted = np.array(pa_array, copy=False)
    except ValueError:
        # No-copy conversion was refused; let NumPy copy if it needs to
        converted = np.asarray(pa_array)
    return converted

# Apply the patch: install the function as `_arrow_array_to_numpy` on ArrowFormatter.
# NOTE(review): confirm that the installed `datasets` version actually looks
# this method up on ArrowFormatter; if `_arrow_array_to_numpy` lives on a
# different class (e.g. the NumPy extractor), this assignment only adds an
# unused attribute and has no effect.
ArrowFormatter._arrow_array_to_numpy = patched_arrow_array_to_numpy

In [None]:
# Run prediction on the test split
predictions = trainer.predict(tokenized_datasets["test"])

# Predicted class = index of the largest logit. Kept as a NumPy array so the
# sklearn metrics below receive plain arrays rather than torch tensors.
preds = np.argmax(predictions.predictions, axis=1)

# Ground-truth labels as returned together with the predictions, so both
# arrays come from the same pass over the data and share the same order.
labels = predictions.label_ids

# Evaluate
accuracy = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average="weighted")
report = classification_report(labels, preds)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Classification Report:\n", report)