NLP Course Project - Code-Mixed Urdu–English Sentiment Analysis
Using Linguistically-Aware Transformer Models

22K-4125 Ibtesam Hussain
22K-4039 Safey Suhail
22K-8720 Ayan Khan
22K-8719 Shaheer Uddin

# Data Handling and Pre-processing

In [None]:
!pip install -q kagglehub pandas numpy tqdm transformers datasets emoji clean-text unidecode


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [None]:
import kagglehub
import pandas as pd
import os

# Download dataset from KaggleHub
path = kagglehub.dataset_download("drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset")
print("Dataset Path:", path)

# List files to confirm structure
print(os.listdir(path))

# ✅ Load Excel file directly
dataset_path = os.path.join(path, "RU-EN-Emotion Dataset.xlsx")

df = pd.read_excel(dataset_path)
print("Shape:", df.shape)
print("Columns:", df.columns)
df.head()


Downloading from https://www.kaggle.com/api/v1/datasets/download/drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset?dataset_version_number=1...


100%|██████████| 1.45M/1.45M [00:01<00:00, 1.46MB/s]

Extracting files...





Dataset Path: /root/.cache/kagglehub/datasets/drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset/versions/1
['RU-EN-Emotion Dataset.xlsx']
Shape: (20000, 4)
Columns: Index(['Unnamed: 0', 'Tweets', 'Level 1', 'Level 2'], dtype='object')


Unnamed: 0.1,Unnamed: 0,Tweets,Level 1,Level 2
0,0,yeh fair game nai thi I don’t like it,emotion,Anger
1,1,Nasir mama ban gya aur jija ji hinduo ko bol r...,emotion,Anger
2,2,: Part - #MissionMuzaffarabad #MasterShifujisM...,emotion,Anger
3,3,O bhai ghoor se daikho vehicle parking ki wjah...,emotion,Anger
4,4,Yeah hy hi ghatia so kbhi serious na lo is chu...,emotion,Anger


In [None]:
import pandas as pd

# Load your Excel file
df = pd.read_excel(dataset_path)

# Drop useless unnamed column
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df = df.drop(columns=['Level 1'], errors='ignore')


# Rename key columns for consistency
df.rename(columns={'Tweets': 'Text', 'Level 2': 'Emotion'}, inplace=True)

# Drop any missing values
df = df.dropna(subset=['Text', 'Emotion'])

print("Columns after renaming:", df.columns)
df.head(100)


Columns after renaming: Index(['Text', 'Emotion'], dtype='object')


Unnamed: 0,Text,Emotion
0,yeh fair game nai thi I don’t like it,Anger
1,Nasir mama ban gya aur jija ji hinduo ko bol r...,Anger
2,: Part - #MissionMuzaffarabad #MasterShifujisM...,Anger
3,O bhai ghoor se daikho vehicle parking ki wjah...,Anger
4,Yeah hy hi ghatia so kbhi serious na lo is chu...,Anger
...,...,...
95,Na Acha voice hai iska Nah accha lyrics Only v...,Anger
96,Ya like always kaise koe ek hi type ka gaana b...,Anger
97,This song shows ladkiya ke andar paise dekh ke...,Anger
98,:22 minutes ke song m minutes tq ek hi line h ...,Anger


In [None]:
import re
import emoji
from tqdm import tqdm

# Common Roman Urdu normalization dictionary (expand as needed)
normalization_dict = {
    # Basic verbs / auxiliaries
    'hy': 'hai', 'ha': 'hai', 'h': 'hai', 'haii': 'hai', 'hey': 'hai', 'hae': 'hai',
    'hun': 'hoon', 'hon': 'hoon', 'hu': 'hoon', 'huu': 'hoon',
    'tha': 'tha', 'thaa': 'tha', 'thaay': 'tha', 'thy': 'tha', 'the': 'the',
    'thi': 'thi', 'thii': 'thi',
    'haiii': 'hai', 'hhai': 'hai', 'haaa': 'hai',
    'se': 'sey', 'seeee': 'se', 'seee': 'sey',

    # Negations
    'nai': 'nahi', 'nahe': 'nahi', 'nhi': 'nahi', 'na': 'nahi', 'nahii': 'nahi',
    'nae': 'nahi', 'nah': 'nahi', 'nh': 'nahi',

    # Intensifiers
    'bhot': 'bohat', 'bohot': 'bohat', 'buhat': 'bohat', 'bohatt': 'bohat',
    'bohut': 'bohat', 'bohatt': 'bohat', 'boht': 'bohat', 'bahot': 'bohat',

    # Positive expressions
    'acha': 'acha', 'achha': 'acha', 'achaah': 'acha', 'achaay': 'acha', 'achy': 'acha',
    'accha': 'acha', 'achaa': 'acha', 'achhaaa': 'acha',
    'pyara': 'pyaara', 'pyari': 'pyaari', 'pyaar': 'pyaar', 'pyar': 'pyaar',
    'achaacha': 'acha', 'mast': 'mazedaar', 'awesome': 'bohat acha',

    # Negative expressions
    'bura': 'bura', 'buraa': 'bura', 'ghatia': 'ghatiya', 'ghatiyaa': 'ghatiya',
    'bekar': 'bekaar', 'bakar': 'bekaar', 'faltu': 'faltu',
    'bkwas': 'bakwaas', 'bakwas': 'bakwaas', 'bwkass': 'bakwaas',
    'ghltt': 'ghalat', 'ghalat': 'ghalat', 'glat': 'ghalat',
    'naraz': 'naraaz', 'nraz': 'naraaz', 'narazz': 'naraaz',

    # Slang / casual
    'plz': 'please', 'pls': 'please', 'plss': 'please', 'plzzz': 'please',
    'thx': 'thanks', 'tnx': 'thanks', 'thanx': 'thanks', 'thnx': 'thanks',
    'sry': 'sorry', 'soz': 'sorry', 'srri': 'sorry',
    'okie': 'ok', 'okk': 'ok', 'okkk': 'ok', 'okey': 'ok',
    'ya': 'haan', 'yaa': 'haan', 'yaaa': 'haan', 'han': 'haan', 'haan': 'haan',
    'hmm': 'haan', 'hm': 'haan',

    # Pronouns / common words
    'me': 'main', 'mein': 'main', 'mayn': 'main', 'm': 'main',
    'tm': 'tum', 'tumh': 'tum', 'tumhe': 'tumhe', 'tume': 'tumhe',
    'ap': 'aap', 'aapka': 'aapka', 'apka': 'aapka', 'apki': 'aapki',
    'meri': 'meri', 'merii': 'meri', 'mera': 'mera', 'meraa': 'mera',
    'tera': 'tera', 'teraa': 'tera', 'teri': 'teri', 'terii': 'teri',
    'uska': 'uska', 'uskay': 'uska', 'uski': 'uski', 'unka': 'unka', 'unki': 'unki',

    # Common nouns / objects
    'ghar': 'ghar', 'gahr': 'ghar', 'ghr': 'ghar',
    'school': 'school', 'skool': 'school', 'skl': 'school',
    'mobile': 'mobile', 'mob': 'mobile', 'phone': 'phone',
    'pic': 'picture', 'dp': 'display picture',
    'vid': 'video', 'vido': 'video', 'videoo': 'video',

    # Emotions
    'khush': 'khush', 'khosh': 'khush', 'happy': 'khush',
    'sad': 'udaas', 'dukhii': 'udaas', 'dukhi': 'udaas',
    'angry': 'ghussa', 'ghusa': 'ghussa', 'ghussa': 'ghussa',
    'love': 'pyaar', 'luv': 'pyaar', 'loov': 'pyaar', 'lov': 'pyaar',

    # Misc common
    'abhi': 'abhi', 'abhii': 'abhi', 'abi': 'abhi',
    'kal': 'kal', 'aj': 'aaj', 'aaj': 'aaj', 'ajj': 'aaj',
    'phir': 'phir', 'fir': 'phir', 'phr': 'phir',
    'chahiye': 'chahiye', 'chaiye': 'chahiye', 'chahye': 'chahiye',
    'nahi': 'nahi', 'nh': 'nahi', 'ni': 'nahi', 'nahe': 'nahi',
    'mat': 'mat', 'mtt': 'mat',
    'ja': 'jaa', 'jaa': 'jaa', 'jao': 'jao', 'jau': 'jao',
    'kr': 'kar', 'kar': 'kar', 'kra': 'kara', 'krna': 'karna',
    'karna': 'karna', 'krta': 'karta', 'krti': 'kartii',
    'karta': 'karta', 'kartii': 'kartii', 'krtay': 'kartay',

    # Greetings
    'salam': 'salaam', 'slam': 'salaam', 'aslam': 'salaam',
    'walaikum': 'walaikum', 'walikum': 'walaikum',
    'hi': 'hello', 'helo': 'hello', 'heloo': 'hello',
    'bye': 'khuda hafiz', 'byee': 'khuda hafiz', 'bbye': 'khuda hafiz',
    'khuda': 'khuda', 'allah': 'allah',

    # Misc.
    'achaacha': 'acha', 'bohatbohat': 'bohat', 'achaah': 'acha',
    'shukriya': 'thanks', 'shkria': 'thanks', 'shukria': 'thanks',
    'thnks': 'thanks', 'ty': 'thanks', 'tq': 'thanks',
}


# Cleaning Function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)   # remove URLs, mentions, hashtags
    text = emoji.replace_emoji(text, replace="")            # remove emojis
    text = re.sub(r"[^a-zA-Z\s]", "", text)                 # keep only letters
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)              # reduce elongated words (e.g., "bohottt"→"bohott")
    text = re.sub(r"\s+", " ", text).strip()                # remove extra spaces
    return text

# Normalization Function
def normalize_roman_urdu(text):
    tokens = text.split()
    normalized_tokens = [normalization_dict.get(token, token) for token in tokens]
    return " ".join(normalized_tokens)

# Apply the cleaning pipeline
tqdm.pandas()
df["clean_text"] = df["Text"].progress_apply(clean_text)
df["normalized_text"] = df["clean_text"].progress_apply(normalize_roman_urdu)

print("Text Cleaning & Normalization Done!")
df[["Text", "clean_text", "normalized_text"]].head(50)


100%|██████████| 19999/19999 [00:03<00:00, 5637.54it/s]
100%|██████████| 19999/19999 [00:00<00:00, 30646.12it/s]


Text Cleaning & Normalization Done!


Unnamed: 0,Text,clean_text,normalized_text
0,yeh fair game nai thi I don’t like it,yeh fair game nai thi i dont like it,yeh fair game nahi thi i dont like it
1,Nasir mama ban gya aur jija ji hinduo ko bol r...,nasir mama ban gya aur jija ji hinduo ko bol r...,nasir mama ban gya aur jija ji hinduo ko bol r...
2,: Part - #MissionMuzaffarabad #MasterShifujisM...,part mujhe acre confirm khareedna hai muzaffar...,part mujhe acre confirm khareedna hai muzaffar...
3,O bhai ghoor se daikho vehicle parking ki wjah...,o bhai ghoor se daikho vehicle parking ki wjah...,o bhai ghoor sey daikho vehicle parking ki wja...
4,Yeah hy hi ghatia so kbhi serious na lo is chu...,yeah hy hi ghatia so kbhi serious na lo is chu...,yeah hai hello ghatiya so kbhi serious nahi lo...
5,": ""Hindu baniya ab aukat se bahir nikalnay lag...",hindu baniya ab aukat se bahir nikalnay lag ga...,hindu baniya ab aukat sey bahir nikalnay lag g...
6,Pollen allergy valon k mu py kya chu***a likha...,pollen allergy valon k mu py kya chua likha va hy,pollen allergy valon k mu py kya chua likha va...
7,duniya duniya se bheek mangke jo tum online po...,duniya duniya se bheek mangke jo tum online po...,duniya duniya sey bheek mangke jo tum online p...
8,Bc bijli bajay se band hai. I wanted rain but ...,bc bijli bajay se band hai i wanted rain but n...,bc bijli bajay sey band hai i wanted rain but ...
9,Pakistan me CNIC pe Money Transfer ki option h...,pakistan me cnic pe money transfer ki option h...,pakistan main cnic pe money transfer ki option...


In [None]:
# Create a label map
label_map = {label: idx for idx, label in enumerate(df['Emotion'].unique())}
df['label'] = df['Emotion'].map(label_map)

print("Label Mapping:", label_map)
df[['Emotion', 'label']].drop_duplicates().head()


Label Mapping: {'Anger': 0, 'Happy': 1, 'Fear': 2, 'Surprise': 3, 'Sad': 4, 'Neutral': 5}


Unnamed: 0,Emotion,label
0,Anger,0
2413,Happy,1
5292,Fear,2
5452,Surprise,3
5619,Sad,4


In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']   # keeps class balance
)

print("Training size:", len(train_df))
print("Testing size:", len(test_df))


Training size: 15999
Testing size: 4000


In [None]:
print("Sample from training set:")
print(train_df[['Text', 'normalized_text', 'Emotion']].sample(5))

print("\nUnique Labels:", df['Emotion'].unique())


Sample from training set:
                                                    Text  \
12537  Kya se kya hogya dekhte dekhte song sunta hoga ab   
5930   Assalam O Alaikum Sir. Main Ek Adna Sa Banda H...   
18035  Aap logon pay tou qatal wajib hay Hazrat Abu-B...   
11145  me samsung galaxy grand prime plus sale krna c...   
18767                                Popular bhi dekh lo   

                                         normalized_text  Emotion  
12537  kya sey kya hogya dekhte dekhte song sunta hog...  Neutral  
5930   assalam o alaikum sir main ek adna sa banda ho...      Sad  
18035  aap logon pay tou qatal wajib hay hazrat abuba...    Anger  
11145  main samsung galaxy grand prime plus sale karn...  Neutral  
18767                                popular bhi dekh lo  Neutral  

Unique Labels: ['Anger' 'Happy' 'Fear' 'Surprise' 'Sad' 'Neutral']


# Model 1 - Baseline (XLM-R) Fine-tune xlm-roberta-base on normalized_text.

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

def tokenize(batch):
    return tokenizer(batch['normalized_text'], padding=True, truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/15999 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
num_labels = len(le.classes_)

In [None]:
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./xlmr_baseline",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=False,  # cannot use
    metric_for_best_model="f1",    # will be used manually after training
    report_to=[],
)


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)

Step,Training Loss
50,1.3422
100,1.1747
150,1.1485
200,1.0145
250,1.0086
300,1.0292
350,0.9627
400,0.9694
450,0.9186
500,0.9756


Evaluation metrics: {'eval_loss': 0.808652937412262, 'eval_accuracy': 0.703, 'eval_precision': 0.6749758869221629, 'eval_recall': 0.703, 'eval_f1': 0.6853214345163258, 'eval_runtime': 26.164, 'eval_samples_per_second': 152.882, 'eval_steps_per_second': 9.555, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




```
Evaluation metrics: {
  'eval_loss': 0.808652937412262,
  'eval_accuracy': 0.703,
  'eval_precision': 0.6749758869221629,
  'eval_recall': 0.703,
  'eval_f1': 0.6853214345163258,
  'eval_runtime': 26.164,
  'eval_samples_per_second': 152.882, 'eval_steps_per_second': 9.555,
  'epoch': 3.0
}
```



In [None]:
model.save_pretrained("./xlmr_baseline_model")
tokenizer.save_pretrained("./xlmr_baseline_model")

('./xlmr_baseline_model/tokenizer_config.json',
 './xlmr_baseline_model/special_tokens_map.json',
 './xlmr_baseline_model/sentencepiece.bpe.model',
 './xlmr_baseline_model/added_tokens.json')

# Model 1 – LID-Aware Transformer XLM-R + Custom Language Embeddings

In [None]:

!pip install transformers datasets scikit-learn torch kagglehub --quiet

import kagglehub
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import numpy as np


In [None]:
# Download dataset
path = kagglehub.dataset_download("drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset")
print("Dataset Path:", path)
print("Files:", os.listdir(path))

# Load Excel file
dataset_path = os.path.join(path, "RU-EN-Emotion Dataset.xlsx")
df = pd.read_excel(dataset_path)
print("Shape:", df.shape)
print("Columns:", df.columns)


Downloading from https://www.kaggle.com/api/v1/datasets/download/drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset?dataset_version_number=1...


100%|██████████| 1.45M/1.45M [00:00<00:00, 51.3MB/s]

Extracting files...
Dataset Path: /root/.cache/kagglehub/datasets/drkhurramshahzad/roman-urdu-english-code-switched-emotion-dataset/versions/1
Files: ['RU-EN-Emotion Dataset.xlsx']





Shape: (20000, 4)
Columns: Index(['Unnamed: 0', 'Tweets', 'Level 1', 'Level 2'], dtype='object')


In [None]:
# Quick normalization for Roman Urdu + English
def normalize_text(text):
    text = str(text).lower()                     # lowercase
    text = re.sub(r"[^\w\s]", "", text)         # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)    # remove repeated letters
    text = re.sub(r"\s+", " ", text).strip()    # extra spaces
    return text

df['text'] = df['Tweets'].apply(normalize_text)


In [None]:
df

Unnamed: 0.1,Unnamed: 0,Tweets,Level 1,Level 2,text
0,0,yeh fair game nai thi I don’t like it,emotion,Anger,yeh fair game nai thi i dont like it
1,1,Nasir mama ban gya aur jija ji hinduo ko bol r...,emotion,Anger,nasir mama ban gya aur jija ji hinduo ko bol r...
2,2,: Part - #MissionMuzaffarabad #MasterShifujisM...,emotion,Anger,part missionmuzaffarabad mastershifujismission...
3,3,O bhai ghoor se daikho vehicle parking ki wjah...,emotion,Anger,o bhai ghoor se daikho vehicle parking ki wjah...
4,4,Yeah hy hi ghatia so kbhi serious na lo is chu...,emotion,Anger,yeah hy hi ghatia so kbhi serious na lo is chu...
...,...,...,...,...,...
19995,19995,Mery Pas Poco X3 ha . Kisi me realme pro se ex...,no emotion,Neutral,mery pas poco x3 ha kisi me realme pro se exch...
19996,19996,mery pass oppo a he agar koi bhai exchange kar...,no emotion,Neutral,mery pass oppo a he agar koi bhai exchange kar...
19997,19997,Ager koi realme xt sale krna chahta ho to rabt...,no emotion,Neutral,ager koi realme xt sale krna chahta ho to rabt...
19998,19998,months warranty bluecolourabhi tak use nhi kiy...,no emotion,Neutral,months warranty bluecolourabhi tak use nhi kiy...


In [None]:
# Map sentiment labels to integers
label_mapping = {'Anger':0, 'Happy':1, 'Fear':2, 'Surprise':3, 'Sad':4, 'Neutral':5}
df['sentiment_label'] = df['Level 2'].map(label_mapping)

# Split dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['sentiment_label'])
print("Train:", train_df.shape, "Test:", test_df.shape)


Train: (16000, 6) Test: (4000, 6)


In [None]:
# ---------------------------
# Pronouns
# ---------------------------
pronouns = [
    "mein","tum","aap","woh","yeh","hum","mera","meri","mere","tera","teri","tere",
    "iska","iski","iske","uska","uski","uske","kaun","kya","kis","kahan","kab","kyun"
]

# ---------------------------
# Auxiliary verbs / be verbs
# ---------------------------
aux_verbs = [
    "hai","hain","tha","thi","the","hoo","hoon","ho","raha","rahi","rahe",
    "rahega","rahegi","kar","karti","karte","karenge","kya","tha","thi","the"
]

# ---------------------------
# Common verbs
# ---------------------------
verbs = [
    "aana","jana","karna","dekhna","sona","khana","pina","bolna","sunna",
    "samajhna","seekhna","mangna","lena","dena","chalna","rukna","hona","dikhna",
    "paana","sikhna","khush hona","dukhi hona","pyaar karna","sochna","yaad rakhna",
    "bhoolna","milega","chahta","chahti","chahete","chahiye","lagna","milna","rakhna"
]

# ---------------------------
# Common adjectives / emotions
# ---------------------------
adjectives_emotions = [
    "khush","dukhi","gussa","pyar","pyaar","frustrated","happy","sad",
    "fear","excited","nervous","surprised","thakaa","thaki","thake","achha","bura","acha",
    "buri","mazedaar","boring","interesting","sharminda","embarassed","confused","tired"
]

# ---------------------------
# Common nouns / objects / places
# ---------------------------
nouns = [
    "ghar","dost","kaam","school","college","khel","safar","rasoi","khana","paani",
    "zindagi","mohabbat","beta","beti","gharwalay","samaan","mobile","computer","kitab",
    "gaadi","kapray","paise","saal","mahina","din","raat","subah","shahar","gaon","kamra",
    "dukaan","market","hospital","doctor","nurse","ma","baap","bhai","behen","sala","salaam"
]

# ---------------------------
# Common question words / conjunctions
# ---------------------------
questions_conjunctions = [
    "kaise","kyun","kyunki","agar","jab","to","se","ko","me","mein","pe","ke","ki","ka",
    "aur","ya","magar","lekin","phir","fir","ab","tab","yahaan","wahan","yahan","wahan"
]

# ---------------------------
# Negations / adverbs / frequency
# ---------------------------
neg_adverbs = [
    "nahi","nahin","kabhi","hamesha","bahut","thoda","ziyada","jaldi","der","abhi",
    "pehle","baad","phir","aur","aurat","aadmi","sab","kuch","kisi","kahan","kaun"
]

# ---------------------------
# Common social media / expressions
# ---------------------------
social_expressions = [
    "lol","haha","hehe","wow","ohh","yay","hmm","ok","okay","yes","no","yeh","wo","ye","wo","haan","nah","nahin","yehi","wahi",
    "omg","btw","idk","omfg","lmao","rofl","smh","brb","gtg","ttyl"
]


roman_urdu_words = set(
    pronouns + aux_verbs + verbs + adjectives_emotions + nouns +
    questions_conjunctions + neg_adverbs + social_expressions
)

print(f"Total words in Roman Urdu dictionary: {len(roman_urdu_words)}")


Total words in Roman Urdu dictionary: 208


In [None]:


def get_lid_tags(tokens):
    tags = []
    for token in tokens:
        if token.lower() in roman_urdu_words:
            tags.append(1)  # Roman Urdu
        else:
            tags.append(0)  # English
    return tags


In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
max_len = 64

def tokenize_and_lid(batch_texts):
    encodings = tokenizer(batch_texts, truncation=True, padding="max_length", max_length=max_len)

    lid_tags_batch = []
    for i, text in enumerate(batch_texts):
        words = text.split()
        lid_tags = get_lid_tags(words)

        token_ids = encodings["input_ids"][i]
        subword_lid = []
        word_idx = 0
        for token_id in token_ids:
            token_str = tokenizer.decode([token_id]).replace(" ", "")
            if token_str in ["<s>", "</s>", "<pad>"]:
                subword_lid.append(0)
            else:
                subword_lid.append(lid_tags[min(word_idx, len(lid_tags)-1)])
                word_idx += 1
        lid_tags_batch.append(subword_lid)

    encodings["lid_labels"] = lid_tags_batch
    return encodings


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
class SentimentLIDDataset(Dataset):
    def __init__(self, df):
        self.texts = df['text'].tolist()
        self.labels = df['sentiment_label'].tolist()
        self.encodings = tokenize_and_lid(self.texts)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['sentiment_labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentLIDDataset(train_df)
test_dataset = SentimentLIDDataset(test_df)


In [None]:
class LIDAwareXLMR(nn.Module):
    def __init__(self, model_name="xlm-roberta-base", num_sentiment_labels=6, num_lid_labels=2):
        super(LIDAwareXLMR, self).__init__()
        self.encoder = XLMRobertaModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size

        self.sentiment_classifier = nn.Linear(hidden_size, num_sentiment_labels)
        self.lid_classifier = nn.Linear(hidden_size, num_lid_labels)

    def forward(self, input_ids, attention_mask, lid_labels=None, sentiment_labels=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        pooled_output = sequence_output[:,0]  # CLS token

        sentiment_logits = self.sentiment_classifier(pooled_output)
        lid_logits = self.lid_classifier(sequence_output)

        loss = None
        if sentiment_labels is not None and lid_labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss_sa = loss_fn(sentiment_logits, sentiment_labels)

            loss_fn_tok = nn.CrossEntropyLoss()
            loss_lid = loss_fn_tok(lid_logits.view(-1, lid_logits.size(-1)), lid_labels.view(-1))

            lambda1, lambda2 = 1.0, 0.5
            loss = lambda1 * loss_sa + lambda2 * loss_lid

        return {"loss": loss, "sentiment_logits": sentiment_logits, "lid_logits": lid_logits}


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [None]:
!pip install --upgrade transformers --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import transformers
print(transformers.__version__)


4.57.2


In [None]:


from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./xlmr_lid_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=False,  # cannot use
    metric_for_best_model="f1",    # will be used manually after training
    report_to=[],
)



In [None]:
model = LIDAwareXLMR()

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

NameError: name 'Trainer' is not defined

In [None]:
trainer.train()
results = trainer.evaluate()
print("Evaluation metrics:", results)


NameError: name 'trainer' is not defined