In [1]:
# ===============================
# 1. Install dependencies
# ===============================
!pip install --upgrade \
  "click>=8.1.3" \
  "typeguard>=4.0.1" \
  "scikit-learn<1.7,>=1.2" \
  transformers \
  datasets \
  evaluate \
  sentencepiece \
  tokenizers \
  emoji \
  urduhack

import re
import emoji
import pandas as pd
import numpy as np
import torch

from datasets import Dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import evaluate


Collecting urduhack
  Using cached urduhack-1.1.1-py3-none-any.whl.metadata (7.2 kB)
Collecting tf2crf (from urduhack)
  Using cached tf2crf-0.1.33-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-datasets~=3.1 (from urduhack)
  Using cached tensorflow_datasets-3.2.1-py3-none-any.whl.metadata (4.8 kB)
INFO: pip is looking at multiple versions of urduhack to determine which version is compatible with other requirements. This could take a while.
Collecting urduhack
  Using cached urduhack-1.1.0-py3-none-any.whl.metadata (7.2 kB)
  Using cached urduhack-1.0.3-py3-none-any.whl.metadata (7.1 kB)
  Using cached urduhack-1.0.2-py3-none-any.whl.metadata (7.1 kB)
  Using cached urduhack-1.0.1-py3-none-any.whl.metadata (7.2 kB)
  Using cached urduhack-1.0.0-py3-none-any.whl.metadata (7.4 kB)
  Using cached urduhack-0.3.4-py3-none-any.whl.metadata (7.2 kB)
Collecting transformers
  Using cached transformers-2.11.0-py3-none-any.whl.metadata (45 kB)
Collecting urduhack
  Using cached ur

In [2]:
# ===============================
# 2. Preprocessing functions
# ===============================
# Spelling-variant -> canonical-spelling table for frequent Roman Urdu words.
# Entries that map a word to itself are kept so the table also documents the
# canonical forms.
ROMAN_URDU_MAP = {
    # "nahi" (no / not)
    'nhi': 'nahi',
    'ni': 'nahi',
    # "kya" (what)
    'kya': 'kya',
    # "kyun" (why)
    'kyu': 'kyun',
    'kyun': 'kyun',
    # "hai" (is)
    'hai': 'hai',
    'hy': 'hai',
    'ha': 'hai',
    # "haan" (yes)
    'han': 'haan',
    'haan': 'haan',
}

def clean_text(text):
    """Strip social-media noise from a single piece of text.

    Steps, in order: remove ``http...`` links, remove ``@mentions``, drop the
    ``#`` symbol (the hashtag word itself is kept), convert emoji to their
    textual names via ``emoji.demojize``, and collapse runs of whitespace.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str``. Missing values (``None`` or a float NaN, which is how
            pandas represents an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Without this guard str(None) / str(nan) would yield the literal words
    # "None" / "nan", which would then be treated as real text downstream.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    cleaned = str(text)
    cleaned = re.sub(r'http\S+', '', cleaned)      # remove links
    cleaned = re.sub(r'@\w+', '', cleaned)         # remove mentions
    cleaned = re.sub(r'#', '', cleaned)            # remove hashtag symbol
    cleaned = emoji.demojize(cleaned)              # convert emoji to text
    return re.sub(r'\s+', ' ', cleaned).strip()    # remove extra spaces

def normalize_roman_urdu(text):
    """Replace known Roman Urdu spelling variants with their canonical form.

    The text is split on whitespace; each token is looked up in
    ``ROMAN_URDU_MAP`` by its lower-cased form. Tokens without an entry are
    kept exactly as written (including their original casing). The tokens are
    re-joined with single spaces.
    """
    normalized_tokens = []
    for token in text.split():
        replacement = ROMAN_URDU_MAP.get(token.lower(), token)
        normalized_tokens.append(replacement)
    return ' '.join(normalized_tokens)

def preprocess_df(df):
    """Return a copy of ``df`` with an added ``text_clean`` column.

    ``text_clean`` is the ``text`` column passed through ``clean_text`` and
    then ``normalize_roman_urdu``. The input frame is not modified.
    """
    result = df.copy()
    cleaned = result['text'].apply(clean_text)
    result['text_clean'] = cleaned.apply(normalize_roman_urdu)
    return result

In [3]:
# ===============================
# 3. Load & clean dataset
# ===============================
# The CSV has no header row, so pandas labels the columns positionally.
raw_df = pd.read_csv("/content/Roman Urdu DataSet.csv", header=None)

# Keep only required columns: the first column holds the text and the second
# the sentiment label. Taking an explicit copy (instead of renaming a
# column-subset in place) keeps the later column assignments from triggering
# pandas' chained-assignment warning.
df = raw_df.iloc[:, [0, 1]].copy()
df.columns = ["text", "sentiment"]

# Fix label spelling errors
df["sentiment"] = df["sentiment"].replace({"Neative": "Negative"})

# Apply preprocessing
df = preprocess_df(df)

# Ensure lowercase labels
df["sentiment"] = df["sentiment"].str.lower()

print("Dataset sample after cleaning:")
print(df.head())
print("\nLabel distribution:\n", df['sentiment'].value_counts())

Dataset sample after cleaning:
                                                text sentiment  \
0  Sai kha ya her kisi kay bus ki bat nhi hai lak...  positive   
1                                          sahi bt h  positive   
2                                        Kya bt hai,  positive   
3                                         Wah je wah  positive   
4                               Are wha kaya bat hai  positive   

                                          text_clean  
0  Sai kha ya her kisi kay bus ki bat nahi hai la...  
1                                          sahi bt h  
2                                        kya bt hai,  
3                                         Wah je wah  
4                               Are wha kaya bat hai  

Label distribution:
 sentiment
neutral     8929
positive    6013
negative    5287
Name: count, dtype: int64


In [4]:
# ===============================
# 4. Convert to Hugging Face Dataset
# ===============================
# Tunables for this cell, named so they are easy to find and change.
SEED = 42
TRAIN_SUBSET_SIZE = 2000
TEST_SUBSET_SIZE = 500

# Encode the string sentiment labels as integer class ids; the order of
# `names` defines the id of each label (negative=0, neutral=1, positive=2).
class_labels = ClassLabel(names=["negative", "neutral", "positive"])
hf_dataset = Dataset.from_pandas(
    df[['text_clean', 'sentiment']].rename(columns={'text_clean': 'text'})
)
hf_dataset = hf_dataset.cast_column("sentiment", class_labels)

dataset_split = hf_dataset.train_test_split(test_size=0.2, seed=SEED)

# 💡 SPEED TIP: use only a subset for quick testing
# The requested size is clamped to the split length so a smaller dataset does
# not make `select` fail with an out-of-range index.
for split_name, subset_size in (("train", TRAIN_SUBSET_SIZE), ("test", TEST_SUBSET_SIZE)):
    shuffled_split = dataset_split[split_name].shuffle(seed=SEED)
    keep = min(subset_size, len(shuffled_split))
    dataset_split[split_name] = shuffled_split.select(range(keep))

Casting the dataset:   0%|          | 0/20229 [00:00<?, ? examples/s]

In [5]:
# ===============================
# 5. Tokenization (shorter length)
# ===============================
# Checkpoint used for both the tokenizer (here) and the classifier (later cell).
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Every sequence is padded or truncated to exactly 128 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **encode_options)


# Tokenize every split in batches, then rename the label column to "labels".
dataset_tokenized_split = (
    dataset_split
    .map(tokenize, batched=True)
    .rename_column("sentiment", "labels")
)

# Return only the listed columns, as torch tensors, when rows are accessed.
tensor_columns = ["input_ids", "attention_mask", "labels"]
dataset_tokenized_split.set_format(type="torch", columns=tensor_columns)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [6]:
# ===============================
# 6. Metrics
# ===============================
# Metric objects are loaded once and reused on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class of each example is the arg-max over the last axis of
    the logits. Returns one dict merging the results of both metrics.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {}
    scores.update(
        accuracy_metric.compute(predictions=predicted_ids, references=labels)
    )
    scores.update(
        f1_metric.compute(predictions=predicted_ids, references=labels, average="weighted")
    )
    return scores

In [7]:
# ===============================
# 7. Load model
# ===============================
# Store the human-readable label names in the model config. Without them the
# saved checkpoint reports generic names (LABEL_0, LABEL_1, ...) at inference
# time instead of negative / neutral / positive.
# The names come from the ClassLabel feature used to encode the dataset, so
# the id <-> name mapping matches the integer labels seen during training.
label_names = class_labels.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# ===============================
# 8. Training arguments (optimized for speed)
# ===============================
# fp16 mixed precision is only usable on a CUDA device; enabling it
# unconditionally makes this cell fail on a CPU-only runtime, so it is
# switched on only when a GPU is actually present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",          # evaluate at the end of every epoch
    save_strategy="epoch",          # checkpoint at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # simulate bigger batch (1 x 4 per device)
    num_train_epochs=1,  # quick testing
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    fp16=use_fp16,
    report_to=[]                    # no external experiment trackers
)

In [9]:
# ===============================
# 9. Trainer
# ===============================
# Arguments that are the same regardless of the installed transformers version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized_split["train"],
    eval_dataset=dataset_tokenized_split["test"],
    compute_metrics=compute_metrics,
)

# Recent transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=` (the deprecation warning is visible in this
# cell's previously captured output). Prefer the new name; releases that do
# not know it reject the keyword with a TypeError before doing any work, in
# which case the old name is used.
try:
    trainer = Trainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    trainer = Trainer(tokenizer=tokenizer, **trainer_kwargs)

  trainer = Trainer(


In [10]:
# ===============================
# 10. Train model
# ===============================
# Run fine-tuning. The call is the last expression of the cell, so its return
# value (a TrainOutput with the step count, training loss and runtime metrics)
# is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0249,1.008223,0.496,0.475891


TrainOutput(global_step=500, training_loss=1.0672814331054687, metrics={'train_runtime': 5954.6912, 'train_samples_per_second': 0.336, 'train_steps_per_second': 0.084, 'total_flos': 131556708864000.0, 'train_loss': 1.0672814331054687, 'epoch': 1.0})

In [11]:
# ===============================
# 11. Save model & tokenizer
# ===============================
# Write the fine-tuned model and the tokenizer files into the same directory.
save_dir = "./results"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("✅ Model training complete and saved to ./results")

✅ Model training complete and saved to ./results


In [12]:
# ===============================
# 12. Install demo dependencies (Gradio)
# ===============================

!pip install transformers gradio -q


In [30]:

# ===============================
# 13. Gradio demo (single-input interface)
# ===============================


import gradio as gr

# `sentiment_pipeline` is not defined in any cell visible in this notebook, so
# on a fresh kernel the demo would fail with a NameError on the first request.
# Build it here when it does not exist yet; an already-defined pipeline is
# left untouched.
if "sentiment_pipeline" not in globals():
    from transformers import pipeline

    # NOTE(review): assumes the demo should serve the checkpoint saved to
    # ./results by the training cells above — confirm this is the intended model.
    sentiment_pipeline = pipeline(
        "text-classification", model="./results", tokenizer="./results"
    )


def predict_sentiment(text):
    """Classify ``text`` and format the top prediction for display.

    Args:
        text: The text to classify.

    Returns:
        A string of the form ``"Sentiment: <label> | Confidence: <score>"``,
        with the score rounded to three decimal places.
    """
    top_prediction = sentiment_pipeline(text)[0]
    label = top_prediction['label']
    score = round(top_prediction['score'], 3)
    return f"Sentiment: {label} | Confidence: {score}"

# Build interface: a three-line text box in, plain text out.
demo_input = gr.Textbox(lines=3, placeholder="Type text in any language...")
interface = gr.Interface(
    fn=predict_sentiment,
    inputs=demo_input,
    outputs="text",
    title="🌍 Multilingual Sentiment Analysis",
    description="Enter text in any language to classify sentiment (Positive, Negative, Neutral).",
)

# Launch (share=True gives you public link)
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://24f28c0407f1272dd6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [31]:
import gradio as gr

# `sentiment_pipeline` is not defined in any cell visible in this notebook, so
# on a fresh kernel the demo would fail with a NameError on the first request.
# Build it here when it does not exist yet; an already-defined pipeline is
# left untouched.
if "sentiment_pipeline" not in globals():
    from transformers import pipeline

    # NOTE(review): assumes the demo should serve the checkpoint saved to
    # ./results by the training cells above — confirm this is the intended model.
    sentiment_pipeline = pipeline(
        "text-classification", model="./results", tokenizer="./results"
    )


def predict_sentiment(text, history=None):
    """Classify each non-empty line of ``text`` and extend the history.

    The history is passed in and returned instead of living in a module-level
    list: the app is launched with a public share link, and a global list
    would be shared by every visitor, showing each user the texts submitted
    by all the others.

    Args:
        text: Input text; every non-blank line is classified separately.
        history: Result strings from earlier predictions in this session.
            ``None`` is treated as an empty history.

    Returns:
        A tuple ``(current_result, history_text, updated_history)``: the
        newline-joined results for this call, the newline-joined full history,
        and the updated history list.
    """
    # Copy so the caller's list is never mutated in place.
    updated_history = list(history or [])
    lines = [line.strip() for line in text.split("\n") if line.strip()]  # Remove empty lines

    output_lines = []
    for line in lines:
        results = sentiment_pipeline(line)
        label = results[0]['label']
        score = round(results[0]['score'], 3)
        output_lines.append(f"{line} → Sentiment: {label} | Confidence: {score}")

    updated_history.extend(output_lines)

    current_result = "\n".join(output_lines)
    history_text = "\n".join(updated_history)
    return current_result, history_text, updated_history


def clear_history():
    """Reset the current prediction box, the history box and the stored history."""
    return "", "", []


# Build interface
with gr.Blocks() as interface:
    gr.Markdown("## 🌍 Multilingual Sentiment Analysis\nEnter multiple lines of text. Each line will be classified separately.")

    # Per-session storage for the prediction history.
    history_state = gr.State([])

    with gr.Row():
        text_input = gr.Textbox(lines=5, placeholder="Type text (one sentence per line)...", label="Input Text")

    with gr.Row():
        current_output = gr.Textbox(label="Current Prediction", lines=5)
        history_output = gr.Textbox(label="Prediction History", lines=10)

    with gr.Row():
        submit_btn = gr.Button("Predict")
        clear_btn = gr.Button("Clear History")

    submit_btn.click(
        predict_sentiment,
        inputs=[text_input, history_state],
        outputs=[current_output, history_output, history_state],
    )
    clear_btn.click(
        clear_history,
        inputs=None,
        outputs=[current_output, history_output, history_state],
    )

interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a1f98c8b8e03e89ba0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


