# Urdu IMDB Reviews Sentiment Analysis (M. Yousaf)

In [1]:
#install necessary packages

!pip install pandas scikit-learn datasets transformers torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# Necessary Imports & Load Dataset

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Load the Urdu dataset from Colab
file_path = '/content/imdb_urdu_reviews_train.csv'
data = pd.read_csv(file_path)

# sentiment column string to integer labels
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [11]:
data.head(8)

Unnamed: 0,review,sentiment
0,میں نے اسے 80 کی دہائی کے وسط میں ایک کیبل گائ...,1
1,چونکہ میں نے 80 کی دہائی میں انسپکٹر گیجٹ کارٹ...,0
2,ایک ایسے معاشرے کی حالت کے بارے میں تعجب کرتا ...,1
3,مفید البرٹ پیون کی طرف سے ایک اور ردی کی ٹوکری...,0
4,یہ کولمبو ہے جس کی ہدایتکاری اپنے کیریئر کے اب...,1
5,مجھے اس فلم کا بیشتر حصہ پسند آیا۔ جیسا کہ دوس...,0
6,ٹھیک ہے ، شاید یہ آسکر کا مستحق نہیں ہے۔ یا گو...,1
7,میں نے اسے سائنس فائی چینل پر دیکھا۔ یہ پہلے و...,0


# Data Preparation

In [12]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas (test_data)

In [13]:
# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['review'], truncation=True, padding='max_length', max_length=128)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Add label column for PyTorch compatibility
train_dataset = train_dataset.rename_column("sentiment", "label")
test_dataset = test_dataset.rename_column("sentiment", "label")

Map:   0%|          | 0/28000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [21]:
print(train_dataset)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch',  columns=['input_ids', 'attention_mask', 'label'])

train_dataset[0]['attention_mask']

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 28000
})


tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])

# Model Configuration - Parameter Setting

In [23]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    evaluation_strategy="epoch",    # Evaluate each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",            # directory for logs
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [24]:
# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./urdu_imdb_revivews_sentiment")
tokenizer.save_pretrained("./urdu_imdb_revivews_sentiment")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.5955,0.87208


('./urdu_imdb_revivews_sentiment/tokenizer_config.json',
 './urdu_imdb_revivews_sentiment/special_tokens_map.json',
 './urdu_imdb_revivews_sentiment/vocab.txt',
 './urdu_imdb_revivews_sentiment/added_tokens.json',
 './urdu_imdb_revivews_sentiment/tokenizer.json')

# Model Evaluation

In [25]:
# Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

Evaluation Results: {'eval_loss': 0.8720802664756775, 'eval_runtime': 51.7039, 'eval_samples_per_second': 232.091, 'eval_steps_per_second': 116.045, 'epoch': 1.0}


In [26]:
import pandas as pd
import torch

# Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Function to predict sentiment of Urdu text
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1).item()
    return "positive" if predictions == 1 else "negative"

Evaluation Results: {'eval_loss': 0.8720802664756775, 'eval_runtime': 82.465, 'eval_samples_per_second': 145.516, 'eval_steps_per_second': 72.758, 'epoch': 1.0}


In [29]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1).item()
    return "positive" if predictions == 1 else "negative"

# Model Prediction & Testing Accuracy

In [30]:

# Load test data from CSV
test_file = "/content/imdb_urdu_reviews_test.csv"  # Replace with your file path
df = pd.read_csv(test_file)

# Ensure the file has the required columns
#if "text" not in df.columns or "label" not in df.columns:
#    raise ValueError("CSV file must contain 'text' and 'label' columns.")

# Map labels to match model output (adjust as needed)
#label_mapping = {"positive": 1, "negative": 0}  # Update based on your model's label mapping
#df["label"] = df["label"].map(label_mapping)

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Predict sentiments and calculate accuracy
df["predicted_label"] = df["review"].apply(lambda x: 1 if predict_sentiment(x) == "positive" else 0)
accuracy = (df["sentiment"] == df["predicted_label"]).mean()

print(f"Accuracy on test data: {accuracy:.2%}")

# Save results to a new CSV file
output_file = "test_results.csv"
df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")


Accuracy on test data: 72.84%
Predictions saved to test_results.csv
