# **Using the IMDB dataset to train the mBERT model**

In [None]:
import kagglehub

# Fetch the most recent version of the Urdu IMDB reviews dataset and
# report the local directory that kagglehub returned for it.
path = kagglehub.dataset_download(
    "akkefa/imdb-dataset-of-50k-movie-translated-urdu-reviews"
)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-translated-urdu-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-translated-urdu-reviews


In [None]:
# =====================================
# 📘 Step 1: Install Libraries
# =====================================
# Shell escape: installs the four packages listed below. No versions are
# pinned, so the versions that get installed depend on when this cell runs.
!pip install transformers datasets torch scikit-learn

# =====================================
# 📘 Step 2: Import libraries
# =====================================
import pandas as pd
import numpy as np
import torch
import time
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [None]:
import kagglehub
import pandas as pd
import os

# 🔹 Step 1: Download dataset directly from Kaggle
path = kagglehub.dataset_download("akkefa/imdb-dataset-of-50k-movie-translated-urdu-reviews")
print("✅ Dataset downloaded successfully!")
print("📂 Path to dataset files:", path)

# 🔹 Step 2: Find the CSV files inside that folder
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the choice of "first CSV" reproducible from one runtime to the next.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError("❌ No CSV file found in dataset folder. Please check Kaggle dataset structure.")

csv_path = os.path.join(path, csv_files[0])
print("📄 Found CSV file:", csv_path)
if len(csv_files) > 1:
    # Only one file is loaded; say so instead of silently ignoring the rest.
    print("ℹ️ Other CSV files in the folder (not loaded):", csv_files[1:])

# 🔹 Step 3: Load the dataset
data = pd.read_csv(csv_path)

# 🔹 Step 4: Keep only relevant columns
if 'review' in data.columns and 'sentiment' in data.columns:
    # .copy() gives an independent frame, so adding the 'label' column below
    # does not write into a slice of the frame returned by read_csv.
    data = data[['review', 'sentiment']].copy()
else:
    raise ValueError(f"❌ Columns not found. Available columns: {data.columns.tolist()}")

# 🔹 Step 5: Convert sentiment text to numeric labels
data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Any sentiment outside {'positive', 'negative'} maps to NaN; fail here with a
# clear message rather than passing NaN labels on to the later cells.
unmapped_rows = data['label'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"❌ Unexpected sentiment values: {data.loc[unmapped_rows, 'sentiment'].unique().tolist()}"
    )

print("\n✅ Dataset Loaded Successfully!")
print(data.head())
print("\nTotal Samples:", len(data))
print("\nSentiment Counts:")
print(data['sentiment'].value_counts())


Downloading from https://www.kaggle.com/api/v1/datasets/download/akkefa/imdb-dataset-of-50k-movie-translated-urdu-reviews?dataset_version_number=6...


100%|██████████| 32.3M/32.3M [00:00<00:00, 125MB/s] 

Extracting files...





✅ Dataset downloaded successfully!
📂 Path to dataset files: /root/.cache/kagglehub/datasets/akkefa/imdb-dataset-of-50k-movie-translated-urdu-reviews/versions/6
📄 Found CSV file: /root/.cache/kagglehub/datasets/akkefa/imdb-dataset-of-50k-movie-translated-urdu-reviews/versions/6/imdb_urdu_reviews_test.csv

✅ Dataset Loaded Successfully!
                                              review sentiment  label
0  یہ بے گھر خواتین کے بارے میں ایک دستاویزی فلم ...  negative      0
1  بالکل بھی اچھ ،ی کام نہیں کیا گیا ، پوری فلم ص...  negative      0
2  یہ عجیب بات ہے کہ کچھ لوگوں کا کیا حشر ہوتا ہے...  negative      0
3  اور یہ خاص طور پر وکیلوں اور پولیس اہلکاروں کے...  positive      1
4  پہلے ، ایک وضاحت: میری سرخی کے باوجود ، میں اس...  positive      1

Total Samples: 10000

Sentiment Counts:
sentiment
positive    5082
negative    4918
Name: count, dtype: int64


In [None]:
# 📘 Step 4: Hold out 20% of the reviews for testing
# =====================================
from sklearn.model_selection import train_test_split

# Inputs are plain Python lists of review texts and their 0/1 labels;
# random_state is fixed so the same split is produced on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['review'].to_list(),
    data['label'].to_list(),
    random_state=42,
    test_size=0.2,
)


In [None]:
# 📘 Step 5: Load tokenizer (mBERT)
# =====================================
# Same checkpoint name as the classification model loaded later on.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased"
)

# Tokenize Urdu text
def tokenize(batch, max_length=128):
    """Tokenize the ``text`` entry of a batch with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode.
        max_length: Length that every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here, so
            existing ``Dataset.map(tokenize, batched=True)`` calls behave
            exactly as before.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.
    """
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=max_length)

# Wrap each split in a Dataset and tokenize it batch by batch.
train_data = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
test_data = Dataset.from_dict({"text": test_texts, "label": test_labels}).map(tokenize, batched=True)

# Expose only the model inputs and the label, in torch format, on both splits.
for _split in (train_data, test_data):
    _split.set_format("torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# 📘 Step 6: Load model
# =====================================
# Sequence-classification model built from the multilingual BERT checkpoint,
# configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
)

# =====================================
# 📘 Step 7: Define metrics
# =====================================
def compute_metrics(eval_pred):
    """Turn an evaluation prediction into accuracy, precision, recall and F1.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are computed with ``average='binary'``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    # The fourth element of the result is not reported; zip() stops after three.
    metrics.update(zip(("precision", "recall", "f1"), prf))
    return metrics


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Shell escape: upgrades transformers, with pip output suppressed.
# NOTE(review): this runs after transformers was already imported and a model
# was loaded in earlier cells; presumably the runtime has to be restarted for
# the upgraded version to be used — confirm, or move this to the first cell.
!pip install -U transformers --quiet


In [None]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping options for the mBERT fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # No eval_strategy is set, so no evaluation is scheduled during training;
    # metrics come from the explicit trainer.evaluate() calls later on.
    do_eval=True,
    save_total_limit=2,             # Keep only the 2 most recent checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,              # Log the training loss every 100 steps
    # Turn off external experiment trackers. With the default setting an
    # installed wandb asks for an API key interactively, which stalls
    # "Run All" and writes account details into the cell output.
    report_to="none",
)


In [None]:
# 📘 Step 9: Trainer setup
# =====================================
# Bundle the model, its training configuration, both tokenized splits and the
# metric function into one Trainer. test_data is what evaluate() scores.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_data,
    train_dataset=train_data,
)


In [None]:
# 📘 Step 10: Fine-tune and measure time
# =====================================
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# unlike time.time(), it is not affected by system clock adjustments.
start_time = time.perf_counter()
trainer.train()
end_time = time.perf_counter()

# Elapsed fine-tuning time in seconds; it is converted to minutes when printed.
training_time = end_time - start_time
print(f"\n⏱️ Training Time: {training_time/60:.2f} minutes")

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mf23ari77[0m ([33mf23ari77-aror-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.6883
200,0.6908
300,0.655
400,0.6414
500,0.5955
600,0.5916
700,0.6058
800,0.5573
900,0.5656
1000,0.5526



⏱️ Training Time: 12.85 minutes


In [None]:
# 📘 Step 11: Evaluate model
# =====================================
# Score the trainer's eval dataset, then list every returned metric to
# four decimal places.
results = trainer.evaluate()

print("\n✅ Evaluation Results:")
for key in results:
    value = results[key]
    print(f"{key}: {value:.4f}")



✅ Evaluation Results:
eval_loss: 0.5486
eval_accuracy: 0.7370
eval_precision: 0.7224
eval_recall: 0.7940
eval_f1: 0.7565
eval_runtime: 14.4810
eval_samples_per_second: 138.1120
eval_steps_per_second: 17.2640
epoch: 2.0000


In [None]:
# =====================================
# 📘 Step 11: Evaluate Model Performance
# =====================================
# Runs the evaluation again and prints the four headline metrics as
# percentages, together with the fine-tuning time measured earlier.
results = trainer.evaluate()

# Pull out the metrics; a key missing from the results is reported as 0.
accuracy, precision, recall, f1 = (
    results.get(metric_key, 0)
    for metric_key in ("eval_accuracy", "eval_precision", "eval_recall", "eval_f1")
)

# ✅ Emit the whole summary in one go, one entry per output line
print("\n".join([
    "\n🎯 Model Performance Summary",
    "=" * 35,
    f"✅ Accuracy   : {accuracy * 100:.2f}%",
    f"✅ Precision  : {precision * 100:.2f}%",
    f"✅ Recall     : {recall * 100:.2f}%",
    f"✅ F1 Score   : {f1 * 100:.2f}%",
    f"⏱️ Training Time : {training_time / 60:.2f} minutes",
    "=" * 35,
]))



🎯 Model Performance Summary
✅ Accuracy   : 73.70%
✅ Precision  : 72.24%
✅ Recall     : 79.40%
✅ F1 Score   : 75.65%
⏱️ Training Time : 12.85 minutes


# **Now using the SBERT model on the same dataset**

In [None]:
# =====================================
# 📘 Step 1: Install Required Libraries
# =====================================
# Shell escape: quiet install of the four packages below, with no version pins.
!pip install -q sentence-transformers scikit-learn torch pandas

# =====================================
# 📘 Step 2: Import Libraries
# =====================================
import time
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# =====================================
# 📘 Step 3: Load Urdu IMDB Dataset
# =====================================
import os

# "IMDB_Dataset.csv" is not created anywhere in this notebook, so it is
# missing on a fresh runtime. Use it when present (unchanged behaviour);
# otherwise fall back to the Kaggle CSV located earlier in `csv_path`.
sbert_csv_path = "IMDB_Dataset.csv"
if not os.path.exists(sbert_csv_path):
    sbert_csv_path = csv_path
print("📄 Loading reviews from:", sbert_csv_path)

data = pd.read_csv(sbert_csv_path)
# .copy() gives an independent frame, so adding the 'label' column below
# does not write into a slice of the frame returned by read_csv.
data = data[['review', 'sentiment']].copy()
data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Any sentiment outside {'positive', 'negative'} maps to NaN; stop here with a
# clear message instead of handing NaN labels to the classifier.
unmapped_rows = data['label'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"❌ Unexpected sentiment values: {data.loc[unmapped_rows, 'sentiment'].unique().tolist()}"
    )

# Split data into training and testing (80/20, fixed seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['review'].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# =====================================
# 📘 Step 4: Load SBERT Model
# =====================================
# Note: this rebinds `model`, the name the mBERT section used for its classifier.
model = SentenceTransformer(
    'paraphrase-multilingual-MiniLM-L12-v2'
)

# =====================================
# 📘 Step 5: Generate Sentence Embeddings
# =====================================
print("🔹 Generating embeddings... (this may take a few minutes)")

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# unlike time.time(), it is not affected by system clock adjustments.
start_time = time.perf_counter()
train_embeddings = model.encode(train_texts, convert_to_numpy=True, show_progress_bar=True)
test_embeddings = model.encode(test_texts, convert_to_numpy=True, show_progress_bar=True)
# Time spent encoding both splits, in minutes.
embedding_time = (time.perf_counter() - start_time) / 60

# =====================================
# 📘 Step 6: Train Classifier
# =====================================
# Logistic regression fitted on the sentence embeddings.
clf = LogisticRegression(max_iter=1000)

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# unlike time.time(), it is not affected by system clock adjustments.
start_time = time.perf_counter()
clf.fit(train_embeddings, train_labels)
# NOTE: stored in minutes here, whereas the mBERT section stores
# `training_time` in seconds and divides by 60 when printing.
training_time = (time.perf_counter() - start_time) / 60

# =====================================
# 📘 Step 7: Evaluate Model
# =====================================
# Predict labels for the held-out embeddings and score them against the
# true test labels.
preds = clf.predict(test_embeddings)

acc = accuracy_score(test_labels, preds)
prf_scores = precision_recall_fscore_support(test_labels, preds, average='binary')
# The fourth element of the result is not used.
precision, recall, f1, _ = prf_scores

# =====================================
# 📘 Step 8: Display Results
# =====================================
# Emit the whole report in one go, one entry per output line. Both timings
# were already converted to minutes where they were measured.
print("\n".join([
    "\n✅ SBERT Sentiment Classification Results:",
    f"Accuracy: {acc:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    f"⏱️ Embedding Time: {embedding_time:.2f} minutes",
    f"⏱️ Training Time: {training_time:.2f} minutes",
]))


🔹 Generating embeddings... (this may take a few minutes)


Batches:   0%|          | 0/1000 [00:00<?, ?it/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]


✅ SBERT Sentiment Classification Results:
Accuracy: 0.7535
Precision: 0.7575
Recall: 0.7527
F1 Score: 0.7551
⏱️ Embedding Time: 2.07 minutes
⏱️ Training Time: 0.05 minutes
