In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline


pipe = pipeline(
    "text-classification",
    model="CAMeL-Lab/bert-base-arabic-camelbert-msa-did-madar-twitter5",
    device=0  # use GPU (device 0), set -1 for CPU
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [2]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


PyTorch version: 2.8.0+cu129
CUDA available: True
Device name: NVIDIA GeForce RTX 3080


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-did-madar-twitter5")
model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-did-madar-twitter5")



In [4]:
import pandas as pd

df = pd.read_csv("data_v0.1.0.csv")

df.head()

Unnamed: 0,text,dialect
0,ياخي المدرب أختاره والمدرب دخله والمدرب بارك ا...,AE
1,شو الي قاعد يجري فالنصر يا أخوه خسر المباراة س...,AE
2,الي يبحث عن مشكلة الوصل راح يحصلها فالجولان,AE
3,انا مش معترض على تغير عامر الي دخل مكان عامر ا...,AE
4,تراجع مخيف في مستوى الحارس الكبير ماجد ناصر مش...,AE


In [38]:
import random
shuffled_df = df.sample(frac=1,random_state=42)
sentences = shuffled_df["text"]
region_mapping = {
    "AE": "GULF",
    "BH": "GULF",
    "DZ": "NA",
    "EG": "NILE",
    "IQ": "IRAQ",
    "JO": "LEV",
    "KW": "GULF",
    "LB": "LEV",
    "LY": "NA",
    "MA": "NA",
    "OM": "GULF",
    "PL": "LEV",
    "QA": "GULF",
    "SA": "GULF",
    "SD": "NILE",
    "SY": "LEV",
    "TN": "NA",
    "YE": "YEM",
    "MSA": "MSA"
}

# Map the dialect/country code to region
shuffled_df['dialect'] = shuffled_df['dialect'].map(region_mapping)

# Check the result
print(shuffled_df.head())
shuffled_df.head() 

                                                     text dialect
189081                                     لن ينفتح الباب    GULF
302421  استغلوا حبوب الهلوسه والشجاعة والبردقان المخدر...     YEM
337783                       مبروك سمعت إلى باش تجيب بيبي      NA
147454                              ليه يقلبوا سعادين هيك     LEV
281380           بيطلعلو مدعوم وماحدى بيسترجي يقله سدحلقك     LEV


Unnamed: 0,text,dialect
189081,لن ينفتح الباب,GULF
302421,استغلوا حبوب الهلوسه والشجاعة والبردقان المخدر...,YEM
337783,مبروك سمعت إلى باش تجيب بيبي,
147454,ليه يقلبوا سعادين هيك,LEV
281380,بيطلعلو مدعوم وماحدى بيسترجي يقله سدحلقك,LEV


In [47]:
lol = pipe(sentences.tolist()[:10000])


In [48]:
# Mapping from country/dialect label → region
region_mapping = {
    "United_Arab_Emirates": "GULF",
    "Bahrain": "GULF",
    "Algeria": "NA",
    "Egypt": "NILE",
    "Iraq": "IRAQ",
    "Jordan": "LEV",
    "Kuwait": "GULF",
    "Lebanon": "LEV",
    "Libya": "NA",
    "Morocco": "NA",
    "Oman": "GULF",
    "Palestine": "LEV",
    "Qatar": "GULF",
    "Saudi_Arabia": "GULF",
    "Sudan": "NILE",
    "Syria": "LEV",
    "Tunisia": "NA",
    "Yemen": "YEM",
    "Somalia": "SOM",
    "Mauritania": "NA"
}

# Map predicted labels to regions
predicted_regions = [region_mapping[label] for label in pred_labels]

# Example: check unique predicted regions
unique_regions = list(set(predicted_regions))
print(unique_regions)


['GULF', 'SOM', 'NA', 'LEV', 'YEM', 'IRAQ', 'NILE']


In [49]:
import pandas as pd

# Assuming lol is your predictions list from the pipeline
# Example: [{'label': 'Sudan', 'score': 0.1278}, ...]

# Mapping from country/dialect label → region
region_mapping = {
    "United_Arab_Emirates": "GULF",
    "Bahrain": "GULF",
    "Algeria": "NA",
    "Egypt": "NILE",
    "Iraq": "IRAQ",
    "Jordan": "LEV",
    "Kuwait": "GULF",
    "Lebanon": "LEV",
    "Libya": "NA",
    "Morocco": "NA",
    "Oman": "GULF",
    "Palestine": "LEV",
    "Qatar": "GULF",
    "Saudi_Arabia": "GULF",
    "Sudan": "NILE",
    "Syria": "LEV",
    "Tunisia": "NA",
    "Yemen": "YEM",
    "Somalia": "SOM",
    "Mauritania": "NA",
    "Modern Standard Arabic": "MSA",
    "MSA": "MSA"
}

# 1️⃣ Map predicted labels to regions
predicted_regions = [region_mapping[p['label']] for p in lol]

# 2️⃣ Get ground truth regions from shuffled_df
ground_truth = shuffled_df['dialect'].tolist()[:len(predicted_regions)]

# 3️⃣ Compare predictions to ground truth
correct = [pred==true for pred, true in zip(predicted_regions, ground_truth)]

# 4️⃣ Compute accuracy
accuracy = sum(correct) / len(correct)
print(f"Region-level accuracy: {accuracy*100:.2f}%")

# Optional: add to DataFrame for inspection
shuffled_df.loc[:len(predicted_regions)-1, 'Predicted_Region'] = predicted_regions
shuffled_df['Correct'] = shuffled_df['Region'] == shuffled_df['Predicted_Region']

# View the first few rows
print(shuffled_df.head())


Region-level accuracy: 58.12%


ValueError: Must have equal len keys and value when setting with an iterable