In [1]:
!git clone https://github.com/adelelwan24/Arabic-Dialect-Classification.git

fatal: destination path 'Arabic-Dialect-Classification' already exists and is not an empty directory.


In [39]:
%%capture
!pip install transformers torch arabert

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

In [41]:
train = pd.read_csv('/kaggle/working/Arabic-Dialect-Classification/cleaned data/cleaned_train.csv')
test  = pd.read_csv('/kaggle/working/Arabic-Dialect-Classification/cleaned data/cleaned_test.csv')

train = train.astype(str)
test  = test.astype(str)

train.shape, test.shape

((118084, 2), (29533, 2))

In [42]:
train_EG = train[train['Dialect'] == 'EG'].sample(9224, random_state=2024)
train_LY = train[train['Dialect'] == 'LY'].sample(9224, random_state=2024)
train_LB = train[train['Dialect'] == 'LB'].sample(9224, random_state=2024)
train_SD = train[train['Dialect'] == 'SD'].sample(9224, random_state=2024)
train_MA = train[train['Dialect'] == 'MA'].sample(9224, random_state=2024)

In [43]:
df = pd.concat([train_EG, train_LY, train_LB,train_SD, train_MA])
print(df.shape)
df['Dialect'].value_counts()

(46120, 2)


Dialect
EG    9224
LY    9224
LB    9224
SD    9224
MA    9224
Name: count, dtype: int64

In [44]:
from arabert.preprocess import ArabertPreprocessor

model_name="bert-base-arabertv02"
arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
arabert_prep.preprocess(text)
# >>>"و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري"


'ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري'

In [45]:
label_enc = LabelEncoder()

In [46]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample Arabic data
texts = list(df['Text'].apply(arabert_prep.preprocess))
labels = list(label_enc.fit_transform(df['Dialect']))
print(len(texts), len(labels))

assert len(texts) == len(labels), "Each text must have a label."

# Load pre-trained AraBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02')

# Print relevant attributes
print(f"Model Max Length: {tokenizer.model_max_length}")
print(f"Padding Token: {tokenizer.pad_token}")
print(f"Padding Token ID: {tokenizer.pad_token_id}")
print(f"Vocabulary Size: {tokenizer.vocab_size}")

# Tokenize the data
inputs = tokenizer(texts, return_tensors='pt', max_length=tokenizer.model_max_length, 
                   truncation=True, padding='max_length', return_token_type_ids=False)

# print(inputs['input_ids'][:, :15])
# print(tokenizer.decode(inputs['input_ids'][0]))


# Create PyTorch dataset
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(labels))

# Split dataset into training and validation sets
train_dataset, val_dataset = train_test_split(dataset, test_size=0.1)

# Create DataLoader for training and validation sets
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)


46120 46120
Model Max Length: 512
Padding Token: [PAD]
Padding Token ID: 0
Vocabulary Size: 64000


In [47]:
tokenizer.tokenize(arabert_prep.preprocess('السلام عليكم ! كيف الحال يا شباب؟ ..'))

['السلام', 'عليكم', '!', 'كيف', 'الحال', 'يا', 'شباب', '؟', '.', '.']

In [48]:
for idx in range(4):
    print(texts[idx])
    print(tokenizer.tokenize(texts[idx]))
    print("-" * 20)

عامه الشعب : شوفوا الفلوس الشهره الحلاوه
['عامه', 'الشعب', ':', 'شوف', '##وا', 'الفلوس', 'الشهر', '##ه', 'الحل', '##او', '##ه']
--------------------
شريكه طبعا وقولتلك ' مباعه وبالقانون ، وشريكه بدايه بالسكوت وتعمل نايمه ومش جديد زمان . افتكر الترشح لنواب مش اتحاد ' سمعه اعضاءه سبقاه ! قياس بقي وتطبيق عالاقل اهميه نتوقع نتيجته ايه يعني ! ؟ نسكت احسن وبلاش نفتكر , الكلام مر . اشتري اعصابك
['شريكه', 'طبعا', 'وقول', '##تل', '##ك', "'", 'مبا', '##عه', 'وبال', '##قان', '##ون', '،', 'وشريك', '##ه', 'بدا', '##يه', 'بالس', '##كوت', 'وتعمل', 'نايم', '##ه', 'ومش', 'جديد', 'زمان', '.', 'افت', '##كر', 'الترشح', 'لنواب', 'مش', 'اتحاد', "'", 'سمعه', 'اعضاء', '##ه', 'سبق', '##اه', '!', 'قياس', 'بقي', 'وتطبيق', 'عال', '##اق', '##ل', 'اهم', '##يه', 'نتوقع', 'نتيجته', 'ايه', 'يعني', '!', '؟', 'نس', '##كت', 'احسن', 'وبلا', '##ش', 'نفت', '##كر', ',', 'الكلام', 'مر', '.', 'اشت', '##ري', 'اع', '##صاب', '##ك']
--------------------
ده مفيش اسرع الايام
['ده', 'مفيش', 'اسرع', 'الايام']
--------------------
اسمع

In [49]:
# Get the vocabulary
vocab = tokenizer.get_vocab()

# Print the size of the vocabulary
print(f"Vocabulary Size: {len(vocab)}")

# Print a sample of the vocabulary (first 20 items)
for token, token_id in list(vocab.items())[2000:2010]:
    print(f"Token: {token}, Token ID: {token_id}")

Vocabulary Size: 64000
Token: المتش, Token ID: 25311
Token: لشبونة, Token ID: 26535
Token: توزيعات, Token ID: 47028
Token: تجعلهم, Token ID: 49514
Token: [UNUSED_1290], Token ID: 61290
Token: وامكان, Token ID: 54964
Token: [UNUSED_1378], Token ID: 61378
Token: تزال, Token ID: 4038
Token: الشامي, Token ID: 24976
Token: تزوجت, Token ID: 23161


In [50]:
tokens = vocab.keys()

In [51]:
#### Assert that Punctuation exist in the vocab
for token in tokens:
    if token in """ !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~. """:
        print(token, end='\t')

!	?	|	"	@	{	-	]	=	\	^	<	.	%	#	`	,	+	$	'	}	;	*	(	~	[	_	)	>	:	

In [52]:
#### Assert that English Letters exist in the vocab
for token in tokens:
    if token in """ qwertyuiopasdfghjklzxcvbnm """:
        print(token, end='\t')

a	h	d	j	q	g	t	i	r	z	n	m	s	p	l	e	k	f	y	w	o	v	b	u	c	x	

In [26]:
# Load pre-trained AraBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('aubmindlab/bert-base-arabertv02', num_labels=len(label_enc.classes_))

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
# # Load pre-trained AraBERT model for sequence classification
# model = AutoModelForSequenceClassification.from_pretrained('aubmindlab/bert-base-arabertv02', num_labels=len(label_enc.classes_))

# # Move model to GPUs
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# # Enable multi-GPU training
# if torch.cuda.device_count() > 1:
#     print(f'Cuda count : {torch.cuda.device_count()}')
#     model = torch.nn.DataParallel(model)

In [25]:
# Access the model configuration
model_config = model.config

# Print relevant attributes
print(f"Model Max Length: {model_config.max_position_embeddings}")
print(f"Number of Labels: {model_config.num_labels}")
print(f"Hidden Size: {model_config.hidden_size}")
print(f"Vocab Size: {model_config.vocab_size}")
print(f"Type Vocabulary Size: {model_config.type_vocab_size}")

Model Max Length: 512
Number of Labels: 5
Hidden Size: 768
Vocab Size: 64000
Type Vocabulary Size: 2


In [56]:
from torch.optim import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        # Clear previously calculated gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
    
    # Validation
    model.eval()
    eval_loss = 0
    eval_steps = 0
    predictions, true_labels = [], []
    
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs.loss
            logits = outputs.logits
            
        eval_loss += loss.item()
        eval_steps += 1
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(inputs['labels'].cpu().numpy())
    
    avg_eval_loss = eval_loss / eval_steps
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='macro')
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Validation Loss: {avg_eval_loss}")
    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation F1 Score: {f1}")
    
    # Save the model
    model_save_path = f"arabert_model_epoch_{epoch + 1}"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")


Epoch 1/3
Validation Loss: 0.6546125508573688
Validation Accuracy: 0.7549869904596704
Validation F1 Score: 0.7531040379663219
Model saved to arabert_model_epoch_1
Epoch 2/3
Validation Loss: 0.6301433282098903
Validation Accuracy: 0.769297484822203
Validation F1 Score: 0.7699146961751101
Model saved to arabert_model_epoch_2
Epoch 3/3
Validation Loss: 0.62845909396958
Validation Accuracy: 0.7862098872506504
Validation F1 Score: 0.7855138673981219
Model saved to arabert_model_epoch_3


In [31]:
# Example Arabic test data
test_texts = list(test['Text'])
test_labels = list(label_enc.transform(test['Dialect']))

# Tokenize test data
test_inputs = tokenizer(test_texts, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], torch.tensor(test_labels))
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=8)



In [57]:
# Evaluate the model
model.eval()
test_predictions, test_true_labels = [], []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    test_predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    test_true_labels.extend(inputs['labels'].cpu().numpy())

Test Accuracy: 0.8021535231774625
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.82      0.86     11525
           1       0.73      0.90      0.81      5523
           2       0.84      0.75      0.79      7296
           3       0.70      0.77      0.74      2308
           4       0.65      0.69      0.67      2881

    accuracy                           0.80     29533
   macro avg       0.76      0.79      0.77     29533
weighted avg       0.81      0.80      0.80     29533



In [58]:
test_accuracy = accuracy_score(test_true_labels, test_predictions)
class_report = classification_report(test_true_labels, test_predictions)
f1 = f1_score(true_labels, predictions, average='macro')



print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {f1}")
print(f"Classification Report: \n{class_report}")

Test Accuracy: 0.8021535231774625
Test F1 Score: 0.7855138673981219
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.82      0.86     11525
           1       0.73      0.90      0.81      5523
           2       0.84      0.75      0.79      7296
           3       0.70      0.77      0.74      2308
           4       0.65      0.69      0.67      2881

    accuracy                           0.80     29533
   macro avg       0.76      0.79      0.77     29533
weighted avg       0.81      0.80      0.80     29533



-----

In [53]:
# Load pre-trained AraBERT model for sequence classification
model_loaded = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/arabert_model_fine_tuned', num_labels=len(label_enc.classes_))

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_loaded.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [54]:
# Evaluate the model
model_loaded.eval()
test_predictions, test_true_labels = [], []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    
    with torch.no_grad():
        outputs = model_loaded(**inputs)
        logits = outputs.logits
    
    test_predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    test_true_labels.extend(inputs['labels'].cpu().numpy())

In [55]:
test_accuracy = accuracy_score(test_true_labels, test_predictions)
class_report = classification_report(test_true_labels, test_predictions)
f1 = f1_score(test_true_labels, test_predictions, average='macro')



print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {f1}")
print(f"Classification Report: \n{class_report}")

Test Accuracy: 0.8021535231774625
Test F1 Score: 0.9248752183643715
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.82      0.86     11525
           1       0.73      0.90      0.81      5523
           2       0.84      0.75      0.79      7296
           3       0.70      0.77      0.74      2308
           4       0.65      0.69      0.67      2881

    accuracy                           0.80     29533
   macro avg       0.76      0.79      0.77     29533
weighted avg       0.81      0.80      0.80     29533



In [56]:
from torch.optim import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 1

for epoch in range(epochs):
    model_loaded.train()
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        # Clear previously calculated gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model_loaded(**inputs)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
    
    # Validation
    model_loaded.eval()
    eval_loss = 0
    eval_steps = 0
    predictions, true_labels = [], []
    
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model_loaded(**inputs)
            loss = outputs.loss
            logits = outputs.logits
            
        eval_loss += loss.item()
        eval_steps += 1
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(inputs['labels'].cpu().numpy())
    
    avg_eval_loss = eval_loss / eval_steps
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='macro')
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Validation Loss: {avg_eval_loss}")
    print(f"Validation Accuracy: {accuracy}")
    print(f"Validation F1 Score: {f1}")
    
    # Save the model
    model_save_path = f"arabert_model_fine_tuned_{epoch + 1}"
    model_loaded.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")


Epoch 1/1
Validation Loss: 0.2531024248494994
Validation Accuracy: 0.9180398959236774
Validation F1 Score: 0.9176213796319722
Model saved to arabert_model_fine_tuned_1


In [57]:
# Evaluate the model
model_loaded.eval()
test_predictions, test_true_labels = [], []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    
    with torch.no_grad():
        outputs = model_loaded(**inputs)
        logits = outputs.logits
    
    test_predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    test_true_labels.extend(inputs['labels'].cpu().numpy())

In [59]:
test_accuracy = accuracy_score(test_true_labels, test_predictions)
class_report = classification_report(test_true_labels, test_predictions)
f1 = f1_score(test_true_labels, test_predictions, average='macro')



print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {f1}")
print(f"Classification Report: \n{class_report}")

Test Accuracy: 0.8021535231774625
Test F1 Score: 0.7714423177358425
Classification Report: 
              precision    recall  f1-score   support

           0       0.89      0.82      0.86     11525
           1       0.73      0.90      0.81      5523
           2       0.84      0.75      0.79      7296
           3       0.70      0.77      0.74      2308
           4       0.65      0.69      0.67      2881

    accuracy                           0.80     29533
   macro avg       0.76      0.79      0.77     29533
weighted avg       0.81      0.80      0.80     29533



In [36]:
!zip -r /kaggle/working/arabert_model_fine_tuned.zip  /kaggle/working/arabert_model_fine_tuned

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/arabert_model_fine_tuned/ (stored 0%)
  adding: kaggle/working/arabert_model_fine_tuned/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/arabert_model_fine_tuned/tokenizer_config.json (deflated 79%)
  adding: kaggle/working/arabert_model_fine_tuned/vocab.txt (deflated 65%)
  adding: kaggle/working/arabert_model_fine_tuned/tokenizer.json (deflated 74%)
  adding: kaggle/working/arabert_model_fine_tuned/config.json (deflated 53%)
  adding: kaggle/working/arabert_model_fine_tuned/model.safetensors (deflated 7%)


In [37]:
from IPython.display import FileLink 
# FileLink(r'arabert_classification.zip')
FileLink(r'arabert_model_fine_tuned.zip')


