# Import Required Libraries

In [22]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from sklearn.metrics import classification_report
import numpy as np


In [23]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dialects-db/cleaned_x_valid.csv
/kaggle/input/dialects-db/dialects_data.csv
/kaggle/input/dialects-db/cleaned_x_train.csv
/kaggle/input/dialects-db/cleaned_x_test.csv
/kaggle/input/dialects-db/dialects_database.db


# Load Data

In [24]:
# Load your dataset
train_df = pd.read_csv('/kaggle/input/dialects-db/cleaned_x_train.csv')
valid_df = pd.read_csv('/kaggle/input/dialects-db/cleaned_x_valid.csv')
test_df = pd.read_csv('/kaggle/input/dialects-db/cleaned_x_test.csv')

In [25]:
# Drop rows with NaN values
train_df = train_df.dropna()
valid_df = valid_df.dropna()
test_df = test_df.dropna()

In [26]:
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5) 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenize the Dataset

In [27]:
train_df.head()

Unnamed: 0,cleaned_x_train,y_train
0,اي خبر او تطور من جل الديب حدا يدقلي ما بفتحلو...,1
1,الشي الوحيد المنيح اللي عملو جورج وسوف بحياته ...,1
2,معلهش معلش معلهش معلهش معلهش معلهش معلهش معلهش...,0
3,اول ما خذا البطاقة جا في بالي اياب اليوفي يارب...,2
4,انا صرت نازلة شي خمس مرات من يوم ما نشهرت المل...,1


In [28]:
# Identify text and label columns
train_text_col = 'cleaned_x_train'  
train_label_col = 'y_train'  
valid_text_col = 'cleaned_x_valid'  
valid_label_col = 'y_valid'  
test_text_col = 'cleaned_x_test'  
test_label_col = 'y_test' 

In [29]:
def tokenize_data(df, tokenizer, text_col, max_length=128):
    return tokenizer(
        df[text_col].tolist(),
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="tf"
    )


In [30]:
# Tokenize and encode the data
train_encodings = tokenize_data(train_df, tokenizer, train_text_col)
valid_encodings = tokenize_data(valid_df, tokenizer, valid_text_col)
test_encodings = tokenize_data(test_df, tokenizer, test_text_col)

# Prepare TensorFlow Datasets

In [31]:
# Prepare TensorFlow datasets
def create_tf_dataset(encodings, labels, batch_size=32):
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))
    dataset = dataset.shuffle(len(labels)).batch(batch_size)
    return dataset

train_dataset = create_tf_dataset(train_encodings, train_df[train_label_col].values)
valid_dataset = create_tf_dataset(valid_encodings, valid_df[valid_label_col].values)
test_dataset = create_tf_dataset(test_encodings, test_df[test_label_col].values)

# Modeling

## Compile the Model

In [32]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)


In [33]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    patience=5, 
    restore_best_weights=True )


## Train the Model

In [35]:
history = model.fit(train_dataset,validation_data=valid_dataset,epochs=5)

Epoch 1/5
Cause: for/else statement not yet supported


I0000 00:00:1716749693.727630     130 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.6464565396308899
Test Accuracy: 0.8517929315567017


In [48]:
# Get predictions for the test dataset
test_pred_logits = model.predict(test_dataset).logits
test_preds = np.argmax(test_pred_logits, axis=1)



In [49]:
# Extract true labels from the test dataset
test_labels = []
for batch in test_dataset:
    test_labels.extend(batch[1].numpy())


In [50]:
# Generate the classification report
target_names = ["Class 0", "Class 1", "Class 2", "Class 3", "Class 4"] 
report = classification_report(test_labels, test_preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     Class 0       0.39      0.41      0.40     11468
     Class 1       0.19      0.19      0.19      5636
     Class 2       0.24      0.22      0.23      7266
     Class 3       0.08      0.08      0.08      2330
     Class 4       0.09      0.09      0.09      2833

    accuracy                           0.27     29533
   macro avg       0.20      0.20      0.20     29533
weighted avg       0.26      0.27      0.26     29533



In [54]:
import joblib
joblib.dump(model, 'ARCbert_model.pkl')





['ARCbert_model.pkl']