In [3]:
! pip install transformers
# ! pip install scipy sklearn
# ! pip install farasapy
# ! pip install pyarabic
# ! git clone https://github.com/UBC-NLP/marbert
# ! git clone https://github.com/aub-mind/arabert
! pip install datasets




In [None]:
# ! pip install huggingface_hub
# ! apt install git-lfs
# ! git config --global user.email "moh.aboajabl@gmail.com"
# ! git config --global user.name "jabalov"

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [5]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TFAutoModelForSequenceClassification

import tensorflow as tf
from transformers import create_optimizer

from datasets import list_datasets, load_dataset, Dataset
from pprint import pprint
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [50]:
df_cl = pd.read_csv("../input/cleaned-df/cleaned_df.csv", engine="python")
df_cl["label"] = df_cl["dialect"]

df_cl.drop(df_cl[df_cl.text == '[]'].index, inplace=True, axis=0)
df_cl.dropna(inplace=True)

df_cl.label = LabelEncoder().fit_transform(df_cl.label)

df_train, df_test = train_test_split(df_cl, test_size=0.3, random_state=911, shuffle=True)

In [7]:
df_train[["label", "text"]].to_csv("df_train.csv", encoding="utf-8-sig", index=False)
# df_test[["label", "text"]].to_csv("df_test.csv", encoding="utf-8", index=False)

In [8]:
df = load_dataset('csv', script_version="master", data_files=["./df_train.csv"], delimiter=",", split="train")
df = df.train_test_split()



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-980b5f13cefe7c1d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-980b5f13cefe7c1d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


In [9]:
df

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 240552
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 80185
    })
})

In [10]:
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERT")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [11]:
tokenized_df = df.map(preprocess_function, batched=True)
tokenized_df = tokenized_df.remove_columns(["text"]) 

  0%|          | 0/241 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/81 [00:00<?, ?ba/s]

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [13]:
tf_train_dataset = tokenized_df["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_dataset = tokenized_df["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [14]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_df["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [15]:
model = TFAutoModelForSequenceClassification.from_pretrained("UBC-NLP/MARBERT", num_labels=18)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [model_checkpoint_callback]

In [15]:
model.compile(optimizer=optimizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f87e644f750>

In [26]:
# model.save_pretrained("./")
model.load_weights("./tf_model.h5")

In [None]:
# predict_input = tokenizer.encode(df_cl["text"][0],
#                                  truncation=True,
#                                  padding=True,
#                                  return_tensors="tf")

In [51]:
input_seq_test = [tokenizer.encode(lst, truncation=True, padding=True, return_tensors="tf") 
                      for lst in df_test["text"]]

In [52]:
len(input_seq_test)

137460

In [53]:
input_seq_test[0]

<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[    2,  1956,  4567, 12795,  9421,     3]], dtype=int32)>

In [60]:
tf_output = [np.argmax(model.predict(lst)[0], axis=1) for lst in input_seq_test]

In [61]:
prediction = [lst[0] for lst in tf_output]

In [62]:
dialects_dict = {
    3: "EG",
    11: "PL",
    6: "KW", 
    8: "LY",
    12: "QA",
    5: "JO",
    7: "LB",
    13: "SA",
    0: "AE",
    1: "BH",
    10: "OM",
    15: "SY",
    2: "DZ",
    4: "IQ",
    9: "MA",
    17: "YE",
    16: "TN"
}

In [63]:
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(df_test["label"], prediction))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.48      0.97      0.48      0.68      0.44      7970
          1       0.44      0.36      0.97      0.40      0.60      0.33      8002
          2       0.61      0.60      0.99      0.60      0.77      0.57      4855
          3       0.77      0.87      0.96      0.82      0.92      0.83     17287
          4       0.63      0.63      0.99      0.63      0.79      0.60      4654
          5       0.45      0.39      0.97      0.42      0.62      0.36      8390
          6       0.57      0.58      0.96      0.58      0.75      0.54     12620
          7       0.69      0.66      0.98      0.67      0.80      0.63      8247
          8       0.73      0.74      0.98      0.74      0.85      0.71     10903
          9       0.84      0.61      1.00      0.71      0.78      0.59      3452
         10       0.48      0.44      0.98      0.46      0.66      0.41      5738
   