In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import load_metric
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, TextClassificationPipeline
from transformers.data.processors.utils import InputFeatures

In [2]:
# Load the preprocessed tweets (tab-separated, UTF-8) into a DataFrame.
df = pd.read_csv(
    '../PreprocessedData/preprocessed_data.csv',
    sep='\t',
    encoding='utf-8',
)

In [3]:
# Peek at a handful of random rows. A fixed random_state keeps the displayed
# sample identical across Restart & Run All (same seed value as the split and
# the Trainer use further down).
df.sample(5, random_state=42)

Unnamed: 0,id,text,dialect,preprocessed_text
28545,988849183967924224,جميل ان يكون عندك صديق يعرف نوع الهدية اللي تع...,LY,جميل ان يكون عندك صديق يعرف نوع الهدية اللي تع...
262672,1006911293805203584,@Bassant214 ربنا يقدرك كل مرة تكملي ومتكونش دي...,EG,[مستخدم] ربنا يقدرك كل مرة تكملي ومتكونش دي اخ...
406034,1090929201522376832,@AliAbuahmed49 @AJ85555 المعلق رخيص .. طاح في ...,AE,[مستخدم] [مستخدم] المعلق رخيص . . طاح في دولة ...
174982,1018901008792150016,عمر ذيل الكلب ما #بنعدل\nولا في صاحبه،بنت عم، ...,JO,عمر ذيل الكلب ما # بنعدل ولا في صاحبه ، بنت عم...
431575,839604070251921408,@AbduMohd_21 ههه مبروكين انا كنت اشوف السيتي,AE,[مستخدم] هه مبروكين انا كنت اشوف السيتي


In [4]:
# Discard rows that have no preprocessed text; they cannot be tokenized.
df = df.dropna(subset=['preprocessed_text'])

In [5]:
# Checkpoint identifier shared by the tokenizer here and the classifier that
# is loaded from the same name later in the notebook.
model_name = "aubmindlab/bert-base-arabertv02-twitter"
# Tokenizer matching that checkpoint; used both to build the dataset and in
# the inference pipeline.
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
class bert_dataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized texts with one-hot class labels.

    Every text is encoded once, at construction time, with
    ``tokenizer.encode``. The class labels are one-hot encoded with
    ``pandas.get_dummies`` (one column per distinct class, in the column
    order ``get_dummies`` produces). On access each item is padded to
    ``max_len`` -- the longest encoded sequence in the dataset -- so all
    items have the same length.

    Args:
        text_list: Iterable of strings to tokenize.
        classes_list: Class label of each text, aligned with ``text_list``.
        tokenizer: Object exposing ``encode(text)`` (returning a list of
            token ids) and a ``pad_token_id`` attribute.

    Attributes:
        features: List of token-id lists, one per text (unpadded).
        labels: DataFrame of one-hot labels, one row per text.
        max_len: Length of the longest entry in ``features`` (0 when empty).
    """

    def __init__(self, text_list, classes_list, tokenizer):
        self.features = []
        self.labels = []
        self.text_list = text_list
        self.classes_list = classes_list
        self.tokenizer = tokenizer
        self.max_len = 0
        self.create_dataset()

    def __getitem__(self, index):
        """Return example ``index`` as an ``InputFeatures``.

        The token ids are padded on the RIGHT up to ``max_len`` and the
        attention mask is 1 for real tokens and 0 for padding. Right padding
        keeps the first encoded token at position 0, which is what a BERT
        classification head reads and what absolute position embeddings
        expect; it also matches how the tokenizer pads at inference time.

        The label is returned as a plain list of floats (the one-hot row),
        rather than a pandas Series, so it converts to a tensor directly.
        """
        token_ids = self.features[index]
        pad_count = self.max_len - len(token_ids)

        # List concatenation builds new lists, so the cached ids in
        # self.features are never mutated.
        input_ids = token_ids + [self.tokenizer.pad_token_id] * pad_count
        attention_mask = [1] * len(token_ids) + [0] * pad_count

        label = [float(flag) for flag in self.labels.iloc[index]]

        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.features)

    def create_dataset(self):
        """Tokenize every text, record the longest length, one-hot the labels.

        NOTE(review): texts are encoded without truncation, so a text longer
        than the model's maximum sequence length would only fail later, in
        the model's forward pass -- confirm the inputs are short enough.
        """
        encoded = [self.tokenizer.encode(text) for text in self.text_list]

        self.features = encoded
        # default=0 keeps an empty dataset valid instead of raising.
        self.max_len = max((len(ids) for ids in encoded), default=0)
        self.labels = pd.get_dummies(self.classes_list)

In [7]:
# Tokenize the whole corpus once and pair every text with its dialect label.
dataset = bert_dataset(
    text_list=df['preprocessed_text'],
    classes_list=df['dialect'],
    tokenizer=arabert_tokenizer,
)

# 90% of the examples go to training; whatever remains is held out.
train_len = int(0.9 * len(dataset))
test_len = len(dataset) - train_len

# A seeded generator makes the random split repeatable.
train_set, test_set = torch.utils.data.random_split(
    dataset,
    lengths=[train_len, test_len],
    generator=torch.Generator().manual_seed(42),
)

In [8]:
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score predictions with the loaded accuracy metric.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class is the
            arg-max over the last axis of ``logits``; the reference class is
            the arg-max over axis 1 of ``labels`` (i.e. ``labels`` is expected
            to be 2-D, one row per example).

    Returns:
        Whatever ``metric.compute`` returns for those predictions/references.
    """
    raw_scores, target_rows = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=np.argmax(target_rows, axis=1),
    )

In [9]:
# Classifier initialised from the pretrained checkpoint, with an 18-way output
# (one per dialect code listed in the inference cell's label mapping).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=18)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Three epochs, batches of 4; evaluate and checkpoint at the end of each epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmi

In [None]:
# Run fine-tuning with the arguments configured above (3 epochs, evaluation
# and a checkpoint at the end of every epoch). This is the long-running cell.
trainer.train()

In [None]:
# Alternative kept for reference (not executed): save through the Trainer and
# restore with from_pretrained instead of handling a raw state dict.
# trainer.save_model('../models/transformer_model')
# model = BertForSequenceClassification.from_pretrained("../models/transformer_model")

# Persist only the model's parameters (state dict); reloading them requires
# first constructing a model of the same architecture.
torch.save(model.state_dict(), '../models/transformer_wts.pth')

In [13]:
# Restore the fine-tuned parameters into the already-constructed model.
# map_location='cpu' lets a checkpoint that was written from a GPU-resident
# model be read on a machine without CUDA; load_state_dict then copies the
# values into the model's parameters on whatever device the model lives on.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('../models/transformer_wts.pth', map_location='cpu'))

<All keys matched successfully>

In [22]:
# Inference is done on the CPU.
model.to('cpu')
pipe = TextClassificationPipeline(tokenizer=arabert_tokenizer, model=model)

# The pipeline reports generic names of the form LABEL_<i>; translate them to
# dialect (country) codes. Position i in the list is the code for LABEL_<i>.
# NOTE(review): presumably this is the column order pd.get_dummies produced
# for the training labels -- verify against dataset.labels.columns.
label_dict = {
    f'LABEL_{position}': dialect_code
    for position, dialect_code in enumerate([
        'AE', 'BH', 'DZ', 'EG', 'IQ', 'JO', 'KW', 'LB', 'LY',
        'MA', 'OM', 'PL', 'QA', 'SA', 'SD', 'SY', 'TN', 'YE',
    ])
}

s = 'يخرب بيت عيونك يا صوفيا شو حلوين'

# Print the dialect code of each prediction the pipeline returns for `s`.
for prediction in pipe(s):
    print(label_dict[prediction['label']])

SY
