In [1]:
import os

from functools import partial
from typing import Callable

import evaluate

import numpy as np
import pandas as pd

from arabert.preprocess import ArabertPreprocessor
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from datasets import Dataset, ClassLabel
from evaluate import evaluator, combine

# Setup Data

In [2]:
# Tab-separated tweet corpus, fetched over HTTP.
dataset_path = "https://nlp-slides.vercel.app/clean-tweets.tsv"

dataset = pd.read_csv(dataset_path, sep="\t")

In [3]:
# Preview the first two rows of the loaded frame.
dataset.head(n=2)

Unnamed: 0,Tweet,Country,Topic,Sentiment,Sentiment_Expression,Sentiment_Target,word_count,char_count,clean_text,clean_stemmed
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",lebanon,personal,negative,implicit,بريق العيون,23,132,اومن بان الانسان ينطفي جماله ابتعاد يحب بريق ا...,اوم بان انس نطف جمل بعد يحب برق عين خفي صبح ذب...
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,jordan,sports,positive,explicit,افضل لاعب في العالم,23,141,الذاكره عندما اعتقد كريستيانو انه افضل لاعب ال...,ذكر عند عقد كريستيانو انه فضل لعب علم ككا يسي ...


In [4]:
# Keep only the tweet text and its sentiment annotation.
data = dataset.loc[:, ["Tweet", "Sentiment"]]
data.head(3)

Unnamed: 0,Tweet,Sentiment
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",negative
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,positive
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,neutral


In [5]:
# Use the column names `text` / `label` for the rest of the notebook.
data = data.rename(columns={"Tweet": "text", "Sentiment": "label"})

In [6]:
# Encode the sentiment strings as integer class ids.
label_to_id = {"negative": 0, "neutral": 1, "positive": 2}
data["label"] = data["label"].replace(label_to_id)

In [7]:
# Display the frame with the renamed columns and integer labels.
data

Unnamed: 0,text,label
0,"""أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد م...",0
1,من الذاكره... @3FInQe . عندما اعتقد كريستيانو ...,2
2,لا نخلو من ضغوطات الحياة. فنحن نعيش على أرض أع...,1
3,#مصطلحات_لبنانيه_حيرت_البشريه بتوصل عالبيت ، ب...,0
4,نصمت !! لتسير حياتنا على مً يرام فالناّس لم تع...,0
...,...,...
3995,صلاح من لاعب في المقاولون العرب يحلم ان يلعب ل...,2
3996,الملك سلمان بن عبد العزيز: تطبيق الأنظمة بحزم ...,2
3997,@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج ...,0
3998,شو هالشعب نحنا اللي عايش بلا مي وكهربا والزبال...,0


In [8]:
# Number of tweets per class id (0 = negative, 1 = neutral, 2 = positive).
data["label"].value_counts()

0    1883
2    1232
1     885
Name: label, dtype: int64

# Setup Model

In [9]:
# Checkpoint identifier shared by the preprocessor, tokenizer and model below.
model_name = "aubmindlab/bert-large-arabertv02-twitter"

# Dataset

## Load Dataset

In [10]:
# Wrap the pandas frame in a `datasets.Dataset`.
dataset = Dataset.from_pandas(data)

## Labelling

In [11]:
# Names are listed in the same order as the integer ids assigned earlier (0, 1, 2).
classlabel = ClassLabel(
    names=["negative", "neutral", "positive"],
    num_classes=3,
)

In [12]:
# Attach the class-name metadata to the integer `label` column.
dataset = dataset.cast_column("label", classlabel)

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
# Inspect the feature schema after the cast.
dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

## Preprocess Data for Model

In [14]:
# Build the text preprocessor for the same checkpoint name used for the model.
arabert_prep = ArabertPreprocessor(model_name)

In [15]:
# Pick one raw tweet (it contains a mention, an emoji and a URL) to demo the preprocessor.
sample = dataset[3997]["text"]
sample

'@ZahraaIraq9 😂 كل ما ادخل حسابي الكه تغريداتج عن حب العراق وانتي هسه اذا ينطوج جنسيه مال غير دوله و يجيج واتب كل را… https://t.co/MqZmtiTcil'

In [16]:
# Run the preprocessor on the single sample to see what it rewrites.
arabert_prep.preprocess(sample)

'[مستخدم] 😂 كل ما ادخل حسابي الكه تغريداتج عن حب العراق وانتي هسه اذا ينطوج جنسيه مال غير دوله و يجيج واتب كل را … [رابط]'

In [17]:
def _preprocess_text(text):
    """Return the preprocessed tweet under the ``text`` key, replacing the raw text."""
    return {"text": arabert_prep.preprocess(text)}


dataset = dataset.map(function=_preprocess_text, input_columns="text")

  0%|          | 0/4000 [00:00<?, ?ex/s]

In [18]:
# Spot-check the first row after preprocessing.
dataset[0]["text"]

'" أنا أؤمن بأن الانسان ينطفئ جماله عند ابتعاد من يحب ، حتى بريق العيون يختفي فيصبح ذابلا منطفئا ، يتحول ربيعه الى خريف . " . [مستخدم]'

## Train Test Split

In [19]:
# Hold out 20% of the rows for evaluation, stratified on `label`.
# The seed is fixed so the split (and every metric computed on it) is the
# same on each Restart & Run All; the value 42 itself is arbitrary.
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)

In [20]:
# Show the resulting splits and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 800
    })
})

## Tokenizer

In [21]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
# Inspect the loaded tokenizer's configuration.
tokenizer

PreTrainedTokenizerFast(name_or_path='aubmindlab/bert-large-arabertv02-twitter', vocab_size=64000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [23]:
# Tokenize in batches so the tokenizer is called once per batch of tweets
# rather than once per tweet; sequences longer than 64 tokens are truncated.
# No padding happens here.
dataset = dataset.map(
    function=lambda texts: tokenizer(texts, truncation=True, max_length=64),
    input_columns="text",
    batched=True,
)

  0%|          | 0/3200 [00:00<?, ?ex/s]

  0%|          | 0/800 [00:00<?, ?ex/s]

In [24]:
# Check which columns each split has after tokenization.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
})

## Model

In [25]:
# Load the checkpoint as a sequence classifier configured for three labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    num_labels=3,
)

Some weights of the model checkpoint at aubmindlab/bert-large-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubm

In [26]:
# Padding is applied per batch by the collator, not at tokenization time.
# NOTE(review): with the collator's default padding mode, `max_length` may
# have no effect — confirm against the DataCollatorWithPadding docs.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    max_length=64,
)

# Metrics

In [27]:
# Load the F1 metric through the `evaluate` library.
f1 = evaluate.load(path="f1")

In [28]:
def compute_metrics(
    eval_pred: "tuple[np.ndarray, np.ndarray]",
    metric: "evaluate.Metric",
    average: str = "weighted",
) -> dict:
    """Turn raw model outputs into a metric score.

    Args:
        eval_pred: Anything that unpacks into ``(logits, labels)``. The class
            scores are expected on the last axis of ``logits``.
        metric: Object exposing
            ``compute(predictions=..., references=..., average=...)``.
        average: Averaging mode forwarded to ``metric.compute``. Defaults to
            ``"weighted"``, the value that was previously hard-coded.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average=average)

# Bind the F1 metric so the resulting callable can be invoked with just
# `eval_pred`; this is the object handed to Trainer(compute_metrics=...).
compute_metrics_fn = partial(compute_metrics, metric=f1)

# Training

## Training Args

In [29]:
# Hyperparameters and schedule for fine-tuning.
training_args = TrainingArguments(
    # Checkpoint directory; overwriting an existing one is allowed.
    output_dir=os.path.join(os.curdir, "data"),
    overwrite_output_dir=True,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log, evaluate and checkpoint on the same 50-step cadence.
    logging_strategy="steps",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=1,
    # Track eval_f1 (higher is better) and reload the best checkpoint at the end.
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    load_best_model_at_end=True,
)

## Trainer

In [30]:
# Wire model, data and metric together. The "test" split is what the trainer
# evaluates on during training.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics_fn,
    args=training_args,
)

## Train Model

In [31]:
# Start fine-tuning; logging, evaluation and checkpointing follow the
# schedule configured in `training_args`.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3200
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4000
  Number of trainable parameters = 369426435
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
50,0.933,0.805611,0.542343
100,0.708,0.631505,0.746698
150,0.7011,0.612677,0.765455
200,0.5522,0.547639,0.78618
250,0.5902,0.499119,0.797481
300,0.6143,0.521964,0.796336
350,0.6067,0.585292,0.753902


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 8
Saving model checkpoint to ./data/checkpoint-50
Configuration saved in ./data/checkpoint-50/config.json
Model weights saved in ./data/checkpoint-50/pytorch_model.bin
tokenizer config file saved in ./data/checkpoint-50/tokenizer_config.json
Special tokens file saved in ./data/checkpoint-50/special_tokens_map.json
Deleting older checkpoint [data/checkpoint-200] due to args.save_total_limit
Deleting older checkpoint [data/checkpoint-800] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassific

## Evaluate Model