In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np 
import pandas as pd 
import os

In [None]:
def read_csv(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame.

    The file is parsed with ``pandas.read_csv`` using its default options.
    """
    frame = pd.read_csv(file_path)
    return frame


def read_txt(file_path):
    """Return the distinct lines of a UTF-8 text file as a set.

    Each element is a line exactly as ``readlines()`` yields it, i.e. the
    trailing newline (when present) is kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        set[str]: the unique lines of the file.
    """
    # The context manager closes the handle deterministically; the previous
    # bare ``open(...)`` left it open until garbage collection.
    with open(file_path, encoding='utf-8') as handle:
        return set(handle.readlines())


def read_arabic_csv(file_path):
    """Load a CSV into a DataFrame, treating only ``'\\n'`` as the row terminator.

    Identical to ``read_csv`` except that ``lineterminator='\\n'`` is passed to
    ``pandas.read_csv``.
    NOTE(review): presumably this keeps other line-break characters inside the
    tweet text from splitting a record - confirm against the data.
    """
    parse_options = {"lineterminator": '\n'}
    return pd.read_csv(file_path, **parse_options)


In [None]:
import pandas as pd
import requests
from requests.packages import urllib3
import json
from os.path import join
# Silence urllib3's InsecureRequestWarning: get_tweets_api posts with
# verify=False, which would otherwise emit this warning on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


# Endpoint that get_tweets_api posts the list of tweet ids to.
url = 'https://recruitment.aimtechnologies.co/ai-tasks'


def get_tweets_api(ids_list, timeout=60):
    """POST a list of tweet ids to ``url`` and return the decoded JSON reply.

    Args:
        ids_list: JSON-serialisable list of tweet ids (the caller in this
            notebook passes them as strings).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status (previously this surfaced later as a JSON decoding error
            or as silently wrong data).
        requests.exceptions.Timeout: no response within ``timeout`` seconds.
    """
    # SECURITY NOTE(review): verify=False disables TLS certificate checking, so
    # the server's identity is not authenticated. Kept as-is because changing
    # it may break access to this endpoint - confirm whether it is still needed.
    response = requests.post(url,
                             headers={'Content-Type': 'application/json'},
                             data=json.dumps(ids_list),
                             timeout=timeout,
                             verify=False)
    response.raise_for_status()
    return response.json()


def get_dataset_df(df, save_directory_path="Dataset", fetch_fn=None, batch_size=1000):
    """Fetch the text of every tweet listed in ``df`` and save it as a CSV.

    The rows of ``df`` are processed in consecutive batches; each batch of ids
    is passed to ``fetch_fn`` and the returned texts are stored next to the id
    and dialect of the row they belong to. The result is written to
    ``<save_directory_path>/csv_text_dataset.csv`` with the columns
    ``Id``, ``Dialect`` and ``Text``.

    Args:
        df: DataFrame with an ``id`` and a ``dialect`` column.
        save_directory_path: Directory for the output CSV. It is created when
            missing; an empty string means the current working directory.
        fetch_fn: Callable taking a list of id strings and returning a mapping
            whose values are the tweet texts. Defaults to ``get_tweets_api``.
        batch_size: Maximum number of ids sent per ``fetch_fn`` call.

    Raises:
        ValueError: a response is not keyed by the requested ids and does not
            contain exactly one text per requested id, so texts cannot be
            matched to rows.
    """
    if fetch_fn is None:
        fetch_fn = get_tweets_api

    id_list = []
    dialect_list = []
    text_list = []
    # Positional, half-open slices: every row is visited exactly once whatever
    # the index of ``df`` looks like. The previous loop used inclusive label
    # slices and restarted each batch at the last row of the previous one,
    # which duplicated one row per batch and never reached the final rows.
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        ids_list = list(map(str, batch["id"].values))
        json_dataset = fetch_fn(ids_list)

        if all(tweet_id in json_dataset for tweet_id in ids_list):
            # Response is keyed by the ids we sent: align texts by id so the
            # result does not depend on the order of the response.
            texts = [json_dataset[tweet_id] for tweet_id in ids_list]
        else:
            # Fall back to the response order, but only when the counts agree;
            # otherwise texts would be paired with the wrong rows.
            texts = list(json_dataset.values())
            if len(texts) != len(ids_list):
                raise ValueError(
                    "Expected %d texts for the batch starting at row %d, got %d"
                    % (len(ids_list), start, len(texts))
                )

        id_list.extend(ids_list)
        dialect_list.extend(list(batch["dialect"].values))
        text_list.extend(texts)

    res_df = pd.DataFrame(list(zip(id_list, dialect_list, text_list)),
                          columns=['Id', 'Dialect', "Text"])
    # Create the target directory up front; to_csv does not create it.
    if save_directory_path:
        os.makedirs(save_directory_path, exist_ok=True)
    csv_file = join(save_directory_path, "csv_text_dataset.csv")
    res_df.to_csv(csv_file, index=False, encoding="utf8")


In [None]:
# Read the id/dialect table from the mounted Drive and fetch the text of each tweet.
csv_file_path = "/content/drive/MyDrive/Dataset/AIMTask/dialect_dataset.csv"
df = read_csv(csv_file_path)
# Empty directory: csv_text_dataset.csv is written to the current working directory.
save_directory_path = ""
get_dataset_df(df, save_directory_path)

In [None]:
# Reload the CSV written by get_dataset_df and display it.
# NOTE: this rebinds both csv_file_path and df; a later cell reads csv_file_path again.
csv_file_path = "csv_text_dataset.csv"
df = read_arabic_csv(csv_file_path)
df

Unnamed: 0,Id,Dialect,Text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺
...,...,...,...
458192,1077946712197599232,BH,يقولون ان حميديتش حاطه في قائمة الانتقالات \nو...
458193,1080096147207843840,BH,@FO_1988 المنتخب الالماني راد للوراء مو بس في ...
458194,1080360549035241600,BH,اتمنى ما يصير مثل صفقة العقرب السام رافينها . ...
458195,1080414477210796032,BH,حميديتش داخل سوق الخضرة \nمو سوق الانتقالات


In [None]:
import torch

# Choose the torch device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # Report the name of CUDA device index 0.
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    # IPython shell escape: run nvidia-smi and show its output in the cell.
    !nvidia-smi
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80
Fri Mar 11 14:26:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------

In [None]:
!pip install optuna
!pip install transformers
!pip install tokenizers
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

fatal: destination path 'arabert' already exists and is not an empty directory.


In [None]:
!mkdir data
!mkdir train

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘train’: File exists


In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
# Registry of Dataset objects; a later cell selects one entry by its .name.
all_datasets= []

In [None]:
class Dataset:
    """Plain container for one named dataset: its train/test splits and label list.

    NOTE: a later cell executes ``from torch.utils.data import Dataset``, which
    rebinds the name ``Dataset`` at notebook scope. Instances of this container
    must therefore be created before that import runs.
    """

    def __init__(self, name, train, test, label_list):
        # Each argument is stored unchanged under the attribute of the same name.
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list

In [None]:
# Generic column names the tweet text and dialect columns are renamed to before training.
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [None]:
# Reload the fetched tweets, keeping only text and dialect under the generic column names.
df = read_arabic_csv(csv_file_path)[["Text", "Dialect"]].rename(
    columns={"Text": DATA_COLUMN, "Dialect": LABEL_COLUMN}
)
print(df[LABEL_COLUMN].value_counts())

# Stratified split with a fixed seed: 6% of the rows are held out for evaluation.
train_aim, test_aim = train_test_split(
    df,
    test_size=0.06,
    random_state=42,
    stratify=df[LABEL_COLUMN],
)
label_list_aim = np.unique(df[LABEL_COLUMN])

# ``Dataset`` is the container class defined in an earlier cell.
# NOTE: every execution of this cell appends another entry to all_datasets.
data_aim = Dataset("AIM", train_aim, test_aim, label_list_aim)
all_datasets.append(data_aim)

EG    57694
PL    43785
KW    42151
LY    36536
QA    31100
JO    27949
LB    27645
SA    26859
AE    26322
BH    25860
OM    19135
SY    16259
DZ    16199
IQ    15512
SD    14449
MA    11550
YE     9937
TN     9255
Name: label, dtype: int64


In [None]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [None]:
# Configure root logging at WARNING level and create this notebook's logger.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [None]:
# Run configuration: which registered dataset to train on and which checkpoint to fine-tune.
dataset_name = 'AIM'
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
# Longest tweet measured in whitespace-separated words.
# NOTE(review): this value is later passed to the tokenizer as max_length, which
# counts tokenizer ids (including special tokens), not words - confirm that
# truncating at this length is intended.
max_len = max(len(tweet.split()) for tweet in df[DATA_COLUMN].values)
max_len

94

In [None]:
# Select the registered dataset whose name equals dataset_name.
for d in all_datasets:
    if d.name==dataset_name:
        dataset = d
        print('Dataset found')
        break
else:
    # No match: stop here instead of continuing with ``dataset`` undefined
    # (or still bound to a value left over from an earlier run of the kernel).
    raise ValueError("No dataset named %r in all_datasets" % (dataset_name,))

Dataset found


In [None]:
# Build the AraBERT preprocessor from the checkpoint name without its hub prefix
# (the part of model_name after the last "/").
arabert_prep = ArabertPreprocessor(model_name.rsplit("/", 1)[-1])

# Preprocess the text column of both splits, overwriting it in place.
# NOTE: re-running this cell preprocesses the already preprocessed text again.
dataset.train[DATA_COLUMN] = dataset.train[DATA_COLUMN].apply(arabert_prep.preprocess)
dataset.test[DATA_COLUMN] = dataset.test[DATA_COLUMN].apply(arabert_prep.preprocess)

In [None]:
# Display the preprocessed text column of the test split.
dataset.test[DATA_COLUMN]

347665           # الاتحاد _ النصر الاتي عاد على اخر الموسم
19609                  [مستخدم] كسد معاش فيه سكايب الشيباني
358378    وبعدين صج مانعرف نمسك قلم ولا مضحك للكاميرات ا...
422922    [مستخدم] [مستخدم] عندك في جميرا مقهى ( ريم الب...
446547    [مستخدم] [مستخدم] الله يعطيكم العافيه بس عندي ...
                                ...                        
165570    يقول في ذمتك ماجيت في بالك وأقول في ذمتي مارحت...
109340    ما حرق دمي وقلبي غير أم # الشهيد وهيا بتحكي ما...
292397    [مستخدم] هاهاهاهاهاها انا قولت بردو كدا يبقي ع...
327928    [مستخدم] المشكلة مش بالطفلة بأهلها يللي لازم ي...
319862    [مستخدم] يا حياتي انتي مم شو اعمل عم أتحمل كل ...
Name: text, Length: 27492, dtype: object

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes one text per access and returns ``InputFeatures``.

    ``Dataset`` here is the name in scope when this cell runs, i.e. the class
    imported from ``torch.utils.data`` in the imports cell above.

    Args:
        text: Indexable collection of texts.
        target: Indexable collection of labels, parallel to ``text``.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_len: Length every ``input_ids`` / ``attention_mask`` list is
            truncated or padded to.
        label_map: Mapping from the labels found in ``target`` to integer ids.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Fixed: ``super(BERTDataset).__init__()`` called ``__init__`` on the
        # super proxy object itself and never reached the base class.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the padded features and integer label of example ``item``."""
        # Collapse every run of whitespace into a single space.
        text = str(self.text[item])
        text = " ".join(text.split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first'
        )
        # Real tokens get mask 1; the padding appended below gets mask 0.
        attention_mask = [1] * len(input_ids)
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [None]:
# Map every label to its position in dataset.label_list; that position is the class id.
label_map = {label: class_id for class_id, label in enumerate(dataset.label_list)}
print(label_map)
# Wrap each split's text and label columns (as plain lists) in a BERTDataset.
train_dataset = BERTDataset(
    dataset.train[DATA_COLUMN].to_list(),
    dataset.train[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)
test_dataset = BERTDataset(
    dataset.test[DATA_COLUMN].to_list(),
    dataset.test[LABEL_COLUMN].to_list(),
    model_name,
    max_len,
    label_map,
)

{'AE': 0, 'BH': 1, 'DZ': 2, 'EG': 3, 'IQ': 4, 'JO': 5, 'KW': 6, 'LB': 7, 'LY': 8, 'MA': 9, 'OM': 10, 'PL': 11, 'QA': 12, 'SA': 13, 'SD': 14, 'SY': 15, 'TN': 16, 'YE': 17}


Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def model_init():
    """Load ``model_name`` as a sequence-classification model.

    The number of output labels is the number of entries in ``label_map``.
    """
    n_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
        num_labels=n_classes,
    )

In [None]:
def compute_metrics(p):
    """Print a classification report and confusion matrix, then return summary metrics.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys ``macro_f1``, ``macro_f1_pos_neg``,
        ``macro_precision``, ``macro_recall`` and ``accuracy``.
    """
    y_true = p.label_ids
    # Predicted class = index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)
    assert len(y_pred) == len(y_true)

    for report in (classification_report, confusion_matrix):
        print(report(y_true, y_pred))

    # Macro F1 restricted to class ids 0 and 1 only.
    # NOTE(review): the "pos_neg" name suggests a two-class task; with the label
    # map used in this notebook ids 0 and 1 are just two of the classes -
    # confirm whether this metric is still wanted.
    f1_first_two = f1_score(y_true, y_pred, average='macro', labels=[0, 1])
    return {
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'macro_f1_pos_neg': f1_first_two,
        'macro_precision': precision_score(y_true, y_pred, average='macro'),
        'macro_recall': recall_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [None]:
# Start from the library defaults with ./train as output directory, then
# override individual fields.
# NOTE(review): the overrides are applied after construction, so any checks
# TrainingArguments performs while it is being constructed do not see these
# values - confirm against the installed transformers version.
training_args = TrainingArguments("./train")
# NOTE(review): presumably a legacy flag; evaluation_strategy is also set
# below - confirm the installed transformers version still reads it.
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


# Optimizer updates per epoch = batches per epoch // accumulation steps, where
# the last partial batch counts as a batch (ceiling division). The previous
# floor division by (batch size * accumulation steps) dropped that batch and
# gave 107672 total steps, while trainer.train() reports 107680 for this run.
# NOTE(review): assumes a single device; with several GPUs the effective batch
# size per step is larger - TODO confirm if that setup is ever used.
batches_per_epoch = (
    (len(dataset.train) + training_args.per_device_train_batch_size - 1)
    // training_args.per_device_train_batch_size
)
steps_per_epoch = batches_per_epoch // training_args.gradient_accumulation_steps
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
warmup_ratio = 0.1
# warmup_steps is a step count, so store a whole number rather than a float.
training_args.warmup_steps = int(total_steps * warmup_ratio)
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# Larger than one epoch's worth of steps, so checkpoints are written rarely.
training_args.save_steps = 100000
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

13459
107672


In [None]:
# Set WANDB_DISABLED before the Trainer is created.
# NOTE(review): presumably this turns off Weights & Biases logging - confirm.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Build the Trainer: a freshly loaded model from model_init(), the arguments
# configured above, the train split for training and the test split for evaluation.
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 430705
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 107680


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [None]:
# trainer.save_model("arabbert.h5")