# This notebook was run on Kaggle GPUs

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dialect/complete_data.csv


In [2]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn.preprocessing import FunctionTransformer
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
import nltk
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [4]:
# Load the dataset, then drop exact duplicate rows and rows containing any missing value.
data = (
    pd.read_csv('../input/dialect/complete_data.csv', encoding="utf-8-sig")
    .drop_duplicates()
    .dropna()
)

# Keep only rows whose `text` value is a real string.
# .copy() gives data_cleaned its own storage, so the later assignment to its
# `text` column does not write into a filtered view of `data`.
data_cleaned = data[data.text.apply(lambda line: isinstance(line, str))].copy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
def clean_data_text(text_col):
    """Strip noise from a collection of text lines and return the cleaned lines.

    The steps run in this order, each printing a progress message:

    1. remove ``@mention`` and ``#hashtag`` tokens;
    2. remove runs of ASCII letters and runs of digits;
    3. remove every character in ``string.punctuation``;
    4. remove emojis known to the ``emoji`` package;
    5. remove NLTK's Arabic stop words (downloads the corpus if needed).

    Parameters
    ----------
    text_col : iterable of str
        The lines to clean (e.g. a pandas Series of strings).

    Returns
    -------
    list of str
        One cleaned string per input line, in the same order. Words are
        re-joined with single spaces by the stop-word step.
    """
    # removing hashtags and mentions (raw string: \w is a regex escape, not a str escape)
    mention_or_hashtag = re.compile(r'[@#]\w+')
    text_without_hashtags = [mention_or_hashtag.sub('', line) for line in text_col]
    print('Done removing hashtags')
    print('============================================')

    # removing english letters and digits.
    # [A-Za-z] instead of [A-z]: the latter also spans the ASCII symbols
    # that sit between 'Z' and 'a' ([, \, ], ^, _, `).
    latin_or_digits = re.compile(r'[A-Za-z]+|\d+')
    clear_text = [latin_or_digits.sub('', line) for line in text_without_hashtags]
    print('Done removing foreign letters')
    print('============================================')

    # remove punctuation (a translation table deletes the same characters in one pass)
    punctuation_table = str.maketrans('', '', string.punctuation)
    text_no_punct = [line.translate(punctuation_table) for line in clear_text]
    print('Done removing punctuation')
    print('============================================')

    # no emojis
    # emoji < 2.0 exposes UNICODE_EMOJI['en']; emoji >= 2.0 removed it in favour
    # of EMOJI_DATA. Both are keyed by the emoji string itself.
    legacy_table = getattr(emoji, 'UNICODE_EMOJI', None)
    emoji_table = legacy_table['en'] if legacy_table is not None else emoji.EMOJI_DATA
    # Longest first, so a multi-codepoint emoji is matched whole instead of
    # having a shorter emoji match its prefix and leave the remainder behind.
    emojis_longest_first = sorted(emoji_table, key=len, reverse=True)
    regex_set = re.compile('|'.join(re.escape(em) for em in emojis_longest_first))
    text_no_emojis = [regex_set.sub('', line) for line in text_no_punct]
    print('Done removing emojis')
    print('============================================')

    # removing stop words
    nltk.download('stopwords')
    # A set gives constant-time membership tests; the list form is scanned per word.
    stopwords_set = frozenset(stopwords.words('arabic'))
    stopwords_deleted = [
        " ".join(word for word in line.split() if word not in stopwords_set)
        for line in text_no_emojis
    ]
    print('Done removing stopping words')
    print('============================================')
    return stopwords_deleted

In [6]:
# Wrap the cleaning function in a scikit-learn transformer and run it over the text column.
cleaner_trans = FunctionTransformer(clean_data_text)
# assign() returns a new frame instead of writing into data_cleaned in place,
# which avoids setting a column on a frame derived from a filtered selection.
data_cleaned = data_cleaned.assign(
    text=cleaner_trans.fit_transform(data_cleaned['text'])
)

Done removing hashtags
Done removing foreign letters
Done removing punctuation
Done removing emojis
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Done removing stopping words


In [7]:
# Learn the dialect -> integer id mapping, then encode every row's dialect with it.
Encoder = LabelEncoder()
Encoder.fit(data_cleaned.dialect)
labels = Encoder.transform(data_cleaned.dialect)

In [8]:
# Load the tokenizer published with the pretrained checkpoint used for fine-tuning.
checkpoint_name = "asafaya/bert-base-arabic"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/326k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

In [9]:
# Two-stage split: 30% held out as test, then 20% of the remainder as validation.
# random_state is fixed (same value as the training seed) so the splits are
# reproducible across kernel restarts.
# NOTE(review): test_texts / test_labels are not used by any later cell shown here.
SPLIT_SEED = 0
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data_cleaned.text.tolist(),
    labels.tolist(),
    test_size=0.3,
    random_state=SPLIT_SEED,
)
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [10]:
# Tokenize the training and validation texts with identical settings:
# truncate at 512 tokens and pad.
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(valid_texts, **tokenize_options)

In [11]:
# Load the pretrained encoder with a fresh classification head.
# The head size comes from the fitted label encoder rather than a hard-coded 18,
# so it always matches the number of distinct dialects in the data.
model = BertForSequenceClassification.from_pretrained(
    "asafaya/bert-base-arabic",
    num_labels=len(Encoder.classes_),
)

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-ar

In [12]:
# Hyper-parameters for fine-tuning: evaluate every 1000 steps, never write
# intermediate checkpoints, train for 3 epochs with batch size 8.
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,  # real boolean; the string 'True' only worked because it is truthy
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    save_strategy="no",
    # Turn off every logging integration (including Weights & Biases) here,
    # where the reporting targets are chosen, instead of relying on an
    # environment variable being set before this cell runs.
    report_to="none",
)

In [13]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name to a sequence indexable by example position.
        Must contain ``"input_ids"``, whose length defines the dataset size.
    labels : sequence, optional
        One label per example. When omitted (``None``) the returned items
        carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus ``"labels"`` if labels were given)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None check: truth-testing the container raises for
        # multi-element arrays/tensors, so only "no labels supplied" is tested.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.encodings["input_ids"])

In [14]:
# Pair each set of encodings with its labels for the Trainer.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
val_dataset = Dataset(encodings=val_encodings, labels=valid_labels)

In [15]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for a multi-class evaluation.

    Parameters
    ----------
    p : pair
        ``(predictions, labels)`` where ``predictions`` holds one row of class
        scores per example and ``labels`` holds the true class ids.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
        Precision, recall and F1 are macro-averaged over classes.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    # scikit-learn's default average='binary' raises for more than two classes,
    # so an explicit multi-class average is required. zero_division=0 scores a
    # class with no predicted/true samples as 0 instead of emitting a warning.
    averaging = {"average": "macro", "zero_division": 0}

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, **averaging)
    precision = precision_score(y_true=labels, y_pred=pred, **averaging)
    f1 = f1_score(y_true=labels, y_pred=pred, **averaging)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [16]:
# Bundle the model, training arguments and both datasets into a Trainer.
# NOTE(review): compute_metrics is defined in this notebook but is not passed
# here, so evaluation reports loss only — confirm whether that is intended.
_trainer_settings = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
}
trainer = Trainer(**_trainer_settings)

In [17]:
# Ask the Weights & Biases integration to stay off.
# NOTE(review): the training log below still reports W&B logging as enabled,
# so this likely runs too late (after the training arguments and Trainer were
# built) — confirm, and move it above those cells if so.
os.environ.update({"WANDB_DISABLED": "true"})

In [18]:
# Run fine-tuning. Long-running: the recorded run below covered 96,210
# optimization steps and reports a train_runtime of about 41,829 seconds.
trainer.train()

***** Running training *****
  Num examples = 256555
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 96210
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
1000,2.4242,2.376157
2000,2.2859,2.198798
3000,2.1913,2.185354
4000,2.1129,2.080735
5000,2.0506,2.11196
6000,2.0976,2.02603
7000,2.0097,1.953671
8000,2.0279,1.988394
9000,1.9883,1.953044
10000,1.994,1.915047


***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** Running Evaluation *****
  Num examples = 64139
  Batch size = 8
***** 

TrainOutput(global_step=96210, training_loss=1.5348860460437395, metrics={'train_runtime': 41829.0573, 'train_samples_per_second': 18.4, 'train_steps_per_second': 2.3, 'total_flos': 5.300758969942644e+16, 'train_loss': 1.5348860460437395, 'epoch': 3.0})

In [19]:
# Persist the fine-tuned weights and config.
# NOTE(review): despite the directory name, this is the model as it stands at
# the end of training; no best-checkpoint selection is configured (save_strategy="no").
trainer.save_model("./output/best_model")
# The Trainer was built without a tokenizer, so save it explicitly; the output
# directory can then be loaded on its own for inference.
tokenizer.save_pretrained("./output/best_model")

Saving model checkpoint to ./output/best_model
Configuration saved in ./output/best_model/config.json
Model weights saved in ./output/best_model/pytorch_model.bin
