In [1]:
import tensorflow as tf
print(len(tf.config.list_physical_devices('GPU')))
import torch
print(torch.cuda.is_available())

1
True


In [2]:
import numpy as np
import pandas as pd
import pyarabic.araby as ar

# import Stemmer
import re , emoji, functools, operator, string
import torch , optuna, gc, random, os

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

## Load Dataset

In [3]:
import pandas as pd
df = pd.read_csv("train.csv")
seed=0

## Arabic stop words

In [4]:
arabic_stop_words=[]
with open ('list.txt',encoding='utf-8') as f :
  for i in f.readlines() :
    arabic_stop_words.append(i)
    arabic_stop_words[-1]=arabic_stop_words[-1][:-1]


In [5]:
import numpy as np
import pandas as pd
import re

#============= Read CSV and apply data preperation =============#


def data_preprocessing (data_frame):
  # clean-up: remove #tags, http links and special symbols
  data_frame['tweet']= data_frame['tweet'].apply(lambda x: x[2:-2])
  data_frame['tweet']= data_frame['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'[@|#]\S*', '', x))
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'"+', '', x))

  # Remove arabic signs
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))

  # Remove repeated letters like "الللللللللللللللله" to "الله"
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))

  # remove stop words
  data_frame['tweet'] = data_frame['tweet'].apply(lambda x: '' if x in arabic_stop_words else x)

  from nltk.stem.isri import ISRIStemmer
  df['tweet']=df['tweet'].apply(lambda x:ISRIStemmer().stem(x))

  return data_frame


In [6]:
# st =  Stemmer.Stemmer('arabic')
import string,emoji
def data_cleaning (text):
  text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"https\S+", "", text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub("(\s\d+)","",text)
  text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
  text = re.sub("\d+", " ", text)
  text = ar.strip_tashkeel(text)
  text = ar.strip_tatweel(text)
  text = text.replace("#", " ");
  text = text.replace("@", " ");
  text = text.replace("_", " ");
  translator = str.maketrans('', '', string.punctuation)
  text = text.translate(translator)
  em = text
  em_split_emoji = emoji.get_emoji_regexp().split(em)
  em_split_whitespace = [substr.split() for substr in em_split_emoji]
  em_split = functools.reduce(operator.concat, em_split_whitespace)
  text = " ".join(em_split)
  text = re.sub(r'(.)\1+', r'\1', text)
  # text_stem = " ".join([st.stemWord(i) for i in text.split()])
  # text = text +" "+ text_stem
  text = text.replace("آ", "ا")
  text = text.replace("إ", "ا")
  text = text.replace("أ", "ا")
  text = text.replace("ؤ", "و")
  text = text.replace("ئ", "ي")

  return text

In [7]:
df=data_preprocessing(df)
df['tweet']=df['tweet'].apply(lambda x: data_cleaning(x))
df

  em_split_emoji = emoji.get_emoji_regexp().split(em)


Unnamed: 0,tweet,class
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,pos
1,كل سنة وانتم طيبين,pos
2,و انتهى مشوار الخواجة,neg
3,مش عارف ابتدى مذاكره منين,neg
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,neg
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,neu
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,neu
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,pos
2057,انت متناقض جدا يا صلاح,neg


## Label Encoder

In [8]:
from sklearn import preprocessing
# Apply label encoding over the labels
lable_encoder = preprocessing.LabelEncoder()
encoded_labels =lable_encoder.fit_transform(df["class"])
df['class']=encoded_labels
df

Unnamed: 0,tweet,class
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,2
1,كل سنة وانتم طيبين,2
2,و انتهى مشوار الخواجة,0
3,مش عارف ابتدى مذاكره منين,0
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2
2057,انت متناقض جدا يا صلاح,0


In [9]:
df['length']=df['tweet'].apply(lambda x:len(x.split(' ')))
df['length'].max()

34

## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation=train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=seed)
X_validation

1826    هتفضل مشاكلي بيني و بين نفسي لحد ما اموت و هيف...
1505             لو فشلنا ادينا حاولنا دا هيدي الامل لغير
1994    اذا جاء اجل اله لايقدم ولا يوخر الموت اقرب الي...
1349                      دايما بتاخد منا اعز واغلى ماعند
781                      كان هيلعن اليوم الي كان فيه مصري
                              ...                        
596                                           امراه رايعه
1887                        مارلين مونرو حلوة فشخ ليه كدا
942                       تحت البطانيه و جنب شاحن الموبيل
634                                            مساء العسل
19            ها ايه رايك اقوله ايه بقا انا بقول مودي حلو
Name: tweet, Length: 412, dtype: object

## TF_IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_ngram(n_gram,X_train,X_val):
    vectorizer = TfidfVectorizer(ngram_range=(n_gram,n_gram))
    x_train_vec = vectorizer.fit_transform(X_train)
    x_test_vec = vectorizer.transform(X_val)
    return x_train_vec,x_test_vec

In [12]:
# Applying tfidf with 1-gram, 2-gram and 3-gram
tfidf_1g_transformation_train,tfidf_1g_transformation_validation= tfidf_ngram(1,X_train,X_validation)
tfidf_2g_transformation_train,tfidf_2g_transformation_validation= tfidf_ngram(2,X_train,X_validation)

## Machine learning models

In [13]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

models=[SVC(),XGBClassifier(),RandomForestClassifier(),DecisionTreeClassifier(),LogisticRegression()]
for m in models :
    m.fit(tfidf_2g_transformation_train,y_train)
    print(m.score(tfidf_2g_transformation_train,y_train))
    print(m.score(tfidf_2g_transformation_validation,y_validation))

0.9878567091681846
0.41262135922330095
0.4596235579842137
0.4077669902912621
0.9878567091681846
0.4223300970873786
0.9878567091681846
0.41262135922330095
0.9878567091681846
0.44902912621359226


# Model and Tokenizer initialization

In [14]:
!pip install transformers



In [15]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

#============= Initialize Arabic Bert =============#
#load your pre_trained model with all its weights
# model_name= 'aubmindlab/bert-base-arabertv02'
model_name='UBC-NLP/MARBERT' #top
# model_name='asafaya/bert-base-arabic'
tokenizer =AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# model=AutoModel.from_pretrained(model_name,output_hidden_states=True)

Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at U

In [16]:
# Tokenize the sentences using bert tokenizer
df["bert_tokens"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["bert_tokens_ids"] = df.tweet.apply(lambda x: tokenizer(x).tokens())

In [17]:
df

Unnamed: 0,tweet,class,length,bert_tokens,bert_tokens_ids
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,2,25,"[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ...","[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ..."
1,كل سنة وانتم طيبين,2,4,"[[CLS], كل, سنة, وانتم, طيبين, [SEP]]","[[CLS], كل, سنة, وانتم, طيبين, [SEP]]"
2,و انتهى مشوار الخواجة,0,4,"[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]"
3,مش عارف ابتدى مذاكره منين,0,5,"[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]"
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0,20,"[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,..."
...,...,...,...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1,10,"[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #..."
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1,11,"[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل..."
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2,16,"[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #..."
2057,انت متناقض جدا يا صلاح,0,5,"[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]"


In [18]:
df["encoded"] = df.tweet.apply(lambda x: tokenizer.encode_plus(x,return_tensors='pt')['input_ids'])

In [19]:
df

Unnamed: 0,tweet,class,length,bert_tokens,bert_tokens_ids,encoded
0,ان الذين يعيشون على الارض ليسوا ملايكة بل بشر ...,2,25,"[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ...","[[CLS], ان, الذين, يعيشون, على, الارض, ليسوا, ...","[[tensor(2), tensor(1946), tensor(2468), tenso..."
1,كل سنة وانتم طيبين,2,4,"[[CLS], كل, سنة, وانتم, طيبين, [SEP]]","[[CLS], كل, سنة, وانتم, طيبين, [SEP]]","[[tensor(2), tensor(2009), tensor(3171), tenso..."
2,و انتهى مشوار الخواجة,0,4,"[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[tensor(2), tensor(144), tensor(7609), tensor..."
3,مش عارف ابتدى مذاكره منين,0,5,"[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[tensor(2), tensor(2093), tensor(3323), tenso..."
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0,20,"[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[tensor(2), tensor(22181), tensor(1958), tens..."
...,...,...,...,...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1,10,"[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[tensor(2), tensor(4770), tensor(68899), tens..."
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1,11,"[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[tensor(2), tensor(39939), tensor(3715), tens..."
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2,16,"[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[tensor(2), tensor(3735), tensor(4880), tenso..."
2057,انت متناقض جدا يا صلاح,0,5,"[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[tensor(2), tensor(2030), tensor(27008), tens..."


## Padding and attention mask

In [20]:
from keras_preprocessing.sequence import pad_sequences

# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway.
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df['bert_tokens']]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [21]:
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, encoded_labels,
                                                            random_state=seed, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=seed, test_size=0.1)
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 80

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

## Set optimizer parameters

In [22]:
import torch.optim as optim

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}]
# This variable contains all of the hyperparemeter information our training loop needs
# optimizer = optim.BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)
optimizer = optim.AdamW(optimizer_grouped_parameters,lr=5e-6)

# Training

In [37]:
from tqdm import tqdm, trange
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
t = []

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs
epochs = 20

# Transfer the model to GPU
model.to("cuda")

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass
    loss = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"), labels=b_labels.to("cuda"))["loss"]
    train_loss_set.append(loss.item())

    # Backward pass
    loss.backward()

    # Update parameters and take a step using the computed gradient
    optimizer.step()


    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    # batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"))

    # Move logits and labels to CPU
    logits = logits["logits"].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Train loss: 0.007775241509079933


Epoch:   5%|▌         | 1/20 [00:10<03:25, 10.81s/it]

Validation Accuracy: 0.7193840579710145
Train loss: 0.010197977254089588


Epoch:  10%|█         | 2/20 [00:21<03:13, 10.75s/it]

Validation Accuracy: 0.7152173913043477
Train loss: 0.006798163851878296


Epoch:  15%|█▌        | 3/20 [00:32<03:02, 10.75s/it]

Validation Accuracy: 0.7121376811594202
Train loss: 0.005740032240282744


Epoch:  20%|██        | 4/20 [00:43<02:52, 10.78s/it]

Validation Accuracy: 0.7266304347826087
Train loss: 0.006551246313999097


Epoch:  25%|██▌       | 5/20 [00:53<02:42, 10.80s/it]

Validation Accuracy: 0.7121376811594202
Train loss: 0.008503883931552991


Epoch:  30%|███       | 6/20 [01:04<02:29, 10.69s/it]

Validation Accuracy: 0.7121376811594202
Train loss: 0.004823497952505325


Epoch:  35%|███▌      | 7/20 [01:14<02:18, 10.62s/it]

Validation Accuracy: 0.7266304347826087
Train loss: 0.008328444562115086


Epoch:  40%|████      | 8/20 [01:25<02:06, 10.58s/it]

Validation Accuracy: 0.716304347826087
Train loss: 0.00453309793859565


Epoch:  45%|████▌     | 9/20 [01:35<01:56, 10.55s/it]

Validation Accuracy: 0.7235507246376812
Train loss: 0.0037235923179347688


Epoch:  50%|█████     | 10/20 [01:46<01:45, 10.53s/it]

Validation Accuracy: 0.7235507246376812
Train loss: 0.0027813007618533447


Epoch:  55%|█████▌    | 11/20 [01:56<01:34, 10.51s/it]

Validation Accuracy: 0.7307971014492755
Train loss: 0.00383834765428522


Epoch:  60%|██████    | 12/20 [02:07<01:24, 10.50s/it]

Validation Accuracy: 0.6954710144927536
Train loss: 0.008061746911456188


Epoch:  65%|██████▌   | 13/20 [02:17<01:13, 10.50s/it]

Validation Accuracy: 0.7327898550724639
Train loss: 0.008963529883961504


Epoch:  70%|███████   | 14/20 [02:28<01:02, 10.49s/it]

Validation Accuracy: 0.7121376811594202
Train loss: 0.006504351966820347


Epoch:  75%|███████▌  | 15/20 [02:38<00:52, 10.49s/it]

Validation Accuracy: 0.7016304347826087
Train loss: 0.0068676988982285065


Epoch:  80%|████████  | 16/20 [02:49<00:41, 10.48s/it]

Validation Accuracy: 0.6954710144927536
Train loss: 0.002551882231879669


Epoch:  85%|████████▌ | 17/20 [02:59<00:31, 10.49s/it]

Validation Accuracy: 0.6913043478260869
Train loss: 0.006357304368672582


Epoch:  90%|█████████ | 18/20 [03:10<00:20, 10.48s/it]

Validation Accuracy: 0.7027173913043478
Train loss: 0.003932676801923662


Epoch:  95%|█████████▌| 19/20 [03:20<00:10, 10.48s/it]

Validation Accuracy: 0.6996376811594204
Train loss: 0.0027347728464519605


Epoch: 100%|██████████| 20/20 [03:31<00:00, 10.56s/it]

Validation Accuracy: 0.7079710144927537





In [38]:
#============= Read CSV and apply data preperation =============#
df_submit = pd.read_csv("test.csv")

# # clean-up: remove #tags, http links and special symbols
# df_submit.tweet = df_submit.tweet.apply(lambda x: x[2:-2])
# df_submit.tweet = df_submit.tweet.apply(lambda x: re.sub(r'http\S+', '', x))
# df_submit.tweet = df_submit.tweet.apply(lambda x: re.sub(r'[@|#]\S*', '', x))
# df_submit.tweet = df_submit.tweet.apply(lambda x: re.sub(r'"+', '', x))
#
# # Remove arabic signs
# df_submit.tweet = df_submit.tweet.apply(lambda x: re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', x))
#
# # Remove repeated letters like "الللللللللللللللله" to "الله"
# df_submit.tweet = df_submit.tweet.apply(lambda x: x[0:2] + ''.join([x[i] for i in range(2, len(x)) if x[i]!=x[i-1] or x[i]!=x[i-2]]))
#
# # remove stop words
# df_submit.iloc[:,0] = df_submit.iloc[:,0].apply(lambda x: '' if x in arabic_stop_words else x)
df_submit=data_preprocessing(df_submit)

df_submit["tweet"] = df_submit.tweet.apply(lambda x: data_cleaning(x))

# Tokenize the sentences using bert tokenizer
df_submit["bert_tokens"] = df_submit.tweet.apply(lambda x: tokenizer(x).tokens())

  em_split_emoji = emoji.get_emoji_regexp().split(em)


In [39]:
bert_tokens_submit = df_submit["bert_tokens"]

In [40]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 64
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids_submit = [tokenizer.convert_tokens_to_ids(x) for x in bert_tokens_submit]
# Pad our input tokens
input_ids_submit = pad_sequences(input_ids_submit, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks_submit = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_submit:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_submit.append(seq_mask)

In [41]:
# Convert all of our data into torch tensors, the required datatype for our model
inputs_submit = torch.tensor(input_ids_submit)
masks_submit = torch.tensor(attention_masks_submit)

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
batch_size = 64
submit_data = TensorDataset(inputs_submit, masks_submit)

# do not use shuffle, we need the preds to be in same order
submit_dataloader = DataLoader(submit_data, batch_size=batch_size)#, shuffle=True)

In [42]:
# Put the model in an evaluation state
model.eval()

# Transfer model to GPU
model.to("cuda")

outputs = []
for input, masks in submit_dataloader:
  torch.cuda.empty_cache() # empty the gpu memory

  # Transfer the batch to gpu
  input = input.to('cuda')
  masks = masks.to('cuda')

  # Run inference on the batch
  output = model(input, attention_mask=masks)["logits"]

  # Transfer the output to CPU again and convert to numpy
  output = output.cpu().detach().numpy()

  # Store the output in a list
  outputs.append(output)

# Concatenate all the lists within the list into one list
outputs = [x for y in outputs for x in y]

# Inverse transform the label encoding
pred_flat = np.argmax(outputs, axis=1).flatten()
output_labels = lable_encoder.inverse_transform(pred_flat)

In [43]:
submission = pd.DataFrame({"Id":np.arange(1, len(output_labels)+1), "class":output_labels})
# save (submission)
submission.to_csv("submission18.csv", index=False)