In [1]:
pip install pyarabic



In [2]:
#import libraries
import numpy as np
import pandas as pd
import pickle
import re
import matplotlib.pyplot as plt
from itertools import groupby
from pyarabic.araby import *
from sklearn.model_selection import train_test_split
pd.options.display.max_rows = 100

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
pip install pickle5



In [5]:
import pickle5 as pickle

In [6]:
with open('/content/drive/MyDrive/fetched_dialect_dataset.pkl','rb') as hf:
  dataset = pickle.load(hf)

In [7]:
dataset.head()

Unnamed: 0,text,dialect
0,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,@KanaanRema مبين من كلامه خليجي,IQ
3,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ


In [8]:
def filter_text(text):
    
    """
    Function to filter text
    
    Parameters:
      * text(string): text that is filtered from special characters and others
      
    Return text(string): filtered text using regular expression
    """
    #filter text by removing special characters -> not arabic words or numbers -> trim spcases -> strip to remove spaces at ends

    filtered_text = " ".join(re.findall('[\u0600-\u06ff]+',text))
    filtered_text = re.sub('\s+',' ',re.sub('[٠-٩؟،]','',filtered_text))
    filtered_text = re.sub("[إأآا]", "ا", filtered_text)
    filtered_text = "".join(c for c, _ in groupby(filtered_text))
    filtered_text = strip_tashkeel(filtered_text)
    filtered_text = strip_lastharaka(filtered_text)
    filtered_text = strip_tatweel(filtered_text)

    return filtered_text
    

In [9]:
#copy dataset
data = dataset.copy()

In [10]:
data['text'] = dataset['text'].apply(lambda x:filter_text(x))

In [11]:
data.drop(index=data[data['text']==''].index.to_list(),inplace=True)

In [12]:
data.head()

Unnamed: 0,text,dialect
0,لكن بالنهاية ينتفض يغير,IQ
1,يعني هذا محسوب على البشر حيونه وحشيه وتطلبون م...,IQ
2,مبين من كلامه خليجي,IQ
3,يسلملي مرورك وروحك الحلوه,IQ
4,وين هل الغيبه اخ محمد,IQ


In [13]:
from sklearn.model_selection import train_test_split
train_data ,valid_data = train_test_split(data,test_size=0.2,random_state=0)

In [14]:
valid_data,test_data = train_test_split(valid_data,test_size=0.5,random_state=0)

In [15]:
print(train_data.shape)
print(valid_data.shape)
print(test_data.shape)

(366483, 2)
(45810, 2)
(45811, 2)


In [16]:
max([len(i) for i in data['text'].to_list()])

280

In [17]:
pip install transformers



In [18]:
from transformers import BertTokenizer,TFBertModel,BertConfig,TFBertForSequenceClassification
import tensorflow as tf

In [19]:

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_LEN = 64
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)

    return input_ids, attention_masks

In [20]:
X_train = train_data['text']
y_train = train_data['dialect']
X_valid = valid_data['text']
y_valid = valid_data['dialect']

In [21]:
from sklearn.preprocessing import OrdinalEncoder
ord = OrdinalEncoder()

In [22]:
train_input,train_mask = preprocessing_for_bert(X_train)
val_input,val_mask = preprocessing_for_bert(X_valid)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
train_label = tf.convert_to_tensor(ord.fit_transform(np.array(y_train).reshape(-1,1)))
valid_label = tf.convert_to_tensor(ord.transform(np.array(y_valid).reshape(-1,1)))

In [24]:
train_label

<tf.Tensor: shape=(366483, 1), dtype=float64, numpy=
array([[15.],
       [ 8.],
       [ 5.],
       ...,
       [11.],
       [ 1.],
       [ 7.]])>

In [25]:
print('original: ',X_train[0])

original:  لكن بالنهاية ينتفض يغير


In [26]:
print('encoded: ',preprocessing_for_bert([X_train[0]])[0])

encoded:  tf.Tensor(
[[  101  1294 29835 15915  1271 25573 23673 15915 14157 25573 14498 19433
   1300 15915 29817 29833 29827  1300 29831 14498 17149   102     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]], shape=(1, 64), dtype=int32)




In [27]:
train_mask[0]

<tf.Tensor: shape=(64,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)>

In [28]:
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=18)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  13842     
                                                                 
Total params: 109,496,082
Trainable params: 109,496,082
Non-trainable params: 0
_________________________________________________________________


In [30]:
bert_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  with tf.device('/CPU:0'):
      history = bert_model.fit([train_input,train_mask],train_label, epochs=10, batch_size=128,
                          validation_data=([val_input,val_mask],valid_label))

Epoch 1/10
