In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import ktrain
from ktrain import text
from sklearn.metrics import ConfusionMatrixDisplay
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
import re
import string
from nltk.tokenize import RegexpTokenizer

In [3]:
# Load the labelled corpus; the relative path is resolved against the current working directory.
final_data = pd.read_csv("all_data.csv")

In [4]:
# Peek at the first five raw rows.
final_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,﻿حقا\n,pos
1,1,صح جدا\n,pos
2,2,﻿اه و النعمة \n,pos
3,3,كلامك جميل ورائع\n,pos
4,4,﻿و خير الكلام ما قل و دل\n,pos


In [5]:
# Per-column flag: True if the column contains at least one missing value.
final_data.isna().any()

Unnamed: 0    False
text          False
sentiment     False
dtype: bool

In [6]:
# Delete a fixed set of stray characters from every text entry.
# The column is coerced to str once up front (missing values would become the string "nan").
# NOTE(review): '#' and '.' are removed here, so the later hashtag and `www.` handling in
# processPost sees text that no longer contains those characters -- confirm this is intended.
final_data['text'] = final_data['text'].astype(str)
for letter in '#.][!XR':
    # regex=False: '.', '[' and ']' are regex metacharacters; treat every character literally
    # regardless of the pandas version's default for `regex`.
    final_data['text'] = final_data['text'].str.replace(letter, '', regex=False)

In [7]:
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Deletion table built once from punctuations_list, so that applying
# remove_punctuations to many rows does not rebuild it on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return ``text`` with every character found in ``punctuations_list`` deleted.

    The list combines the Arabic punctuation set defined above (which also
    contains the tatweel character) with ``string.punctuation``. Characters
    are removed outright, not replaced by spaces.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without the listed punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [8]:
def remove_non_arabic(text):
    """Keep only Arabic characters and single spaces.

    The input is coerced with ``str()``. Every character outside the ranges
    U+0621-U+063A and U+0640-U+0652 (other than a plain space) is replaced by
    a space; the result is then split on whitespace and re-joined with single
    spaces, which also trims both ends.
    """
    spaced = re.sub(u"[^\u0621-\u063A\u0640-\u0652 ]", " ", str(text), flags=re.UNICODE)
    tokens = spaced.split()
    return ' '.join(tokens)

In [9]:
def normalize_arabic(text):
    """Map orthographic variants of a few Arabic letters onto one canonical form.

    Applies, in order, the (pattern, replacement) pairs listed below with
    ``re.sub`` and returns the resulting string.
    """
    substitutions = (
        ("[إأآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),       # alef maksura -> yeh
        ("ة", "ه"),       # teh marbuta -> heh
        ("گ", "ك"),       # gaf -> kaf
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [10]:
def remove_repeating_char(text):
    """Collapse every run of one repeated character into a single occurrence.

    Any character except a newline counts (``.`` does not match ``\\n``), so
    legitimately doubled letters and repeated spaces are collapsed as well.
    """
    run_of_same_char = re.compile(r'(.)\1+')
    return run_of_same_char.sub(lambda run: run.group(1), text)

In [11]:
# Character class for emoji and related pictographic symbols, compiled once so
# that applying remove_emoji row by row does not rebuild the pattern.
# The BMP part is limited to the listed symbol blocks: a single range from
# U+24C2 up to U+1F251 would also delete CJK, Hangul, Arabic presentation
# forms and most other BMP text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0001F926-\U0001F937"  # gesture / person emoji
    "\U00010000-\U0010FFFF"  # all remaining supplementary-plane code points
    "\u2500-\u2BEF"          # box drawing through miscellaneous symbols and arrows
    "\u2702-\u27B0"          # dingbats
    "\u2600-\u2B55"          # miscellaneous symbols through U+2B55
    "\u2640-\u2642"          # gender signs
    "\u24C2"                 # circled latin capital M
    "\u200d"                 # zero-width joiner used in emoji sequences
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Matching characters are removed outright (not replaced by a space).
    Ordinary text in any script of the Basic Multilingual Plane is left
    untouched; every supplementary-plane character is removed.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without the characters matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [12]:
def processPost(tweet):
    """Clean one raw post and return the cleaned string.

    Steps, in order:
      1. replace ``@username`` mentions with a space;
      2. replace ``www.*`` and ``http(s)://*`` URLs with a space;
      3. turn ``#word`` into ``word``;
      4. delete punctuation (``remove_punctuations``);
      5. normalise letter variants (``normalize_arabic``);
      6. delete emoji (``remove_emoji``).

    Parameters
    ----------
    tweet : str
        Raw text of a single post.

    Returns
    -------
    str
        The cleaned text.
    """
    # Regex literals are raw strings so that ``\s`` and ``\.`` reach the regex
    # engine as written instead of being invalid string escape sequences.

    # Replace @username with a space
    tweet = re.sub(r'@[^\s]+', ' ', tweet)

    # Convert www.* or https?://* to " "
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)

    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Remove punctuation
    tweet = remove_punctuations(tweet)

    # Normalise letter variants
    tweet = normalize_arabic(tweet)

    # Collapsing repeated letters is intentionally not done here; the notebook
    # applies remove_repeating_char to the whole column in a later cell.

    # Remove emoji
    tweet = remove_emoji(tweet)

    return tweet

In [13]:
# Clean every post; the function is passed directly, no lambda wrapper needed.
final_data["text"] = final_data["text"].apply(processPost)

In [14]:
# Strip everything that is not an Arabic character or a space, element by element.
final_data["text"] = final_data["text"].map(remove_non_arabic)

In [15]:
# First five rows after cleaning.
final_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,حقا,pos
1,1,صح جدا,pos
2,2,اه و النعمه,pos
3,3,كلامك جميل ورائع,pos
4,4,و خير الكلام ما قل و دل,pos


In [16]:
# (rows, columns) of the frame at this point.
final_data.shape

(156407, 3)

In [17]:
# Drop the leftover index column that was read from the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
final_data = final_data.drop(columns='Unnamed: 0', errors='ignore')
final_data.head()

Unnamed: 0,text,sentiment
0,حقا,pos
1,صح جدا,pos
2,اه و النعمه,pos
3,كلامك جميل ورائع,pos
4,و خير الكلام ما قل و دل,pos


In [18]:
# Collapse runs of a repeated character in every post.
final_data["text"] = final_data["text"].apply(remove_repeating_char)

In [31]:
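# Disabled tokenisation experiment. Note: final_data has no "tweet_text" column (the text column is "text").
#tokenizer = RegexpTokenizer(r'\w+')
#final_data["tweet_text"] = final_data["tweet_text"].apply(tokenizer.tokenize)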

In [20]:
# Spot-check five random cleaned posts (unseeded, so the rows differ per run).
final_data.text.sample(n=5)

94572    من غبت وانا اضيق ماعرف ابتسم مدري اواخذ نفسي و...
7553                                           ياربي صبرني
61734    سالوني اي رجل تحبين ف قلت من انتظرني تسعه اشهر...
23762                                     تصوير جميلهومبدع
96025    كارين عطيه محره صفحه الشون الدوليه في الواشنطن...
Name: text, dtype: object

In [21]:
# Draw a reproducible 80% random sample of the rows for training.
df_train = final_data.sample(
    frac=0.80,
    random_state=200,
)

df_train.shape

(125126, 2)

In [22]:
# The test set is every row whose index label was not sampled into df_train.
df_test = final_data.drop(index=df_train.index)

df_test.shape

(31281, 2)

In [75]:
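# Disabled alternative split via train_test_split. Note: the column names below ("tweet_text", "label") do not match final_data's "text" and "sentiment" columns.
#X = final_data['tweet_text'].values
#y=final_data['label'].values
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)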

In [23]:
# Training hyperparameters
epochs = 3        # number of training epochs
lr = 2e-5         # learning rate handed to the one-cycle schedule
batch_size = 16   # batch size handed to the learner
maxlen = 64       # maximum sequence length handed to the Transformer preprocessor

In [24]:
# Name of the pretrained checkpoint handed to ktrain's Transformer wrapper.
MODEL_NAME = 'aubmindlab/bert-base-arabertv01'
# `t` is the preprocessor object reused below for preprocessing, building the classifier and the predictor.
t = text.Transformer(MODEL_NAME, maxlen=maxlen)

In [25]:
# Convert the raw texts and sentiment labels of each split into the datasets consumed by the learner.
trn = t.preprocess_train(df_train.text.values, df_train.sentiment.values)
tst = t.preprocess_test(df_test.text.values, df_test.sentiment.values)

preprocessing train...
language: ar
train sequence lengths:
	mean : 12
	95percentile : 24
	99percentile : 37


Is Multi-Label? False
preprocessing test...
language: ar
test sequence lengths:
	mean : 12
	95percentile : 24
	99percentile : 38


In [26]:
# Build the classifier from the Transformer preprocessor and wrap it in a ktrain learner.
model = t.get_classifier()
# NOTE(review): `tst` is used as validation data here and is also the set scored by
# learner.validate further down -- confirm that no separate held-out test set is needed.
learner = ktrain.get_learner(model, train_data=trn, val_data=tst, batch_size=batch_size)

In [27]:
# Train for `epochs` epochs with the one-cycle policy, using `lr` as the maximum learning rate.
history = learner.fit_onecycle(lr, epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# Print the classification report and return the confusion matrix for `tst`.
# class_names labels the report rows with the actual sentiment classes instead of 0/1.
learner.validate(val_data=tst, class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.75      0.69      0.72     12597
           1       0.80      0.85      0.82     18684

    accuracy                           0.78     31281
   macro avg       0.78      0.77      0.77     31281
weighted avg       0.78      0.78      0.78     31281



array([[ 8733,  3864],
       [ 2870, 15814]])

In [29]:
# Bundle the trained model with its preprocessor `t` so that raw strings can be passed to predict().
p = ktrain.get_predictor(learner.model, t)

In [31]:
# Run the sample through the same cleaning steps that were applied to the training
# text (processPost -> remove_non_arabic -> remove_repeating_char); otherwise the model
# sees un-normalised characters it never encountered during training.
sample_text = "الهلال افضل فريق كورة"
p.predict(remove_repeating_char(remove_non_arabic(processPost(sample_text))))

'pos'

In [32]:
# Bare reference: evaluating this only displays the function object and its signature; nothing is loaded.
ktrain.load_predictor

<function ktrain.core.load_predictor(fpath, batch_size=32, custom_objects=None)>

In [38]:
# Persist the predictor under this relative path.
# Presumably it can be restored later with ktrain.load_predictor -- confirm.
p.save('ar-bert-model-dataset2')