In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Data and splitting them 

In [2]:
df = pd.read_csv('archive (1)\Arabic Sentiment Analysis Dataset - SS2030.csv')
df.head()

Unnamed: 0,text,Sentiment
0,حقوق المرأة 💚💚💚 https://t.co/Mzf90Ta5g1,1
1,RT @___IHAVENOIDEA: حقوق المرأة في الإسلام. ht...,1
2,RT @saud_talep: Retweeted لجنة التنمية بشبرا (...,1
3,RT @MojKsa: حقوق المرأة التي تضمنها لها وزارة ...,1
4,RT @abm112211: ولي امر الزوجة او ولي الزوجة او...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4252 entries, 0 to 4251
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       4252 non-null   object
 1   Sentiment  4252 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 66.6+ KB


In [4]:
df['Sentiment'].unique()

array([1, 0], dtype=int64)

In [5]:
df['Sentiment'].value_counts()

Sentiment
1    2436
0    1816
Name: count, dtype: int64

In [6]:
df.sample(10)

Unnamed: 0,text,Sentiment
1409,@QTF_3 يا رجال سوالف فسويات وفسويين. ما ينصدق,0
3379,#عطاله_التخصصات_الصحيه١\n @tfrabiah @aalaiban8...,0
2169,RT @navyteerih: #سعوديات_نطلب_اسقاط_الولايه819...,1
3039,#وين_قرار_حل_عطالة_اطباء_الاسنان# حسبي الله ون...,0
3321,@Aliceandraiet ياحبي له\n بس نفسي احد يوصله صو...,0
2969,حسبي الله ونعم الوكيل #وين_قرار_حل_عطالة_اطباء...,0
3998,#مهندسون_معطلون_في_ياهلا الحلول الموقته للوضاي...,0
3713,#مع_او_ضد_عمل_وقياده_المراه\n \n %95 من الناس ...,1
2677,@yas0020 @AJABreaking طبعا، المتهم بريء حتى تث...,0
1494,@mickelguirguis لو احنا طبقنا سياسة تعدد الزوج...,1


# Cleaning data

In [7]:
import emoji

def remove_english_char(df):
    
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+|"
        r"[A-Za-z.@:()\n/_0-9]", flags=re.UNICODE)

    df['text'] = df['text'].apply(lambda x: emoji_pattern.sub(r'', str(x)))
    return df


clean_df = remove_english_char(df)
    

In [8]:
clean_df.head()

Unnamed: 0,text,Sentiment
0,حقوق المرأة,1
1,حقوق المرأة في الإسلام,1
2,لجنة التنمية بشبرا ما زال التسجيل مستمر ف...,1
3,حقوق المرأة التي تضمنها لها وزارة العدل,1
4,ولي امر الزوجة او ولي الزوجة او ولي المراة م...,1


# TF-IDF (Embedder)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectroizer = TfidfVectorizer()
X = vectroizer.fit_transform(clean_df['text'])
vectroizer.get_feature_names_out()

array(['آااهه', 'آبار', 'آبتس', ..., '٩٩', 'گله', 'گلهم'], dtype=object)

## Note TF-IDF producses Sparse matrix

In [10]:
X

<4252x24512 sparse matrix of type '<class 'numpy.float64'>'
	with 83991 stored elements in Compressed Sparse Row format>

In [11]:
y = clean_df['Sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
X_train

<3401x24512 sparse matrix of type '<class 'numpy.float64'>'
	with 67315 stored elements in Compressed Sparse Row format>

In [14]:
y_train

2921    1
1444    0
3793    0
3121    0
1627    1
       ..
3444    0
466     1
3092    0
3772    0
860     0
Name: Sentiment, Length: 3401, dtype: int64

# Naive bayes

In [15]:
gnb = GaussianNB()

# Training

In [16]:
X_train=X_train.toarray()
gnb.fit(X_train,y_train)

In [17]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
X_test = X_test.toarray()


# Testing

In [19]:
y_pred=gnb.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred=y_pred)
print(f"Accuracy of GNB:{accuracy}")

Accuracy:0.7720329024676851


# Inference Pipe line

In [21]:
import emoji

def remove_english_char(df):
    
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+|"
        r"[A-Za-z.@:()\n/_0-9]", flags=re.UNICODE)
    print(f"Text after transforming {df}")
    return df


txt = remove_english_char('حمد ')

Text after transforming حمد 


In [22]:
lst_infer = [txt]
lst_infer_transformed = vectroizer.transform(lst_infer)
lst_pred = gnb.predict(lst_infer_transformed.toarray())


In [23]:
feature_names = vectroizer.get_feature_names_out()
mean = gnb.theta_

In [24]:
import pandas as pd
means_df = pd.DataFrame(mean, columns=feature_names)
means_df.head()

Unnamed: 0,آااهه,آبار,آبتس,آت,آثم,آثناء,آجل,آخر,آخرتها,آخره,...,٦٠٠٠,٧سنين,٧٠,٧٠٠,٨٠,٨٠٠,٨١٠,٩٩,گله,گلهم
0,0.0,0.0,0.000126,0.0,0.0,0.0,0.000285,0.000477,0.000133,0.0,...,0.000142,0.000124,0.000575,0.000124,0.000318,0.000118,0.0,0.000138,0.000118,0.000118
1,0.0,8.4e-05,0.0,0.000108,0.000424,9.3e-05,0.0,0.00055,0.0,9.8e-05,...,0.0,0.0,0.0,0.0,0.0,9.7e-05,0.000144,0.0,0.0,0.0


In [25]:

word_to_search = 'حمد'  

if word_to_search in means_df.columns:
    word_probabilities = means_df[word_to_search]
    print(f"Mean values for '{word_to_search}':")
    print(word_probabilities)


Mean values for 'حمد':
0    0.000000
1    0.000167
Name: حمد, dtype: float64


# Part 2

In [26]:
import nltk
X_tokenized = clean_df['text'].apply(lambda x :nltk.word_tokenize(x))

In [27]:
type(X_tokenized)

pandas.core.series.Series

In [28]:
X_tokenized

0                                          [حقوق, المرأة]
1                             [حقوق, المرأة, في, الإسلام]
2       [لجنة, التنمية, بشبرا, ما, زال, التسجيل, مستمر...
3         [حقوق, المرأة, التي, تضمنها, لها, وزارة, العدل]
4       [ولي, امر, الزوجة, او, ولي, الزوجة, او, ولي, ا...
                              ...                        
4247                             [#, غردبحبكلمحمدبنسلمان]
4248    [#, غردبحبكلمحمدبنسلمان, محمدبن, سلمان, احبه, ...
4249    [#, غردبحبكلمحمدبنسلمان, الله, يحفظك, يا, ذخر,...
4250    [#, غردبحبكلمحمدبنسلمان, الله, يحفظه, ويحميه, ...
4251           [#, غردبحبكلمحمدبنسلمان, احبه, احبه, ياخي]
Name: text, Length: 4252, dtype: object

In [29]:
X_tokenized = X_tokenized.apply(lambda x: ' '.join(x))


In [30]:
vectroizer_t = TfidfVectorizer()

X_tokenized_tf = vectroizer_t.fit_transform(X_tokenized)

In [31]:
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_tokenized_tf, y, test_size=0.2, random_state=0)


In [32]:
# X_train_t = X_train_t.apply(lambda x: ' '.join(x))
# X_test_t = X_test_t.apply(lambda x: ' '.join(x))

In [33]:
gnb.fit(X_train_t.toarray(),y_train_t)

In [34]:
X_train_t.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
y_pred_t = gnb.predict(X_test_t.toarray())

In [36]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_t,y_pred=y_pred_t)
print(f"Accuracy:{accuracy}")

Accuracy:0.7708578143360753


# We got the Same result without tokinzation.
# Because using a tokenizaer is usless when using tf-idf 
# Because tf-idf tokenize the text by it self

In [37]:
X_tokenized

0                                             حقوق المرأة
1                                  حقوق المرأة في الإسلام
2       لجنة التنمية بشبرا ما زال التسجيل مستمر في دور...
3                 حقوق المرأة التي تضمنها لها وزارة العدل
4       ولي امر الزوجة او ولي الزوجة او ولي المراة من ...
                              ...                        
4247                                # غردبحبكلمحمدبنسلمان
4248    # غردبحبكلمحمدبنسلمان محمدبن سلمان احبه الله و...
4249    # غردبحبكلمحمدبنسلمان الله يحفظك يا ذخر الوطن ...
4250    # غردبحبكلمحمدبنسلمان الله يحفظه ويحميه ويقويه...
4251                 # غردبحبكلمحمدبنسلمان احبه احبه ياخي
Name: text, Length: 4252, dtype: object

In [38]:
vectroizer.get_feature_names_out()

array(['آااهه', 'آبار', 'آبتس', ..., '٩٩', 'گله', 'گلهم'], dtype=object)