In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

In [10]:
# Loading Data with xlsx format but Pandas blocks this format for security reasons so we used 'openpyxl' engine to load the data
data = pd.read_excel('AJGT.xlsx', engine='openpyxl')
print(data.head())
print(data.sample(5))

   ID                                               Feed Sentiment
0   1   اربد فيها جامعات اكثر من عمان ... وفيها قد عم...  Positive
1   2   الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...  Negative
2   3                            كله رائع بجد ربنا يكرمك  Positive
3   4                                 لسانك قذر يا قمامه  Negative
4   5  ​انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش...  Negative
        ID                                               Feed Sentiment
1776  1777                            يع حاسه حالي بدي استفرغ  Negative
1473  1474  نعم احسنت واجبه على كل مسلم وحق لكل مسلم فتح ا...  Positive
326    327                   اللهم اجعلني من يتشرف به الاسلام  Positive
1283  1284                     لو نريد الخشوع حقا نطلق الدنيا  Positive
1098  1099                                          قرف يقرفك  Negative


In [11]:
# To download stoping words from NLTK
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [12]:
'''
The first step is to subject the data to preprocessing.
This involves removing both arabic and english punctuation
Normalizing different letter variants with one common letter
'''
# first we define a list of arabic and english punctiations that we want to get rid of in our text

punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Arabic stop words with nltk
stop_words = stopwords.words()

arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def preprocess(text):
    
    '''
    text is an arabic string input
    
    the preprocessed text is returned
    '''
    
    #remove punctuations
    translator = str.maketrans('', '', punctuations)
    text = text.translate(translator)
    
    # remove Tashkeel
    text = re.sub(arabic_diacritics, '', text)
    
    #remove longation
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text
  
data['Feed'] = data['Feed'].apply(preprocess)
print(data.head(5))

   ID                                               Feed Sentiment
0   1  اربد جامعات اكثر عمان وفيها عمان ونص لعيبه الم...  Positive
1   2   الحلو انكم بتحكوا علي اساس انو الاردن فساد سرقات  Negative
2   3                            كله راءع بجد ربنا يكرمك  Positive
3   4                                    لسانك قذر قمامه  Negative
4   5  ​انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش...  Negative


In [13]:

# splitting the data into target and feature
feature = data.Feed
target = data.Sentiment
# splitting into train and tests
X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size =.2, random_state=100)

# make pipeline
#TF-IDF is term frequency-inverse document frequency, indicates what the importance of the word is in order to understand the document or dataset
pipe = make_pipeline(TfidfVectorizer(), 
                    LogisticRegression())
# make param grid
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

# create and fit the model
model = GridSearchCV(pipe, param_grid, cv=5)
model.fit(X_train,Y_train)

# make prediction and print accuracy
prediction = model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))


Accuracy score is 0.83
              precision    recall  f1-score   support

    Negative       0.85      0.80      0.82       176
    Positive       0.82      0.86      0.84       184

    accuracy                           0.83       360
   macro avg       0.83      0.83      0.83       360
weighted avg       0.83      0.83      0.83       360



In [14]:
pipe = make_pipeline(TfidfVectorizer(),
                    MultinomialNB())
pipe.fit(X_train,Y_train)
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.83
              precision    recall  f1-score   support

    Negative       0.89      0.76      0.82       176
    Positive       0.80      0.91      0.85       184

    accuracy                           0.83       360
   macro avg       0.84      0.83      0.83       360
weighted avg       0.84      0.83      0.83       360



In [15]:
pipe = make_pipeline(TfidfVectorizer(),
                    RandomForestClassifier())

param_grid = {'randomforestclassifier__n_estimators':[10, 100, 1000],
             'randomforestclassifier__max_features':['sqrt', 'log2']}

rf_model = GridSearchCV(pipe, param_grid, cv=5)
rf_model.fit(X_train,Y_train)

prediction = rf_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")

Accuracy score is 0.82


In [16]:
pipe = make_pipeline(TfidfVectorizer(),
                     SVC())
param_grid = {'svc__kernel': ['rbf', 'linear', 'poly'],
             'svc__gamma': [0.1, 1, 10, 100],
             'svc__C': [0.1, 1, 10, 100]}

svc_model = GridSearchCV(pipe, param_grid, cv=3)
svc_model.fit(X_train, Y_train)

prediction = svc_model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

Accuracy score is 0.85
              precision    recall  f1-score   support

    Negative       0.83      0.87      0.85       176
    Positive       0.87      0.83      0.85       184

    accuracy                           0.85       360
   macro avg       0.85      0.85      0.85       360
weighted avg       0.85      0.85      0.85       360

