<a href="https://colab.research.google.com/github/Mohamed-S-Helal/-Finding-Donors-for-Charity-ML-Project/blob/main/arabic_sentiment_analysis_in_tweets_nb_bow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Sentiment Analysis in tweets using Naive Bayes Machine learning Algorithm and unigram features

In [934]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections
from builtins import chr
from nltk import word_tokenize, ngrams


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

# define functions 

In [935]:
import re
from itertools import islice
from nltk.corpus import stopwords
nltk.download('stopwords')



def arabicrange():
        """return a list of arabic characteres .
        Return a list of characteres between \u060c to \u0652
        @return: list of arabic characteres.
        @rtype: unicode;
        """
        mylist = [];
        for i in range(0x0620, 0x00650):
            try :
                mylist.append(chr(i));
            except ValueError:
                pass;
        return mylist[:-5]+[' '];

arb = arabicrange()
stopwords_list = stopwords.words('arabic')

def clean_raw_review(body):
        # return ''.join(c for c in body if c in arb)
        return ''.join(c if c in arb else ' ' for c in body)

         # patterns to remove first
        pat = [\
            (u'http[s]?://[a-zA-Z0-9_\-./~\?=%&]+', u''),  # remove links
            (u'www[a-zA-Z0-9_\-?=%&/.~]+', u''),
#            u'\n+': u' ',                     # remove newlines
            (u'<br />', u' '),  # remove html line breaks
            (u'</?[^>]+>', u' '),  # remove html markup
#            u'http': u'',
            (u'[a-zA-Z]+\.org', u''),
            (u'[a-zA-Z]+\.com', u''),
            (u'://', u''),
            (u'&[^;]+;', u' '),
            (u':D', u':)'),
#            (u'[0-9/]+', u''),
#            u'[a-zA-Z.]+': u'',
#            u'[^0-9' + u''.join(self.arabicrange()) + \
#                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
#                u'\s^><\-_\u201D\u00AB=\u2026]+': u'',          # remove latin characters
            (u'\s+', u' '),  # remove spaces
            (u'\.+', u'.'),  # multiple dots
            (u'[\u201C\u201D]', u'"'),  # “
            (u'[\u2665\u2764]', u''),  # heart symbol
            (u'[\u00BB\u00AB]', u'"'),
            (u'\u2013', u'-'),  # dash
        ]

        # patterns that disqualify a review
        remove_if_there = [\
            (u'[^0-9' + u''.join(arabicrange()) + \
                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
                u'\s\^><\-_\u201D\u00AB=\u2026+|' + \
                u'\u0660-\u066D\u201C\u201D' + \
                u'\ufefb\ufef7\ufef5\ufef9]+', u''),  # non arabic characters
        ]

        # patterns that disqualify if empty after removing
        remove_if_empty_after = [\
            (u'[0-9a-zA-Z\-_]', u' '),  # alpha-numeric
            (u'[0-9' + u".,!;:$%&*%'#(){}~`\[\]/\\\\\"" + \
                u'\s\^><`\-=_+]+', u''),  # remove just punctuation
            (u'\s+', u' '),  # remove spaces
        ]

        # remove again
        # patterns to remove
        pat2 = [\
#            u'[^0-9' + u''.join(self.arabicrange()) + \
#                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
#                u'\s^><\-_\u201D\u00AB=\u2026]+': u'',          # remove latin characters
        ]

        skip = False

        # if empty body, skip
        if body == u'': skip = True

        # do some subsitutions
        for k, v in pat:
            body = re.sub(k, v, body)

        # remove if exist
        for k, v in remove_if_there:
            if re.search(k, body):
                skip = True

        # remove if empty after replacing
        for k, v in remove_if_empty_after:
            temp = re.sub(k, v, body)
            if temp == u" " or temp == u"":
                skip = True

        # do some more subsitutions
        if not skip:
            for k, v in pat2:
                body = re.sub(k, v, body)

        # if empty string, skip
        if body == u'' or body == u' ':
            skip = True

        if not skip:
            return body
        else:
            return u""

def load_tsv(data_file, n):
    data_features = list()
    data = list()
    # infile = open(data_file, encoding='utf-8')

    infile = data_file.to_numpy()
    for line in infile:
        # if not line.strip():
        #     continue
        
        # text,label = line.split(',')
        text,label = line
        text = clean_raw_review(text)
        text_features = process_text(text, n)
        if text_features:
            text_features = [s for s in text_features if s and len(s)>1 and s not in stopwords_list]
            if not text_features:
              continue
            # print(text_features)
            text_features = text_features +\
              rec_char_ngram(text_features,3)+\
              rec_char_ngram(text_features,4)

            data_features += text_features
            data.append((text_features, label))
        
    return data, data_features

def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=True,
                 ):
    clean_text = text
    # if remove_vowel_marks:
    #     clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [' '.join(g) for g in list(window(tokens, i))]

        return grams

def char_ngram(s, n):
  if len(s)>n:
    return [s[i:i+n] for i in range(len(s)-n+1)]
  else:
    return s

def rec_char_ngram(ls,n):
  ls = [i for i in ls if len(i)>n]
  return [j for jj in  [char_ngram(i,n) for i in ls] for j in jj]
    



def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load corpus

In [936]:
for i in [1,2,4]:
  if True:
    continue
  print(i)

In [937]:
l = ['assds', 'asasdaads', 'asdaasd','jj']

In [938]:
rec_char_ngram(l,3)

['ass',
 'ssd',
 'sds',
 'asa',
 'sas',
 'asd',
 'sda',
 'daa',
 'aad',
 'ads',
 'asd',
 'sda',
 'daa',
 'aas',
 'asd']

In [939]:
char_ngram('as', 4)

'as'

In [940]:
''.join(('s','t'))

'st'

In [941]:
char_ngram("asaddddfgg",4)

['asad', 'sadd', 'addd', 'dddd', 'dddf', 'ddfg', 'dfgg']

In [942]:
char_ngram('student',3)

['stu', 'tud', 'ude', 'den', 'ent']

In [943]:
df = pd.read_csv('/content/CY_train.csv')
clean_raw_review(df.iloc[6,0])

'  يا عيون عطشان عطشانة سهر يا قلوب تعبانة تعبانة سفر كتروا من الحب تلاقوا تلاقوا في الضلمة ألف قمر  '

In [944]:
# pos_train_file = '../input/train_Arabic_tweets_positive_20190413.tsv'
# neg_train_file = '../input/train_Arabic_tweets_negative_20190413.tsv'

# pos_test_file = '../input/test_Arabic_tweets_positive_20190413.tsv'
# neg_test_file = '../input/test_Arabic_tweets_negative_20190413.tsv'
# print('data files')
# print('train file (pos)', pos_train_file)
# print('train file (neg)', neg_train_file)
# print('test file (pos)', pos_test_file)
# print('test file (neg)', neg_test_file)

# Parameters (ngrams)

In [945]:
print('parameters')
n = 1
print('n grams:', n)

parameters
n grams: 1


# loading train data .... 

In [946]:
# print('loading train data ....')
# pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
# neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
# print('loading test data ....')
# pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
# neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

# Training data information

In [947]:
# print('train data info')
# train_data = pos_train_data + neg_train_data
# print('train data size', len(train_data))
# print('# of positive', len(pos_train_data))
# print('# of negative', len(neg_train_data))

In [948]:
# train_data,feat = load_tsv('/content/CY_train.csv', n)

In [949]:
from sklearn.model_selection import train_test_split

In [950]:
train, test = train_test_split(df,stratify=df['class'],test_size=0.2)

In [951]:
train['class'].value_counts()

pos    836
neg    781
neu    579
Name: class, dtype: int64

In [952]:
test['class'].value_counts()

pos    210
neg    195
neu    145
Name: class, dtype: int64

In [953]:
from sklearn.naive_bayes import MultinomialNB

In [954]:
m = MultinomialNB()

In [955]:
train_data, train_feat = load_tsv(train,n)
test_data, test_feat = load_tsv(test,n)

In [956]:
test_data

[(['سكايب',
   'كول',
   'ظاظا',
   'تلت',
   'ساعات',
   'ونص',
   'عشان',
   'نخلص',
   'سكشن',
   'تمانية',
   'وهنام',
   'وش',
   'الصبح',
   'قحة',
   'سكا',
   'كاي',
   'ايب',
   'ظاظ',
   'اظا',
   'ساع',
   'اعا',
   'عات',
   'عشا',
   'شان',
   'نخل',
   'خلص',
   'سكش',
   'كشن',
   'تما',
   'مان',
   'اني',
   'نية',
   'وهن',
   'هنا',
   'نام',
   'الص',
   'لصب',
   'صبح',
   'سكاي',
   'كايب',
   'ساعا',
   'اعات',
   'تمان',
   'ماني',
   'انية',
   'وهنا',
   'هنام',
   'الصب',
   'لصبح'],
  'neg'),
 (['بدات',
   'الحن',
   'النشيد',
   'بدا',
   'دات',
   'الح',
   'لحن',
   'الن',
   'لنش',
   'نشي',
   'شيد',
   'النش',
   'لنشي',
   'نشيد'],
  'neu'),
 (['منا',
   'خدت',
   'بالي',
   'حلوة',
   'فعلا',
   'انا',
   'مش',
   'ميال',
   'ليها',
   'انا',
   'اصلا',
   'بحب',
   'المانفسيتو',
   'اوي',
   'اقوي',
   'ويسترن',
   'يونيون',
   'انا',
   'بحب',
   'ويسترن',
   'اكتر',
   'بال',
   'الي',
   'حلو',
   'لوة',
   'فعل',
   'علا',
   'ميا',
   'يال',
  

# Sample training data 

In [957]:
import random
sample_size = 100
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

100 random tweets .... 
(['الواحد', 'بقي', 'مبضون', 'الناس', 'اللي', 'حوله', 'حاجة', 'الو', 'لوا', 'واح', 'احد', 'مبض', 'بضو', 'ضون', 'الن', 'لنا', 'ناس', 'الل', 'للي', 'حول', 'وله', 'حاج', 'اجة', 'الوا', 'لواح', 'واحد', 'مبضو', 'بضون', 'النا', 'لناس'], 'neg')
(['أصابك', 'عشقا', 'أصا', 'صاب', 'ابك', 'عشق', 'شقا', 'أصاب', 'صابك'], 'neu')
(['الى', 'الكنترول', 'انهاردة', 'اخر', 'يوم', 'حرام', 'امك', 'بقى', 'عايزين', 'نشوف', 'دعاء', 'السباعي', 'ليث', 'ابو', 'جودة', 'ارحم', 'امنا', 'العيانة', 'الك', 'لكن', 'كنت', 'نتر', 'ترو', 'رول', 'انه', 'نها', 'هار', 'ارد', 'ردة', 'حرا', 'رام', 'عاي', 'ايز', 'يزي', 'زين', 'نشو', 'شوف', 'دعا', 'عاء', 'الس', 'لسب', 'سبا', 'باع', 'اعي', 'جود', 'ودة', 'ارح', 'رحم', 'امن', 'منا', 'الع', 'لعي', 'عيا', 'يان', 'انة', 'الكن', 'لكنت', 'كنتر', 'نترو', 'ترول', 'انها', 'نهار', 'هارد', 'اردة', 'عايز', 'ايزي', 'يزين', 'السب', 'لسبا', 'سباع', 'باعي', 'العي', 'لعيا', 'عيان', 'يانة'], 'neg')
(['اصلاا', 'ليث', 'ده', 'مصرري', 'ليث', 'جودة', 'اصل', 'صلا', 'لاا', 'مصر', 'صرر

# Test data info

In [958]:
# print('test data info')
# test_data = 
# print('test data size', len(train_data))
# print('# of positive', len(pos_test_data))
# print('# of negative', len(neg_test_data))

# merging all features ...

In [959]:
print('merging all features ... ')
all_features = train_feat + test_feat
print('len(all_features):', len(all_features))

merging all features ... 
len(all_features): 112650


# Sample features 

In [960]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

100 sample features ...
['عدش', 'ليثي', 'اضحكوا', 'مسا', 'قصو', 'البلونة', 'الم', 'اعه', 'احا', 'بيد', 'ستم', 'فضل', 'يحطو', 'فرح', 'الا', 'السا', 'حزان', 'تشبهني', 'حبيبه', 'واحش', 'لناد', 'الجر', 'شهي', 'الصع', 'حد', 'شان', 'السي', 'لات', 'يكو', 'دري', 'كسم', 'عايزة', 'يران', 'يرن', 'تايهين', 'خلي', 'لافتتاح', 'لأوط', 'لنكد', 'شرموطه', 'اعات', 'غرت', 'الد', 'جوه', 'سطي', 'لمن', 'روح', 'تحطلها', 'الص', 'جين', 'تأدية', 'الرح', 'رير', 'كاتف', 'كنت', 'ينا', 'بلقاء', 'وفيق', 'سود', 'ايف', 'محبت', 'انتر', 'خير', 'خلـقا', 'عقاب', 'وسيك', 'وهته', 'الزي', 'وبلن', 'اخدي', 'عارف', 'وال', 'ايدة', 'بيتنا', 'لمهم', 'قلب', 'شعل', 'لوق', 'وات', 'يارب', 'الدي', 'لأع', 'كري', 'الس', 'الن', 'وال', 'وعل', 'قعها', 'لعال', 'ازا', 'وال', 'رافع', 'وتعددت', 'فقك', 'صاد', 'لشا', 'حد', 'اقف', 'يبو', 'لمر']


# compute frequencies

In [961]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

In [962]:
all_features_count

{'صلاة': 9,
 'الفجر': 8,
 'ركعتين': 2,
 'ادين': 8,
 'ربنا': 86,
 'الدنيا': 39,
 'كلها': 20,
 'صلا': 46,
 'لاة': 9,
 'الف': 165,
 'لفج': 8,
 'فجر': 12,
 'ركع': 2,
 'كعت': 2,
 'عتي': 9,
 'تين': 30,
 'ادي': 58,
 'دين': 73,
 'ربن': 86,
 'بنا': 146,
 'الد': 127,
 'لدن': 45,
 'دني': 53,
 'نيا': 58,
 'كله': 75,
 'لها': 82,
 'الفج': 8,
 'لفجر': 8,
 'ركعت': 2,
 'كعتي': 2,
 'عتين': 4,
 'الدن': 43,
 'لدني': 43,
 'دنيا': 46,
 'مهو': 17,
 'شغال': 6,
 'عادى': 5,
 'المشكله': 6,
 'فى': 135,
 'الفون': 1,
 'حاول': 23,
 'تحب': 32,
 'اللاب': 1,
 'بقى': 58,
 'شغا': 7,
 'غال': 19,
 'عاد': 52,
 'ادى': 12,
 'الم': 488,
 'لمش': 30,
 'مشك': 22,
 'شكل': 31,
 'لفو': 11,
 'فون': 12,
 'حاو': 25,
 'اول': 75,
 'الل': 618,
 'للا': 21,
 'لاب': 34,
 'المش': 29,
 'لمشك': 13,
 'مشكل': 21,
 'شكله': 13,
 'الفو': 9,
 'لفون': 2,
 'اللا': 6,
 'للاب': 1,
 'انا': 295,
 'عايز': 38,
 'حد': 85,
 'يكون': 21,
 'يفضل': 6,
 'يايا': 1,
 'بدون': 5,
 'أسباب': 2,
 'دورت': 1,
 'فـ': 6,
 'كشف': 6,
 'اللى': 69,
 'أعرفهم': 1,
 'لقيتنى': 1,
 'أ

# Sample Frequency

In [963]:
print('sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

sample frequencies
[('ازله', 3), ('آرف', 1), ('دروس', 1), ('إعل', 2), ('ذائ', 1), ('ابتد', 5), ('اشوفه', 1), ('اخرب', 1), ('وامشى', 1), ('بروو', 6), ('كاء', 1), ('بصنعك', 1), ('حاجتين', 1), ('سعو', 8), ('دريـ', 1), ('تبشر', 1), ('تخص', 5), ('يــلعــبش', 1), ('لاتج', 1), ('اعرة', 1), ('جمه', 13), ('اوزة', 3), ('ملكش', 1), ('تربيهاا', 1), ('اطف', 8), ('تمتع', 3), ('حط', 1), ('سيصن', 1), ('غله', 4), ('لوبه', 2)]
freq of word في is 0
freq of word فى is 135
freq of word من is 0


# Compute Threshold

In [964]:
print('size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

size of training data: 2191
min document frequency: 2
max document frequency: 2147


# Selecting Features 

In [965]:
# remove features that have frequency below/above the threshold
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

8176 are kept out of 112650


In [966]:
# re.sub("[a-zA-Z]+", "","abv-")

# Sample of selected features 

In [967]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

100 sample of selected features:
['ممت', 'ريت', 'الاف', 'أتي', 'طــ', 'فهو', 'حبب', 'إبراهيم', 'قلوبنا', 'الشمال', 'اطف', 'لرجل', 'جرح', 'قهم', 'كادي', 'تسلم', 'لساع', 'للات', 'دام', 'داري', 'النف', 'آدم', 'جيني', 'واصا', 'لمق', 'أشر', 'لذاك', 'يحفظ', 'يصير', 'حيان', 'فاغ', 'خطاء', 'حمن', 'ومة', 'اغن', 'اسى', 'تتوقع', 'اصبح', 'تفرح', 'نيين', 'اكت', 'موقف', 'شمال', 'شلت', 'عدهم', 'خصص', 'عبي', 'واصل', 'شتغ', 'مدحت', 'ستق', 'بقو', 'هادي', 'البرد', 'سبه', 'تمنى', 'لأمل', 'عجر', 'ورت', 'حساب', 'هفض', 'بحو', 'محتا', 'الاثنين', 'روع', 'نما', 'ندنا', 'اسا', 'الكي', 'فيهم', 'فإنه', 'جوو', 'قبض', 'نفسه', 'تنف', 'لمسا', 'رمك', 'لحدي', 'نجوم', 'لموه', 'شاه', 'كلن', 'يغا', 'يجعل', 'لان', 'مميز', 'ده', 'عيم', 'نهاية', 'طاب', 'لبه', 'الدا', 'حلي', 'الماتشات', 'المولد', 'رادي', 'وحش', 'يشي', 'بطلت', 'تظهر']


# generating features for training documents ...

In [968]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

# training ...

In [969]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done')

training is done


# Most informative features 

In [970]:
classifier.show_most_informative_features(40)

Most Informative Features
                has(لسي) = True              neu : pos    =     21.5 : 1.0
               has(السي) = True              neu : pos    =     20.6 : 1.0
                has(لفي) = True              neu : pos    =     15.8 : 1.0
               has(الفي) = True              neu : pos    =     15.8 : 1.0
                has(واق) = True              neg : pos    =     13.9 : 1.0
                has(لوط) = True              neu : pos    =     11.8 : 1.0
               has(لوطن) = True              neu : pos    =     11.8 : 1.0
               has(الوط) = True              neu : pos    =     11.8 : 1.0
                has(فاي) = True              neg : pos    =     11.7 : 1.0
                has(وطن) = True              neu : neg    =     11.6 : 1.0
               has(اهين) = True              pos : neg    =     11.6 : 1.0
                 has(رب) = True              pos : neg    =     11.6 : 1.0
              has(الوطن) = True              neu : pos    =     11.2 : 1.0

# generating features for test documents ...

In [971]:
test_features = [(document_features(d, my_features), c) for (d, c) in train_data]

# classify test instances 

In [973]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Results 

In [974]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('neu precision: ', precision(ref_sets['neu'], test_sets['neu']))
print('neu recall:', recall(ref_sets['neu'], test_sets['neu']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))
print('neutral f-score:', f_measure(ref_sets['neu'], test_sets['neu']))

accuracy:  0.8452761296211776
pos precision:  0.7933467741935484
pos recall: 0.9459134615384616
neg precision:  0.8697318007662835
neg recall: 0.8730769230769231
neu precision:  0.9230769230769231
neu recall: 0.6632124352331606
positive f-score: 0.8629385964912281
negative f-score: 0.871401151631478
neutral f-score: 0.7718592964824121


In [975]:
accuracy:  0.7659671532846716
pos precision:  0.7375381485249237
pos recall: 0.8682634730538922
neg precision:  0.7790849673202614
neg recall: 0.7650834403080873
neu precision:  0.8063063063063063
neu recall: 0.6193771626297578
positive f-score: 0.7975797579757976
negative f-score: 0.772020725388601
neutral f-score: 0.700587084148728

SyntaxError: ignored