# Arabic Sentiment Analysis in tweets using Naive Bayes Machine learning Algorithm and unigram features

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections
from builtins import chr
from nltk import word_tokenize, ngrams


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

# define functions 

In [None]:
import re
from itertools import islice
from nltk.corpus import stopwords
nltk.download('stopwords')



def arabicrange():
        """return a list of arabic characteres .
        Return a list of characteres between \u060c to \u0652
        @return: list of arabic characteres.
        @rtype: unicode;
        """
        mylist = [];
        for i in range(0x0620, 0x00650):
            try :
                mylist.append(chr(i));
            except ValueError:
                pass;
        return mylist[:-5]+[' '];

arb = arabicrange()
stopwords_list = stopwords.words('arabic')

def clean_raw_review(body):
        # return ''.join(c for c in body if c in arb)
        return ''.join(c if c in arb else ' ' for c in body)

         # patterns to remove first
        pat = [\
            (u'http[s]?://[a-zA-Z0-9_\-./~\?=%&]+', u''),  # remove links
            (u'www[a-zA-Z0-9_\-?=%&/.~]+', u''),
#            u'\n+': u' ',                     # remove newlines
            (u'<br />', u' '),  # remove html line breaks
            (u'</?[^>]+>', u' '),  # remove html markup
#            u'http': u'',
            (u'[a-zA-Z]+\.org', u''),
            (u'[a-zA-Z]+\.com', u''),
            (u'://', u''),
            (u'&[^;]+;', u' '),
            (u':D', u':)'),
#            (u'[0-9/]+', u''),
#            u'[a-zA-Z.]+': u'',
#            u'[^0-9' + u''.join(self.arabicrange()) + \
#                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
#                u'\s^><\-_\u201D\u00AB=\u2026]+': u'',          # remove latin characters
            (u'\s+', u' '),  # remove spaces
            (u'\.+', u'.'),  # multiple dots
            (u'[\u201C\u201D]', u'"'),  # “
            (u'[\u2665\u2764]', u''),  # heart symbol
            (u'[\u00BB\u00AB]', u'"'),
            (u'\u2013', u'-'),  # dash
        ]

        # patterns that disqualify a review
        remove_if_there = [\
            (u'[^0-9' + u''.join(arabicrange()) + \
                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
                u'\s\^><\-_\u201D\u00AB=\u2026+|' + \
                u'\u0660-\u066D\u201C\u201D' + \
                u'\ufefb\ufef7\ufef5\ufef9]+', u''),  # non arabic characters
        ]

        # patterns that disqualify if empty after removing
        remove_if_empty_after = [\
            (u'[0-9a-zA-Z\-_]', u' '),  # alpha-numeric
            (u'[0-9' + u".,!;:$%&*%'#(){}~`\[\]/\\\\\"" + \
                u'\s\^><`\-=_+]+', u''),  # remove just punctuation
            (u'\s+', u' '),  # remove spaces
        ]

        # remove again
        # patterns to remove
        pat2 = [\
#            u'[^0-9' + u''.join(self.arabicrange()) + \
#                u"!.,;:$%&*%'#(){}~`\[\]/\\\\\"" + \
#                u'\s^><\-_\u201D\u00AB=\u2026]+': u'',          # remove latin characters
        ]

        skip = False

        # if empty body, skip
        if body == u'': skip = True

        # do some subsitutions
        for k, v in pat:
            body = re.sub(k, v, body)

        # remove if exist
        for k, v in remove_if_there:
            if re.search(k, body):
                skip = True

        # remove if empty after replacing
        for k, v in remove_if_empty_after:
            temp = re.sub(k, v, body)
            if temp == u" " or temp == u"":
                skip = True

        # do some more subsitutions
        if not skip:
            for k, v in pat2:
                body = re.sub(k, v, body)

        # if empty string, skip
        if body == u'' or body == u' ':
            skip = True

        if not skip:
            return body
        else:
            return u""

def load_tsv(data_file, n):
    data_features = list()
    data = list()
    # infile = open(data_file, encoding='utf-8')

    infile = data_file.to_numpy()
    for line in infile:
        # if not line.strip():
        #     continue
        
        # text,label = line.split(',')
        text,label = line
        text = clean_raw_review(text)
        text_features = process_text(text, n)
        if text_features:
            text_features = [s for s in text_features if s and len(s)>1 and s not in stopwords_list]
            if not text_features:
              continue
            print(text_features)
            text_features = text_features +\
            rec_char_ngram(text_features,2)+\
            rec_char_ngram(text_features,3)+\
            rec_char_ngram(text_features,4)+\
            rec_char_ngram(text_features,5)+\

            data_features += text_features
            data.append((text_features, label))
        
    return data, data_features

def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=True,
                 ):
    clean_text = text
    # if remove_vowel_marks:
    #     clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [' '.join(g) for g in list(window(tokens, i))]

        return grams

def char_ngram(s, n):
  if len(s)>n:
    return [s[i:i+n] for i in range(len(s)-n+1)]
  else:
    return s

def rec_char_ngram(ls,n):
  ls = [i for i in ls if len(i)>n]
  return [j for jj in  [char_ngram(i,n) for i in ls] for j in jj]
    



def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

SyntaxError: ignored

# Load corpus

In [None]:
for i in [1,2,4]:
  if True:
    continue
  print(i)

In [None]:
l = ['assds', 'asasdaads', 'asdaasd','jj']

In [None]:
rec_char_ngram(l,3)

In [None]:
char_ngram('as', 4)

In [None]:
''.join(('s','t'))

In [None]:
char_ngram("asaddddfgg",4)

In [None]:
char_ngram('student',3)

In [None]:
df = pd.read_csv('/content/CY_train.csv')
clean_raw_review(df.iloc[6,0])

In [None]:
# pos_train_file = '../input/train_Arabic_tweets_positive_20190413.tsv'
# neg_train_file = '../input/train_Arabic_tweets_negative_20190413.tsv'

# pos_test_file = '../input/test_Arabic_tweets_positive_20190413.tsv'
# neg_test_file = '../input/test_Arabic_tweets_negative_20190413.tsv'
# print('data files')
# print('train file (pos)', pos_train_file)
# print('train file (neg)', neg_train_file)
# print('test file (pos)', pos_test_file)
# print('test file (neg)', neg_test_file)

# Parameters (ngrams)

In [None]:
print('parameters')
n = 1
print('n grams:', n)

# loading train data .... 

In [None]:
# print('loading train data ....')
# pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
# neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
# print('loading test data ....')
# pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
# neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

# Training data information

In [None]:
# print('train data info')
# train_data = pos_train_data + neg_train_data
# print('train data size', len(train_data))
# print('# of positive', len(pos_train_data))
# print('# of negative', len(neg_train_data))

In [None]:
# train_data,feat = load_tsv('/content/CY_train.csv', n)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(df,stratify=df['class'],test_size=0.2)

In [None]:
train['class'].value_counts()

In [None]:
test['class'].value_counts()

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
m = MultinomialNB()

In [None]:
train_data, train_feat = load_tsv(train,n)
test_data, test_feat = load_tsv(test,n)

In [None]:
test_data

[(['الافضل',
   'ان',
   'تعيش',
   'حياتك',
   'وتدعهم',
   'يعيشوا',
   'حياتهم',
   'وعلي',
   'قدر',
   'اقترابهم',
   'منك',
   'اقترب',
   'انت',
   'واجعل',
   'لمشاعرك',
   'وزنا',
   'تسريب',
   'الشرق',
   'لمكتب',
   'السيسي',
   'الا',
   'لاف',
   'افض',
   'فضل',
   'تعي',
   'عيش',
   'حيا',
   'يات',
   'اتك',
   'وتد',
   'تدع',
   'دعه',
   'عهم',
   'يعي',
   'عيش',
   'يشو',
   'شوا',
   'حيا',
   'يات',
   'اته',
   'تهم',
   'وعل',
   'علي',
   'اقت',
   'قتر',
   'ترا',
   'راب',
   'ابه',
   'بهم',
   'اقت',
   'قتر',
   'ترب',
   'واج',
   'اجع',
   'جعل',
   'لمش',
   'مشا',
   'شاع',
   'اعر',
   'عرك',
   'وزن',
   'زنا',
   'تسر',
   'سري',
   'ريب',
   'الش',
   'لشر',
   'شرق',
   'لمك',
   'مكت',
   'كتب',
   'الس',
   'لسي',
   'سيس',
   'يسي',
   'الاف',
   'لافض',
   'افضل',
   'حيات',
   'ياتك',
   'وتدع',
   'تدعه',
   'دعهم',
   'يعيش',
   'عيشو',
   'يشوا',
   'حيات',
   'ياته',
   'اتهم',
   'اقتر',
   'قترا',
   'تراب',
   'رابه',
   'ابهم',
   

# Sample training data 

In [None]:
import random
sample_size = 100
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

100 random tweets .... 
(['الباقي', 'الزبالة', 'الب', 'لبا', 'باق', 'اقي', 'الز', 'لزب', 'زبا', 'بال', 'الة', 'البا', 'لباق', 'باقي', 'الزب', 'لزبا', 'زبال', 'بالة', 'ال', 'لب', 'با', 'اق', 'قي', 'ال', 'لز', 'زب', 'با', 'ال', 'لة'], 'neg')
(['الرد', 'كمان', 'مش', 'حنضحك', 'روحنا', 'اخوانا', 'العرب', 'بيجيوا', 'الصورة', 'دى', 'فاكرين', 'ان', 'مصر', 'بيت', 'دعارة', 'مفتوح', 'وان', 'ستاتها', 'مستباحة', 'الواقعية', 'الر', 'لرد', 'كما', 'مان', 'حنض', 'نضح', 'ضحك', 'روح', 'وحن', 'حنا', 'اخو', 'خوا', 'وان', 'انا', 'الع', 'لعر', 'عرب', 'بيج', 'يجي', 'جيو', 'يوا', 'الص', 'لصو', 'صور', 'ورة', 'فاك', 'اكر', 'كري', 'رين', 'دعا', 'عار', 'ارة', 'مفت', 'فتو', 'توح', 'ستا', 'تات', 'اته', 'تها', 'مست', 'ستب', 'تبا', 'باح', 'احة', 'الو', 'لوا', 'واق', 'اقع', 'قعي', 'عية', 'حنضح', 'نضحك', 'روحن', 'وحنا', 'اخوا', 'خوان', 'وانا', 'العر', 'لعرب', 'بيجي', 'يجيو', 'جيوا', 'الصو', 'لصور', 'صورة', 'فاكر', 'اكري', 'كرين', 'دعار', 'عارة', 'مفتو', 'فتوح', 'ستات', 'تاته', 'اتها', 'مستب', 'ستبا', 'تباح', 'باحة', 'ال

# Test data info

In [None]:
# print('test data info')
# test_data = 
# print('test data size', len(train_data))
# print('# of positive', len(pos_test_data))
# print('# of negative', len(neg_test_data))

# merging all features ...

In [None]:
print('merging all features ... ')
all_features = train_feat + test_feat
print('len(all_features):', len(all_features))

merging all features ... 
len(all_features): 195204


# Sample features 

In [None]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

100 sample features ...
['بصر', 'مس', 'بيد', 'لمفأ', 'ود', 'يبتى', 'سف', 'نفس', 'يين', 'وا', 'فسترتني', 'لا', 'لمز', 'هم', 'ني', 'انب', 'عامله', 'ادس', 'قبر', 'ال', 'للي', 'لبسا', 'يس', 'اص', 'فش', 'منى', 'باس', 'لت', 'اموا', 'يـه', 'اختب', 'ال', 'اع', 'حق', 'تلا', 'هيف', 'بتي', 'لوم', 'ال', 'هل', 'بض', 'احلى', 'لاخر', 'لشوق', 'عاف', 'وت', 'بن', 'كفتة', 'وهم', 'عا', 'الب', 'ان', 'مك', 'عتنا', 'وا', 'ضاال', 'لى', 'تعات', 'اك', 'قه', 'هالحلم', 'مريك', 'وال', 'عندها', 'اعم', 'الع', 'يز', 'ادا', 'لر', 'ان', 'دسس', 'وي', 'منز', 'لام', 'الت', 'لم', 'لان', 'بل', 'نا', 'البيت', 'لح', 'عرف', 'ال', 'ده', 'الش', 'حد', 'المن', 'تكون', 'سماس', 'قر', 'لن', 'شا', 'الله', 'فا', 'يت', 'ال', 'قـا', 'ومكم', 'له', 'ست']


# compute frequencies

In [None]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

In [None]:
all_features_count

{'الفاشل': 1,
 'تخرج': 2,
 'بنظام': 1,
 'التحسين': 1,
 'اختراعك': 1,
 'الف': 165,
 'لفا': 15,
 'فاش': 7,
 'اشل': 6,
 'تخر': 4,
 'خرج': 7,
 'بنظ': 1,
 'نظا': 7,
 'ظام': 6,
 'الت': 195,
 'لتح': 12,
 'تحس': 8,
 'حسي': 7,
 'سين': 24,
 'اخت': 21,
 'ختر': 3,
 'ترا': 30,
 'راع': 8,
 'اعك': 6,
 'الفا': 14,
 'لفاش': 2,
 'فاشل': 6,
 'بنظا': 1,
 'نظام': 5,
 'التح': 9,
 'لتحس': 1,
 'تحسي': 2,
 'حسين': 5,
 'اختر': 3,
 'خترا': 3,
 'تراع': 2,
 'راعك': 2,
 'ال': 5185,
 'لف': 207,
 'فا': 255,
 'اش': 146,
 'شل': 26,
 'تخ': 70,
 'خر': 104,
 'رج': 109,
 'بن': 310,
 'نظ': 31,
 'ظا': 32,
 'ام': 682,
 'لت': 320,
 'تح': 269,
 'حس': 136,
 'سي': 293,
 'ين': 913,
 'اخ': 163,
 'خت': 41,
 'تر': 255,
 'را': 458,
 'اع': 315,
 'عك': 26,
 'الحق': 28,
 'يتقال': 1,
 'مسلسل': 5,
 'فشيخ': 2,
 'بصراحة': 2,
 'الح': 275,
 'لحق': 29,
 'يتق': 5,
 'تقا': 17,
 'قال': 61,
 'مسل': 20,
 'سلس': 8,
 'لسل': 41,
 'فشي': 2,
 'شيخ': 11,
 'بصر': 7,
 'صرا': 18,
 'راح': 26,
 'احة': 41,
 'يتقا': 1,
 'تقال': 5,
 'مسلس': 7,
 'سلسل': 8,
 'بصرا'

# Sample Frequency

In [None]:
print('sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

sample frequencies
[('فيل', 14), ('إك', 2), ('انكم', 3), ('ودان', 2), ('صري', 56), ('ملاء', 2), ('ضفدع', 1), ('فرنس', 2), ('القهوه', 2), ('سانجيرمان', 1), ('اءه', 1), ('يوع', 1), ('كي', 128), ('أخبا', 7), ('والجنون', 1), ('تستغرب', 1), ('بأو', 1), ('تمشي', 1), ('اقصا', 1), ('راسو', 1), ('فودا', 2), ('بفستان', 1), ('وباكله', 1), ('مغير', 1), ('واغفرلي', 1), ('تفجر', 1), ('طبعاا', 1), ('غند', 2), ('وليش', 1), ('ويول', 1)]
freq of word في is 282
freq of word فى is 162
freq of word من is 330


# Compute Threshold

In [None]:
print('size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

size of training data: 2189
min document frequency: 2
max document frequency: 2145


# Selecting Features 

In [None]:
# remove features that have frequency below/above the threshold
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

8921 are kept out of 195204


In [None]:
# re.sub("[a-zA-Z]+", "","abv-")

# Sample of selected features 

In [None]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

100 sample of selected features:
['عيل', 'قتصا', 'ليوم', 'ترين', 'الأه', 'حو', 'موج', 'قرض', 'ريش', 'لفصل', 'رقص', 'لظا', 'عنو', 'الرسول', 'لين', 'بصوت', 'يسمع', 'مصدر', 'جبل', 'البت', 'تذر', 'عبو', 'الجن', 'ردن', 'عاهم', 'الظالمين', 'تمني', 'أنها', 'امت', 'ختصا', 'كلوز', 'طاهر', 'نعيش', 'فطي', 'قتهم', 'نــي', 'ساسا', 'حبب', 'تقر', 'اضع', 'يلنت', 'سلوب', 'اراة', 'عذ', 'التواصل', 'شتاق', 'رضا', 'بأس', 'إنتي', 'عوو', 'لخلي', 'عده', 'لبح', 'ركب', 'تث', 'مآ', 'افات', 'تكت', 'لفتر', 'هوى', 'متني', 'دفة', 'لسن', 'قبر', 'رحة', 'كيل', 'كتو', 'تلها', 'مصد', 'أخي', 'نمو', 'مله', 'مسلس', 'إق', 'بنحبك', 'الري', 'بيك', 'سابا', 'رجول', 'ابه', 'مغني', 'الإق', 'شق', 'فري', 'عار', 'فظة', 'بقر', 'يره', 'جأ', 'لأهل', 'بتز', 'يكف', 'لى', 'أك', 'حلقة', 'قصة', 'تكن', 'لبدء', 'سهل', 'روحك']


# generating features for training documents ...

In [None]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

# training ...

In [None]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done')

# Most informative features 

In [None]:
classifier.show_most_informative_features(40)

# generating features for test documents ...

In [None]:
test_features = [(document_features(d, my_features), c) for (d, c) in train_data]

In [None]:
print(test_features)

# classify test instances 

In [None]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Results 

In [None]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('neu precision: ', precision(ref_sets['neu'], test_sets['neu']))
print('neu recall:', recall(ref_sets['neu'], test_sets['neu']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))
print('neutral f-score:', f_measure(ref_sets['neu'], test_sets['neu']))

In [None]:
accuracy:  0.7659671532846716
pos precision:  0.7375381485249237
pos recall: 0.8682634730538922
neg precision:  0.7790849673202614
neg recall: 0.7650834403080873
neu precision:  0.8063063063063063
neu recall: 0.6193771626297578
positive f-score: 0.7975797579757976
negative f-score: 0.772020725388601
neutral f-score: 0.700587084148728