<a href="https://colab.research.google.com/github/MohamedKKhalaf/NLP/blob/main/Selected_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections

import re
from itertools import islice

def load_tsv(data_file, n):
    data_features = list()
    data = list()
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams



def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

In [None]:
pos_train_file = 'train_Arabic_tweets_positive_20190413.tsv'
neg_train_file = 'train_Arabic_tweets_negative_20190413.tsv'

pos_test_file = 'test_Arabic_tweets_positive_20190413.tsv'
neg_test_file = 'test_Arabic_tweets_negative_20190413.tsv'
print('data files')
print('train file (pos)', pos_train_file)
print('train file (neg)', neg_train_file)
print('test file (pos)', pos_test_file)
print('test file (neg)', neg_test_file)

data files
train file (pos) train_Arabic_tweets_positive_20190413.tsv
train file (neg) train_Arabic_tweets_negative_20190413.tsv
test file (pos) test_Arabic_tweets_positive_20190413.tsv
test file (neg) test_Arabic_tweets_negative_20190413.tsv


In [None]:
n = 1
print('loading train data ....')
pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
print('loading test data ....')
pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

loading train data ....
loading test data ....


# Training data information

In [None]:
print('train data info')
train_data = pos_train_data + neg_train_data
print('train data size', len(train_data))
print('# of positive', len(pos_train_data))
print('# of negative', len(neg_train_data))

train data info
train data size 47000
# of positive 23879
# of negative 23121


# Sample training data

In [None]:
import random
sample_size = 100
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

100 random tweets .... 
(['اشراقة', '/', 'لن', 'تغلبنا', 'الدنيا', 'ونحن', 'نملك', 'قلوبا', 'نعاهد', 'ﷲ', 'صباحا', 'ومساء', 'بقولنا', ':{', 'إياك', 'نعبد', 'وإياك', 'نستعين', '}', 'طبتم', 'وطاب', 'يومكم', '🌹'], 'pos')
(['ذلحين', 'نفسيتي', 'محتاجه', 'ترويقه', 'يلا', 'شوف', 'لي', 'حل', 'اروق', '😢'], 'neg')
(['ٰ', 'مشش', 'معقول', '😨'], 'neg')
(['لا', 'نحتاجك', 'ي', 'سطحي', 'ي', 'مقرف', '😷'], 'neg')
(['\u200cسر', 'المشاهير', 'بوزن', 'مثالي', 'معنا', 'رشاقتي', '🤗هي', 'البدايه', 'الصحيه', 'لجسم', 'سليم', 'وقوام', 'رش☺يق', 'الدايت', 'الاول', 'حمية', 'كلين…'], 'neg')
(['الله', 'يرحم', 'ميو', 'بن', 'ميو', 'الشرازي', '💔'], 'neg')
(['وكيل', 'الشيطان', '#قطر', 'لماذا', '#الدوحه', 'و', 'في', 'عهد', '#تنظيم_الحمدين', 'دعمها', 'للمعاض', 'السعودي', 'الاماراتي', 'الموريتاني', 'الليبي', 'الصومالي', 'ا…'], 'neg')
(['ظهور', 'راقصه', 'عاريه', 'تماما', 'في', 'احدى', 'فيديوهات', 'الفرق', 'الكوريه', '😱', 'شاهد', 'قبل', 'الحذف', '!!🔥🔥'], 'neg')
(['الله', 'يابن', 'زنان', 'المفروض', 'انت', 'اخر', 'واحد', 'تتكلم'

In [None]:
print('test data info')
test_data = pos_test_data + neg_test_data
print('test data size', len(train_data))
print('positive:', len(pos_test_data))
print('negative:', len(neg_test_data))

test data info
test data size 47000
positive: 5970
negative: 5781


In [None]:
#merging features
all_features = pos_train_feat + neg_train_feat + \
               pos_test_feat + pos_test_feat
print('length of features:', len(all_features))

length of features: 770508


In [None]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

100 sample features ...
['تتفرج', 'وانته', ':(', 'و', 'فالدنيا', '#حرم_السفير', 'ولابق…', 'يا', 'المفرطة', 'الاخطاء', 'هي', 'ايامها', 'لاعب', 'اشوف', 'حراام', '😂', 'الغيره', 'ال', 'الزاخرة', 'يشجعون', 'البملا', 'تونا', 'القلب', 'كان', 'على', 'السنين', 'جيفآرا', 'أحبها', 'الله', 'الجو', 'سحب', 'ستيشن', 'وهو', 'مقاومه', 'الخير', 'أجلك', '🌹', 'قو', 'ما', '.', '🌷', 'اليومي', 'تقريبا', 'هبوط', 'هاي', 'لك', 'من', 'والتي', 'هذا', 'تسكت', 'ويمجدوا', 'تثبت', 'التاق', '#ڪبريآء_آميرﮩ', 'في', 'قوتنا', 'يشتغل', 'صيااح', 'ما', '.', 'يارب', '.!', 'اهديت', 'إذا', 'سأكمل', '#مسابقه', 'خطيبة', 'قسم', 'يومهم', '👍🏻', 'عديل', '💙', 'على', 'الفشل', 'لكل', 'محمد', '..', '#النصر_الاتحاد', 'بعطائك..', '..', 'بنت', 'قلب', '..', 'من', 'نطلع', 'جبت', 'كيف', 'مساح', 'مرتين', '👌', 'درجه', 'من', '.', 'ما', 'رقصتنا"', 'العطرة', 'فيه', 'تاثير', 'لاتسأل', 'لما']


In [None]:
#frequencies

all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

In [None]:
#sample frequencies
print('sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

sample frequencies
[('وهلت', 7), ('تل…', 1), ('وانشقاقات', 10), ('ترقصص', 1), ('عطها', 1), ('يلازمنا', 1), ('أدبا', 3), ('ابطني', 1), ('❤ك', 1), ('السنقل', 1), ('تبوق', 1), ('فاهمتك', 1), ('ارسله', 2), ('السعيده', 7), ('عرفتني', 1), ('مربع', 2), ('حرمان…', 4), ('يالهوي', 1), ('يحرقون', 1), ('الجيش', 23), ('القوى', 4), ('الرتوته', 2), ('تمام؟', 2), ('طرد', 107), ('الإعلامية،النادي', 8), ('العلم؛…', 2), ('بدلا…', 4), ('اعطتك', 1), ('Star', 1), ('❄', 5)]
freq of word في is 9550
freq of word فى is 220
freq of word من is 12655


In [None]:
#threshold
print('size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

size of training data: 47000
min document frequency: 47
max document frequency: 46060


In [None]:
# remove features that have frequency below/above the threshold
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

1961 are kept out of 770508


In [None]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

100 sample of selected features:
['نبي', 'الجميلة', 'يلا', 'لي', 'حتي', 'كذا', 'الثاني', 'يصلون', 'لله', 'ونحن', 'الجمعة', 'دع', 'أجل', 'ياللي', 'الا', 'ع', 'وغايات', 'لابد', 'الأغاني', 'بنشكر', '✴️', 'دول', 'زرع', 'ليه', 'يخيب', 'البشير', '🍯', 'غدا', 'ينام', 'الجديد', 'فترة', 'لاحول', 'تماما', 'بعض', 'ضمن', 'بما', 'بل', 'سألوا', 'اصبحنا', 'لتغريداتك؟!', 'توجيه', 'مندس', 'فيك', 'النفس', 'لقاء', 'الدولية', 'يستاهل', 'مباراة', '👍🏻', 'اكتب', 'وشافت', 'ويوم', 'الي', 'فيها', 'لم', 'خلف', 'الجمال', 'فلا', 'برج', 'بها', 'اتوقع', 'تجاوزنا', '😫', 'نسينا', '↴', 'مالي', 'ﻭﻓﻲ', 'صلوا', 'ماوحشتك', 'ماحنا', 'لمن', 'واللي', ':', 'اشوف', 'بدري', 'عشق', 'عنك', 'تلفزيونية', 'الجاهل', 'ﷺ', 'كن', 'سيدنا', 'ذكر', '♪', 'القدر', 'لحدي', 'المرأة', 'العروس', 'مره', '↓', '🥀', 'تستاهل', 'الأهلي', '||', 'الحق', 'كانوا', 'زعماء', 'تجد', 'يقدر', 'اجل']


# generating features for training documents ...

In [None]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

# training ...

In [None]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done')

training is done


# Most informative features

In [None]:
classifier.show_most_informative_features(40)

Most Informative Features
               has(موثق) = True              pos : neg    =    238.5 : 1.0
                  has(😭) = True              neg : pos    =    202.0 : 1.0
                  has(😢) = True              neg : pos    =    171.3 : 1.0
            has(المسيار) = True              pos : neg    =    170.1 : 1.0
              has(وصلوا) = True              pos : neg    =    166.9 : 1.0
                  has(😳) = True              neg : pos    =    164.2 : 1.0
             has(الشروط) = True              pos : neg    =    151.4 : 1.0
              has(وتابع) = True              pos : neg    =    143.9 : 1.0
               has(ببكي) = True              neg : pos    =    143.6 : 1.0
                  has(🥀) = True              neg : pos    =    132.4 : 1.0
              has(السحب) = True              pos : neg    =    118.4 : 1.0
                  has(💐) = True              pos : neg    =    116.5 : 1.0
             has(العروس) = True              neg : pos    =    113.3 : 1.0

# generating features for test documents ...

In [None]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

# classify test instances

In [None]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Results

In [None]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))

accuracy:  0.8913283975831844
pos precision:  0.9198425478618716
pos recall: 0.8611390284757119
neg precision:  0.8654657578708211
neg recall: 0.9225047569624633
positive f-score: 0.8895233151656718
negative f-score: 0.8930754416813196
