# Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

# Use a larger default font size for every plot drawn in this notebook
plt.rcParams.update({'font.size': 14})

In [None]:
# Display NumPy floating-point values with 3 digits of precision
np.set_printoptions(precision=3)

*****

## Read corpus
We are going to use a different corpus. This corpus is already labelled.

In [None]:
# Labelled corpus files, in this order: ARA positive, ARA negative, TUN positive, TUN negative.
# The data folder sits next to the notebook's folder, hence '../data/'
# (the previous '..data/' prefix pointed to a folder literally named "..data").
corpus_files = [
    '../data/sentiment_data_ARA_pos.txt',
    '../data/sentiment_data_ARA_neg.txt',
    '../data/sentiment_data_TUN_pos.txt',
    '../data/sentiment_data_TUN_neg.txt',
]

In [None]:
def read_text_file(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept as empty strings. The file name and the
    number of lines read are printed as progress information.
    """
    print('Reading file ' + filename + "...")
    with open(filename, "r", encoding='utf8') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]
        print('File contains ', len(stripped_lines), "lines.\n")
    return stripped_lines

In [None]:
# Load the four corpora, in the order in which they are listed in corpus_files
ara_corpus_pos, ara_corpus_neg, tun_corpus_pos, tun_corpus_neg = [
    read_text_file(corpus_files[position]) for position in range(4)
]

In [None]:
## Verify corpus

# Type of each of the four corpus objects
tuple(type(corpus) for corpus in (ara_corpus_pos, ara_corpus_neg, tun_corpus_pos, tun_corpus_neg))

In [None]:
# Number of documents in each of the four corpora
tuple(map(len, (ara_corpus_pos, ara_corpus_neg, tun_corpus_pos, tun_corpus_neg)))

In [None]:
# Show the first document of each corpus
for corpus in (ara_corpus_pos, ara_corpus_neg, tun_corpus_pos, tun_corpus_neg):
    print(corpus[0])

In [None]:
## Combine pos and neg corpus into a single corpus for easy manipulation

# Positive documents come first, then negative ones; the label lists follow the same order.
ara_corpus = [*ara_corpus_pos, *ara_corpus_neg]
ara_corpus_sentiment = ["POS"] * len(ara_corpus_pos) + ["NEG"] * len(ara_corpus_neg)
tun_corpus = [*tun_corpus_pos, *tun_corpus_neg]
tun_corpus_sentiment = ["POS"] * len(tun_corpus_pos) + ["NEG"] * len(tun_corpus_neg)

In [None]:
# Each corpus must have exactly one sentiment label per document
tuple(map(len, (ara_corpus, ara_corpus_sentiment, tun_corpus, tun_corpus_sentiment)))

***

## Text Preprocessing & Cleaning

We are going to follow the same pipeline as for language classification, except that here the operations should be **adapted to the Arabic language** (instead of the French language).

1. Remove useless characters (using ``cleanup_text`` function from TD2)
2. Language identification and filtering (using language identification model from TD2)
3. Letter normalization
4. Tokenization
5. Remove stop words
6. Word normalization (stemming)
7. Remove words that are too short or too long.

In [None]:
##1. Remove useless characters using cleanup_text function
import re
import html

# Regexps for word elongation: the same word character repeated
# 2 or more times (two_plus) / 3 or more times (three_plus) in a row.
two_plus_letters_RE = re.compile(r"(\w)\1{1,}")
three_plus_letters_RE = re.compile(r"(\w)\1{2,}")
# Regexp for a whole word repeated consecutively. Group 1 is the word itself.
# The word boundaries prevent matching inside a longer word, and the pattern
# does not need trailing whitespace, so a repetition at the end of the text
# is matched too.
two_plus_words_RE = re.compile(r"\b(\w+)(?:\s+\1\b)+")


def cleanup_text(text):
    """Return a cleaned copy of one document.

    Steps, in order:
      1. decode HTML entities (e.g. ``&#39;`` or ``&amp;``);
      2. remove the ``_x000D_`` artefact;
      3. remove URLs and ``@username`` mentions;
      4. remove Western and Arabic-Indic digits;
      5. replace every non-word character and underscore with a space, then
         collapse runs of whitespace;
      6. shorten letter elongations to two letters and collapse consecutively
         repeated words into one.

    Steps 1 and 2 must run before the digits are removed: both rely on digits
    (``&#1575;``, ``_x000D_``) that step 4 would otherwise destroy.

    Parameters:
        text: the raw document (str).

    Returns:
        The cleaned document (str); it may be empty.
    """
    # Replace html-encoded characters with the character they stand for, for example: &#39; ==> '
    text = html.unescape(text)

    # Remove special useless characters such as _x000D_
    text = re.sub(r'_[xX]000[dD]_', '', text)

    # Remove URLs
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)

    # Remove user mentions of the form @username
    text = re.sub(r'@[^\s]+', '', text)

    # Remove numbers (Western and Arabic-Indic digits)
    text = re.sub(r'[0-9٠-٩]', '', text)

    # Replace all non-word characters (such as emoticons, punctuation, end of line characters, etc.) with a space
    text = re.sub(r'[\W_]', ' ', text)

    # Remove redundant white spaces
    text = text.strip()
    text = re.sub(r'[\s]+', ' ', text)

    # Normalize word elongations (a character repeated more than twice is cut down to two)
    text = three_plus_letters_RE.sub(r"\1\1", text)

    # Remove repeated words
    text = two_plus_words_RE.sub(r"\1", text)

    return text

def cleanup(text):
    """Filter a list of documents and return the ones worth keeping.

    A document is kept when it has at least 10 characters and when fewer than
    80% of its characters are latin letters (a-z, A-Z). The order of the
    remaining documents is unchanged.

    Parameters:
        text: list of documents (strings).

    Returns:
        A new list containing only the documents that pass both checks.
    """
    latin_letter = re.compile('[a-zA-Z]')

    def is_usable(document):
        # Very short documents are dropped
        if len(document) < 10:
            return False
        # Documents that are mostly written in latin characters are dropped
        latin_fraction = len(latin_letter.findall(document)) / len(document)
        return latin_fraction < 0.8

    return [document for document in text if is_usable(document)]

In [None]:
# Apply cleanup_text to each document, then drop unusable documents with cleanup().
#
# cleanup() removes documents, so the sentiment labels must be rebuilt to match
# what is left; otherwise labels and documents no longer line up in the later
# steps. The positive and negative halves are therefore cleaned separately
# (each corpus is "positive documents followed by negative documents") and the
# label lists are regenerated from the sizes of the cleaned halves.
ara_pos_clean = cleanup([cleanup_text(doc) for doc in ara_corpus_pos])
ara_neg_clean = cleanup([cleanup_text(doc) for doc in ara_corpus_neg])
tun_pos_clean = cleanup([cleanup_text(doc) for doc in tun_corpus_pos])
tun_neg_clean = cleanup([cleanup_text(doc) for doc in tun_corpus_neg])

ara_corpus_clean = ara_pos_clean + ara_neg_clean
ara_corpus_sentiment = ["POS"] * len(ara_pos_clean) + ["NEG"] * len(ara_neg_clean)
tun_corpus_clean = tun_pos_clean + tun_neg_clean
tun_corpus_sentiment = ["POS"] * len(tun_pos_clean) + ["NEG"] * len(tun_neg_clean)
print(len(tun_corpus_clean))


In [None]:
##2. Language identification and filtering

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


# #import the model
# loaded_model = pickle.load(open('C:\\Users\\21620\\TextMiningProject\\Language detection\\lang_model.sav', 'rb'))

# #import the bow model
# bow_model = pickle.load(open('C:\\Users\\21620\\TextMiningProject\\Language detection\\bow_model.sav', 'rb'))

#classification function
# def predict_lang(text):
#     inst=[]
#     inst.append(text)
#     text_vect=bow_model.transform(inst)
#     prob=loaded_model.predict_proba(text_vect)[0]
#     x=prob[0]
#     #print(x,prob[1])
#     if x <0.7 and x>0.3:
#         return("OTHER")
#     else:
#         return loaded_model.predict(text_vect)[0]
    
#test
#print(predict_lang("مجموعة من الأغاني التي قدمها طارق العربي طرقان و أبناؤه محمد العربي و ديمة و تالة في برنامج صاحبة السعادة"))
# Language filtering is currently disabled (predict_lang is commented out above),
# so the "filtered" corpora are simply the cleaned corpora (same list objects, no copy).
ara_corpus_filtered, tun_corpus_filtered = ara_corpus_clean, tun_corpus_clean
# Original filtering loops, kept for reference (both lists must start empty to use them):
#for doc in ara_corpus_clean:
#    if (predict_lang(doc)=="ARA"):
#        ara_corpus_filtered.append(doc)
#for doc in tun_corpus_clean:
#    if (predict_lang(doc)=="TUN"):
#        tun_corpus_filtered.append(doc)
print(len(tun_corpus_filtered), len(ara_corpus_filtered))

In [None]:
# Show the first document of each filtered corpus
for filtered_corpus in (ara_corpus_filtered, tun_corpus_filtered):
    print(filtered_corpus[0])

In [None]:
##3. Letter normalization
# Hint: which Arabic letters are equivalent in social media text?  e.g. alef, tah marbuta, dhad and dhad toushel, etc.

def normalize(text):
    """Letter-normalize every document of a list.

    Variant letters are mapped to one canonical letter, and short vowels
    (diacritics) as well as the tatweel/kashida stretching character are
    removed. All other characters are left untouched.

    Parameters:
        text: list of documents (strings).

    Returns:
        A new list with one normalized document per input document.
    """
    # (variants, canonical form): every character of `variants` becomes `canonical`
    letter_groups = (
        ('ة', 'ت'),
        ('ظ', 'ض'),
        ('ﺀٱءآإأ', 'ا'),   # hamza and alif variants
        ('ﻵﻹﻷ', 'لا'),     # lam-alif ligatures
        ('ﺒپ', 'ب'),
        ('کگ', 'ك'),
        ('چ', 'ج'),
        ('ڤ', 'ف'),
        ('ڼ', 'ن'),
        ('ۋ', 'و'),
        ('ھ', 'ه'),
        ('ژ', 'ر'),
        ('ڜ', 'ش'),
        ('ى', 'ي'),
        ('ئ', 'ا'),
        ('ؤ', 'ا'),
    )
    translation = {
        ord(variant): canonical
        for variants, canonical in letter_groups
        for variant in variants
    }

    # Characters that are deleted: short vowels and other symbols
    noise_marks = (
        '\u0651'   # Tashdid
        '\u064e'   # Fatha
        '\u064b'   # Tanwin Fath
        '\u064f'   # Damma
        '\u064c'   # Tanwin Damm
        '\u0650'   # Kasra
        '\u064d'   # Tanwin Kasr
        '\u0652'   # Sukun
        '\u0640'   # Tatwil/Kashida
    )
    for mark in noise_marks:
        translation[ord(mark)] = None

    # No replacement produces a character that is itself replaced, so one
    # translation pass gives the same result as successive substitutions.
    return [doc.translate(translation) for doc in text]

In [None]:
# Letter-normalize both corpora (normalize returns one document per input document)
ara_corpus_filtered, tun_corpus_filtered = map(normalize, (ara_corpus_filtered, tun_corpus_filtered))
print(len(tun_corpus_filtered))

In [None]:
##4. Tokenization (same as in TD1)

from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of word characters, underscore excluded.
# The pattern is a raw string: in a normal string literal '\W' is an invalid
# escape sequence, which recent Python versions report with a warning.
tokenizer = RegexpTokenizer(r'[^_\W]+')
ara_corpus_tokenized = [tokenizer.tokenize(doc) for doc in ara_corpus_filtered]
tun_corpus_tokenized = [tokenizer.tokenize(doc) for doc in tun_corpus_filtered]
print(ara_corpus_tokenized[:5])
print(tun_corpus_tokenized[:5])
print(len(tun_corpus_tokenized))

In [None]:
# verify first document in corpus
for tokenized_corpus in (ara_corpus_tokenized, tun_corpus_tokenized):
    print(tokenized_corpus[0])
print(len(tun_corpus_tokenized))

In [None]:
##5. Remove stop words -- based on a 'standard' list of stopwords for the Arabic language.

# Load stop words from NLTK library
from nltk.corpus import stopwords
stop_words_ar = stopwords.words('arabic')
stop_words_ar = stop_words_ar + ['كان','أصبح','أضحى','أمسى','بات','ظلَّ','صار','من','إلى','حتى','خلا','عدا','في',
                               'عن','على','مذ','منذ','ومتى','متى','أنت','أنتما','أنتم','أنتن','إياك','إياكما','إياكن','أنا','نحن','هو','هي','هن','هما']

# FEEL FREE TO ADD MORE WORDS TO THIS LIST IF YOU WANT ...

# The corpus tokens were letter-normalized in step 3, so the stop words must go
# through the same normalization; otherwise forms written with hamza,
# diacritics, etc. would never match a token. Duplicates created by the
# normalization are dropped while keeping the original order.
stop_words_ar = list(dict.fromkeys(normalize(stop_words_ar)))
print(type(stop_words_ar), len(stop_words_ar))
print(stop_words_ar[0:10])

# A set gives constant-time membership tests in the loops below
stop_words_set = set(stop_words_ar)

# For each document, remove stop words
ara_corpus_tokenized = [[word for word in doc if word not in stop_words_set] for doc in ara_corpus_tokenized]
tun_corpus_tokenized = [[word for word in doc if word not in stop_words_set] for doc in tun_corpus_tokenized]
len(tun_corpus_tokenized)



In [None]:
##6. Stemming
# Hints: stemming is a difficult task for the Arabic language because words are often combined into one word (called agglutination).
#     You should first visually inspect all the words in your corpus to get an idea about which words are good candidates for stemming ...
#     Then try to think of a few simple stemming heuristics (regular expressions), such as: remove certain prefixes (e.g. al), remove certain suffixes (e.g. 'ouna') ...
from snowballstemmer import stemmer

ar_stemmer = stemmer("arabic")

# Build one NEW list of stems per document. (The previous loop reset its
# per-document list right before storing it, so every stemmed document ended
# up empty.) The same Arabic stemmer is used for both corpora.
ara_corpus_stemmed = [[ar_stemmer.stemWord(word) for word in doc] for doc in ara_corpus_tokenized]
tun_corpus_stemmed = [[ar_stemmer.stemWord(word) for word in doc] for doc in tun_corpus_tokenized]
print(len(ara_corpus_stemmed), len(tun_corpus_stemmed))

****

#### Prepare the corpus for BOW

In [None]:
# The BOW method in scikit-learn expects each document as a single string,
# so the words of every cleaned document are joined back with spaces.
ara_corpus_bow = list(map(' '.join, ara_corpus_stemmed))
tun_corpus_bow = list(map(' '.join, tun_corpus_stemmed))

#### Build the vocabulary set
Extract the vocabulary set from our corpus and calculate IDF values of each word in this set.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Configuration parameters of the BOW model
# FEEL FREE TO MODIFY THESE PARAMETERS AS NEEDED ...
# presumably the maximum number of words kept in the vocabulary -- confirm where it is used
max_words = 10000
# passed as max_df to TfidfVectorizer (1.0 as a float = a proportion of documents)
maxdf = 1.0
# passed as min_df to TfidfVectorizer (0.01 as a float = a proportion of documents)
mindf = 0.01

In [None]:
# Create one TF-IDF vectorizer per corpus, both with the same configuration.
# max_features=max_words caps the vocabulary size: max_words was configured
# above but was previously not passed to the vectorizers, so it had no effect.
bow_options = dict(max_df=maxdf, min_df=mindf, max_features=max_words, stop_words=[], use_idf=True)
bow_model_ara = TfidfVectorizer(**bow_options)
bow_model_tun = TfidfVectorizer(**bow_options)

In [None]:
# call fit() method in order to prepare BOW method (determine vocabulary and IDF values)
# Each vectorizer is fitted on its own corpus only.
bow_model_ara.fit(ara_corpus_bow)
bow_model_tun.fit(tun_corpus_bow)

#### Build the DTM matrix

In [None]:
# Call the transform method in order to calculate DTM matrix of our corpus
# (each corpus is transformed with the vectorizer that was fitted on it)
ara_bow_dtm = bow_model_ara.transform(ara_corpus_bow)
tun_bow_dtm = bow_model_tun.transform(tun_corpus_bow)

In [None]:
# Verify the type and size of each DTM matrix
for dtm in (ara_bow_dtm, tun_bow_dtm):
    print(type(dtm), dtm.shape, sep="\n")

#### Visually inspect the vocabulary
This should help you **tune** the BOW configuration parameters (i.e. min_df, max_df, etc.) ...

In [None]:
# The vocabulary of BOW -- i.e. the words that were selected by BOW method to be in the vocabulary
def _bow_vocabulary(bow_model):
    """Return the vocabulary words of a fitted vectorizer as a list.

    get_feature_names() no longer exists in recent scikit-learn releases,
    where get_feature_names_out() replaces it; the old method is only used
    as a fallback for old installations.
    """
    if hasattr(bow_model, "get_feature_names_out"):
        return list(bow_model.get_feature_names_out())
    return bow_model.get_feature_names()


bow_vocab_ara = _bow_vocabulary(bow_model_ara)
print(type(bow_vocab_ara), len(bow_vocab_ara))
bow_vocab_tun = _bow_vocabulary(bow_model_tun)
print(type(bow_vocab_tun), len(bow_vocab_tun))

In [None]:
# The words that were ignored (and were not included in the vocabulary)
ignored_words_ara, ignored_words_tun = bow_model_ara.stop_words_, bow_model_tun.stop_words_
for ignored_words in (ignored_words_ara, ignored_words_tun):
    print(type(ignored_words), len(ignored_words))

In [None]:
#### DON'T DO THIS !!! THERE ARE TOO MANY IGNORED WORDS
#print(ignored_words)

In [None]:
# Put the vocabulary words and their IDF values in a data frame (one per corpus)
df_ara = pd.DataFrame({"Word": bow_vocab_ara, "IDF": bow_model_ara.idf_})
df_tun = pd.DataFrame({"Word": bow_vocab_tun, "IDF": bow_model_tun.idf_})

In [None]:
# Show vocabulary words that have SMALLEST IDF values (i.e. that have the largest document frequencies)
# The sorted top-10 tables are printed explicitly: a notebook cell only displays
# its last expression, and the unsorted 'Word' column is not what is wanted here.
print(df_ara.sort_values("IDF", ascending=True).head(10))
print(df_tun.sort_values("IDF", ascending=True).head(10))

In [None]:
# Show vocabulary words that have LARGEST IDF values (i.e. that have the smallest document frequencies)
# Both tables are printed explicitly: a notebook cell only displays its last
# expression, so the first table would otherwise be silently dropped.
print(df_ara.sort_values("IDF", ascending=False).head(10))
print(df_tun.sort_values("IDF", ascending=False).head(10))

In [None]:
# If you want, you can save the vocabulary into a file
# df.sort_values("IDF", inplace=False, ascending = True).to_csv("./models/bow_vocab.csv", index=False, header=True)

#### Remove documents that do not contain any vocabulary terms
i.e. remove rows in the DTM that are all zeros.

In [None]:
def _select_documents(dtm, docs, labels, keep):
    """Keep the DTM rows, documents and labels selected by the boolean mask `keep`.

    Raises:
        ValueError: if `dtm`, `docs` and `labels` do not describe the same
            number of documents. In that case row i of the DTM would be paired
            with the wrong label, so it is better to stop than to train on
            misaligned data.
    """
    if not (dtm.shape[0] == len(docs) == len(labels)):
        raise ValueError(
            "DTM rows (%d), documents (%d) and sentiment labels (%d) are not aligned"
            % (dtm.shape[0], len(docs), len(labels))
        )
    kept_docs = [doc for doc, flag in zip(docs, keep) if flag]
    kept_labels = [label for label, flag in zip(labels, keep) if flag]
    return dtm[keep, :], kept_docs, kept_labels


# Number of distinct vocabulary terms present in each document (1D array)
nb_terms_per_doc = np.asarray((ara_bow_dtm > 0).sum(axis=1)).ravel()
# True for the documents that contain at least one vocabulary term
idx = nb_terms_per_doc > 0
ara_bow_dtm_filt, ara_corpus_bow_filt, ara_corpus_sentiment_filt = _select_documents(
    ara_bow_dtm, ara_corpus_bow, ara_corpus_sentiment, idx)

nb_terms_per_doc1 = np.asarray((tun_bow_dtm > 0).sum(axis=1)).ravel()
idx1 = nb_terms_per_doc1 > 0
tun_bow_dtm_filt, tun_corpus_bow_filt, tun_corpus_sentiment_filt = _select_documents(
    tun_bow_dtm, tun_corpus_bow, tun_corpus_sentiment, idx1)
#print(len(tun_corpus_sentiment),len(idx))

## Build Sentiment Classifier

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Features (filtered DTM) and targets (sentiment labels) for each corpus
X_ara, y_ara = ara_bow_dtm_filt, ara_corpus_sentiment_filt
X_tun, y_tun = tun_bow_dtm_filt, tun_corpus_sentiment_filt


In [None]:
# Split the data into training and testing
# (30% of the documents go to the test set; the fixed seed makes the split reproducible)
split_options = dict(test_size=0.3, random_state=1996)

X_train_ara, X_test_ara, y_train_ara, y_test_ara = train_test_split(X_ara, y_ara, **split_options)
X_train_tun, X_test_tun, y_train_tun, y_test_tun = train_test_split(X_tun, y_tun, **split_options)

### Train classifier using logistic regression

In [None]:
# Train one Logistic Regression model per corpus (fit() returns the fitted model itself)

LR_model_ara = LogisticRegression(penalty='l2').fit(X_train_ara, y_train_ara)
print(X_train_ara, y_train_ara, sep="\n")


LR_model_tun = LogisticRegression(penalty='l2').fit(X_train_tun, y_train_tun)
print(X_train_tun, y_train_tun, sep="\n")

In [None]:
# Use this model to predict the sentiment category of test documents
# (each model predicts on the test split of its own corpus)
y_pred_LR_ara = LR_model_ara.predict(X_test_ara)
y_pred_LR_tun = LR_model_tun.predict(X_test_tun)

In [None]:
# Type and number of predictions for each corpus.
# Printed explicitly: a notebook cell only displays its last expression, so the
# first line would otherwise produce no output.
print(type(y_pred_LR_ara), len(y_pred_LR_ara))
print(type(y_pred_LR_tun), len(y_pred_LR_tun))

In [None]:
# Calculate the classification rate (accuracy on the test split) of each classifier
accuracy_LR_ara = metrics.accuracy_score(y_test_ara, y_pred_LR_ara)
accuracy_LR_tun = metrics.accuracy_score(y_test_tun, y_pred_LR_tun)
print(accuracy_LR_ara, accuracy_LR_tun)

### Interpretation

- The logistic regression model has one parameter per feature (i.e. vocabulary word).
- The most positive values indicate the words that contribute most to the second class in `classes_` (here `"POS"`).
- The most negative values indicate the words that contribute most to the first class in `classes_` (here `"NEG"`).

In [None]:
# Re-train the model using ALL DATA (train + test) of each corpus
LR_model2_ara = LogisticRegression(penalty='l2').fit(X_ara, y_ara)
LR_model2_tun = LogisticRegression(penalty='l2').fit(X_tun, y_tun)
LR_model2_tun

In [None]:
# Get the coefficients (parameters) of each LR model.
# coef_ is a 2D array with only one row, so it is flattened to a 1D array.
LR_coefs_ara = LR_model2_ara.coef_.ravel()
LR_coefs_tun = LR_model2_tun.coef_.ravel()
print(type(LR_coefs_ara), type(LR_coefs_tun))
print(LR_coefs_ara.shape, LR_coefs_tun.shape)

### Train classifier using Naive Bayes

In [None]:
## Build Naive Bayes classification model (one per corpus; fit() returns the fitted model itself)

NB_model_ara = MultinomialNB(alpha = 1.0).fit(X_train_ara, y_train_ara)
print(X_train_ara, y_train_ara, sep="\n")


NB_model_tun = MultinomialNB(alpha = 1.0).fit(X_train_tun, y_train_tun)
print(X_train_tun, y_train_tun, sep="\n")

In [None]:
# read documentation
# ?MultinomialNB

In [None]:
# Use this model to predict the sentiment category of test documents
# (each Naive Bayes model predicts on the test split of its own corpus)
y_pred_NB_ara = NB_model_ara.predict(X_test_ara)
y_pred_NB_tun = NB_model_tun.predict(X_test_tun)

In [None]:
# Classification rate (accuracy on the test split) of each Naive Bayes classifier
accuracy_NB_ara = metrics.accuracy_score(y_test_ara, y_pred_NB_ara)
accuracy_NB_tun = metrics.accuracy_score(y_test_tun, y_pred_NB_tun)
print(accuracy_NB_ara, accuracy_NB_tun)

****

### Dump classifiers

In [None]:
import pickle


def _save_pickle(obj, path):
    """Pickle `obj` into the file `path`; the file is closed (and flushed) on return."""
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


def _load_pickle(path):
    """Load and return the object pickled in the file `path`.

    NOTE: unpickling can execute arbitrary code. Only load files that you
    created yourself, never files coming from an untrusted source.
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)


filename1 = 'sentiment_analysis_model_ara.sav'
filename2 = 'sentiment_analysis_model_tun.sav'
filename3 = 'bow_model_ara.sav'
filename4 = 'bow_model_tun.sav'

# Save the Naive Bayes classifiers and the fitted BOW vectorizers
_save_pickle(NB_model_ara, filename1)
_save_pickle(NB_model_tun, filename2)
_save_pickle(bow_model_ara, filename3)
_save_pickle(bow_model_tun, filename4)

# Load them back from disk (checks that the saved files are usable)
loaded_model_ara = _load_pickle(filename1)
loaded_model_tun = _load_pickle(filename2)
bow_model_ara = _load_pickle(filename3)
bow_model_tun = _load_pickle(filename4)

In [None]:
def _prepare_for_bow(text):
    """Apply to one raw text the same preprocessing as the training corpus.

    The vectorizers were fitted on documents that had been cleaned,
    letter-normalized, tokenized, stripped of stop words and stemmed, so a new
    text must go through the same steps before it is vectorized.
    """
    cleaned = normalize([cleanup_text(text)])[0]
    words = [word for word in tokenizer.tokenize(cleaned) if word not in stop_words_ar]
    return ' '.join(ar_stemmer.stemWord(word) for word in words)


def _predict_sentiment(text, bow_model, classifier, low, high):
    """Shared implementation of predict_sentiment_ara / predict_sentiment_tun.

    Returns "neutre" when the text contains no vocabulary term, or when the
    probability of the first class of `classifier.classes_` lies strictly
    between `low` and `high`; otherwise returns the class predicted by
    `classifier`. The probability is printed, as before.
    """
    text_vect = bow_model.transform([_prepare_for_bow(text)])
    # No known word at all: the classifier has nothing to decide on
    if text_vect.nnz == 0:
        return "neutre"
    x = classifier.predict_proba(text_vect)[0][0]
    print(x)
    if low < x < high:
        return "neutre"
    return classifier.predict(text_vect)[0]


def predict_sentiment_ara(text):
    """Predict the sentiment of one text with the ARA models.

    Returns "neutre" when the classifier is not confident enough (probability
    of the first class strictly between 0.2 and 0.8), otherwise the predicted
    class label.
    """
    return _predict_sentiment(text, bow_model_ara, loaded_model_ara, 0.2, 0.8)


def predict_sentiment_tun(text):
    """Predict the sentiment of one text with the TUN models.

    Returns "neutre" when the classifier is not confident enough (probability
    of the first class strictly between 0.3 and 0.7), otherwise the predicted
    class label.
    """
    return _predict_sentiment(text, bow_model_tun, loaded_model_tun, 0.3, 0.7)
    
# Quick manual check of the classifiers on a sample text
# (the result of the last call is displayed as the cell output)
#predict_sentiment_ara(" لحميد العيب ")
predict_sentiment_tun("رة عملوها الفقراء والمحتاجين و المظلومين ثورة القضاء عالفقر و البطالة و الاحتياج أما هذا الكل مصارش منو شي ... الثورة فكوها الأحزاب و السياسيين ثورة الشباب فكوها الشياب و العجائز و فكوها السراق و المافيات .. البطالة في إرتفاع متزايد و الفقر في إرتفاع متزايد و حتى الطبقة المتوسطة قضاو علاها ... ثورة فيبالنا باش تحسن أوضاعنا المعيشية فجأة ركبو الشواذ و المثليين و العاهرات و القرودة عالحدث و ولاو تقول علاهم اصحاب حق في الثورة باش يحققو مطالبهم اللاأخلاقية ... ثورة زيادة العهر و زيادة الشواذ و المثليين و القرودة و زيادة البطالة و الفقر و التهميش الخ الخ الخ")