In [40]:
import nltk
import re
import csv
from collections import Counter
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import ArStemmerLib as lib
# Module-level stemmer instance from ArStemmerLib (imported as `lib`); the
# stem() helper defined later in this notebook calls it.
stemmer=  lib.ArStemmer()
# Load the dictionary files handed to the stemmer: three in one call, then a
# fourth on its own. The bare file names are resolved against the current
# working directory.
stemmer.loadDicts(["general.txt", "lex_list.txt", "wiki_general.txt"])
stemmer.loadDict("wiki_dict.txt")

In [41]:
def readNames(filePath):
    """Read one name per line from ``filePath`` and return them lower-cased.

    Each line is stripped of surrounding whitespace and lower-cased. Blank
    lines are kept as empty strings, so the result has one entry per line.

    Args:
        filePath: path of the text file to read (opened with the platform
            default encoding).

    Returns:
        list of str, in file order.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version never closed the file.
    with open(filePath, 'r') as f:
        return [line.strip().lower() for line in f]


In [42]:
def StopWords(filePath):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Each line is stripped of surrounding whitespace; case is left unchanged.
    Blank lines are kept as empty strings.

    Args:
        filePath: path of the UTF-8 encoded text file, one word per line.

    Returns:
        list of str, in file order.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version never closed the file.
    with open(filePath, 'r', encoding="utf-8") as f:
        return [line.strip() for line in f]

In [43]:
def removeStopWords(tweet,stop_words):
    """Return ``tweet`` without the whitespace-separated words in ``stop_words``.

    Every kept word is followed by a single space, so a non-empty result
    ends with a trailing space and runs of whitespace are normalised.
    """
    kept = [token for token in tweet.split() if token not in stop_words]
    return ''.join(token + ' ' for token in kept)

In [44]:
# Stem a single word using the module-level `stemmer` instance.
def stem(word):
    """Return the result of ``stemmer.stem(word)`` for ``word``."""
    return stemmer.stem(word)


In [45]:
def removeDuplicates(s):
    """Collapse each run of identical consecutive characters in ``s`` to one.

    For example 'aabbcc' becomes 'abc'. An empty string is returned as-is.
    """
    if s == '':
        return ''
    pieces = [s[0]]
    for ch in s[1:]:
        # Only keep a character when it differs from the one just kept.
        if ch != pieces[-1]:
            pieces.append(ch)
    return ''.join(pieces)
removeDuplicates('')

''

In [46]:
def changePolToNum(pol):
    """Map a polarity label to a number: 'pos' -> 1, 'neg' -> -1, else 0."""
    if pol == 'pos':
        return 1
    if pol == 'neg':
        return -1
    # Any other label is mapped to 0.
    return 0
    

In [86]:
def preProcessing(tweet, stop_words=None):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. replace links with the placeholder token "لينك";
      2. drop '#' characters ('#messi' -> 'messi');
      3. replace @mentions with the placeholder token 'منشن';
      4. collapse repeated consecutive characters ('شكراااااا' -> 'شكرا');
      5. remove stop words;
      6. keep only alphanumeric tokens of at least 3 characters that neither
         start nor end with a digit;
      7. stem every remaining token.

    Args:
        tweet: raw tweet text.
        stop_words: optional collection of words to remove in step 5. When
            omitted, the list is read from 'negators.txt' on every call (the
            original behaviour); pass a pre-loaded list when processing many
            tweets to avoid re-reading the file each time.

    Returns:
        The processed tweet. Every kept token is followed by one space, so a
        non-empty result ends with a trailing space.
    """
    # Case is left as-is; the lower-casing step is disabled.

    # Replace links. The previous pattern r"http\S+ | www\S+" contained
    # literal spaces, so it only matched an http link followed by a space
    # (and swallowed that space, gluing the placeholder to the next word) or
    # a www link preceded by a space.
    tweet=re.sub(r"http\S+|www\S+" , "لينك", tweet)
    # Remove the '#' character: '#messi' --> 'messi'
    tweet=re.sub(r"#" , "", tweet)
    # Replace mentions (@messi) with a placeholder token
    tweet=re.sub(r'@\S+','منشن',tweet)
    # Collapse consecutive repeated characters ('شكراااااا' --> 'شكرا')
    tweet=removeDuplicates(tweet)
    # Remove stop words
    if stop_words is None:
        stop_words=StopWords('negators.txt')
    tweet=removeStopWords(tweet,stop_words)
    # Drop tokens containing special characters, tokens shorter than 3
    # characters, and tokens whose first or last character is a digit.
    tweet=''.join(e+' ' for e in tweet.split() if e.isalnum() and len(e) >= 3 and not(e[0].isdigit() or e[-1].isdigit()))
    # Stem the remaining tokens
    tweet=''.join(stem(token)+' ' for token in tweet.split())

    return tweet


# Data set 1

# TfidfVectorizer

In [79]:
# Train on DS1.csv with TF-IDF features and report the accuracy of four
# classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS1.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS1 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel (printed like the other results, not as a tuple)
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS1 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS1 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS1 multinomianl naive ", accuracy_score(test_label, pred))

      


DS1 svm acc  0.385065885798
('DS1 svm kernel linear acc ', 0.53440702781844807)
DS1 naive  0.472913616398
DS1 multinomianl naive  0.543191800878


# CountVectorizer

In [80]:
# Train on DS1.csv with raw term-count features and report the accuracy of
# four classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS1.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS1 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel (printed like the other results, not as a tuple)
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS1 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS1 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS1 multinomianl naive ", accuracy_score(test_label, pred))

      


DS1 svm acc  0.385065885798
('DS1 svm kernel linear acc ', 0.51537335285505126)
DS1 naive  0.449487554905
DS1 multinomianl naive  0.559297218155


# Data set 2

# TfidfVectorizer

In [81]:
# Train on DS2.csv with TF-IDF features and report the accuracy of four
# classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS2.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS2 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS2 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS2 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS2 multinomianl naive ", accuracy_score(test_label, pred))

      


DS2 svm acc  0.333821376281
DS2 svm kernel linear acc  0.541727672035
DS2 naive  0.469985358712
DS2 multinomianl naive  0.544655929722


# CountVectorizer

In [82]:
# Train on DS2.csv with raw term-count features and report the accuracy of
# four classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS2.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# The result labels below previously said "DS1" although this cell trains on
# DS2.csv; they now name the data set that is actually used.

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS2 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel (printed like the other results, not as a tuple)
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS2 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS2 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS2 multinomianl naive ", accuracy_score(test_label, pred))

      


DS1 svm acc  0.333821376281
('DS1 svm kernel linear acc ', 0.53440702781844807)
DS1 naive  0.474377745242
DS1 multinomianl naive  0.538799414348


# Data set 3

# TfidfVectorizer

In [83]:
# Train on DS3.csv with TF-IDF features and report the accuracy of four
# classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS3.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS3 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS3 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS3 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS3 multinomianl naive ", accuracy_score(test_label, pred))

      


DS3 svm acc  0.385065885798
DS3 svm kernel linear acc  0.516837481698
DS3 naive  0.433382137628
DS3 multinomianl naive  0.537335285505


# CountVectorizer

In [84]:
# Train on DS3.csv with raw term-count features and report the accuracy of
# four classifiers on testData.csv.
# newline='' is what the csv module documents for files handed to csv.reader,
# so line breaks inside quoted fields are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default applies --
# confirm it matches the CSV files.
with open("DS3.csv", 'r', newline='') as file:
    train = list(csv.reader(file))
train = train[1:]  # remove header row
train_data = [preProcessing(row[0]) for row in train]
train_label = [changePolToNum(row[1]) for row in train]
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)

with open("testData.csv", 'r', newline='') as file:
    test = list(csv.reader(file))
test = test[1:]  # remove header row
test_data = [preProcessing(row[0]) for row in test]
test_label = [changePolToNum(row[1]) for row in test]
# Reuse the vocabulary fitted on the training data.
test_vectors = vectorizer.transform(test_data)

# Dense copies are built once and shared by both naive Bayes models instead
# of calling .toarray() four times.
train_dense = train_vectors.toarray()
test_dense = test_vectors.toarray()

# SVM with default parameters
clf = svm.SVC()
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS3 svm acc ", accuracy_score(test_label, pred))

# SVM with a linear kernel
clf = svm.SVC(kernel='linear')
clf.fit(train_vectors, train_label)
pred = clf.predict(test_vectors)
print("DS3 svm kernel linear acc ", accuracy_score(test_label, pred))

# Gaussian naive Bayes
clf = GaussianNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS3 naive ", accuracy_score(test_label, pred))

# Multinomial naive Bayes
clf = MultinomialNB()
clf.fit(train_dense, train_label)
pred = clf.predict(test_dense)
print("DS3 multinomianl naive ", accuracy_score(test_label, pred))

      


DS3 svm acc  0.385065885798
DS3 svm kernel linear acc  0.481698389458
DS3 naive  0.440702781845
DS3 multinomianl naive  0.551976573939
