In [2]:
import csv
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [22]:
twdf = pd.read_csv("dubai-tweets.csv", encoding = 'latin-1')
poldf = pd.read_csv("dubai-politics-binary.csv")
reldf = pd.read_csv("dubai-religion-binary.csv")

In [35]:
comdf = twdf.set_index('tweeterid').join(poldf.set_index('tweeterid'))

In [45]:
comdf = comdf.dropna(subset = ['tweet'])
comdf = comdf.dropna(subset = ['politics'])
selfText  = comdf.tweet
labels = comdf.politics
trainPct = 0.7

In [46]:
X_train, X_test, y_train, y_test = train_test_split(selfText, labels, test_size = 1.0 - trainPct)

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import sklearn.metrics as sm

In [50]:
#Model using Support Vector Machines classifier, no text preprocess

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect',CountVectorizer()), ('tfidf',TfidfTransformer()), ('clf', SGDClassifier())])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print("Accuracy: ",text_clf.score(X_test, y_test))
print("F1 score: ",sm.f1_score(y_test,predicted))
print("AUC score: ",sm.roc_auc_score(y_test,predicted))



Accuracy:  0.7976521158889527
F1 score:  0.18734224548938955
AUC score:  0.5516373364241169


In [51]:
#Model using Support Vector Machines classifier, stop words removed

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect',CountVectorizer(stop_words='english')), ('tfidf',TfidfTransformer()), ('clf', SGDClassifier())])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print("Accuracy: ",text_clf.score(X_test, y_test))
print("F1 score: ",sm.f1_score(y_test,predicted))
print("AUC score: ",sm.roc_auc_score(y_test,predicted))



Accuracy:  0.8003289831008209
F1 score:  0.20711116588612272
AUC score:  0.5576925618296578


In [52]:
#Model using Support Vector Machines classifier, stemmed

import sklearn.metrics as sm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_svm_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                             ('svm_clf', SGDClassifier())])
text_svm_stemmed = text_svm_stemmed.fit(X_train, y_train)
predicted_svm_stemmed = text_svm_stemmed.predict(X_test)
print("Accuracy: ", text_svm_stemmed.score(X_test, y_test))
print("F1 score: ", sm.f1_score(y_test,predicted_svm_stemmed))
print("AUC score: ", sm.roc_auc_score(y_test,predicted_svm_stemmed))



Accuracy:  0.8010505733927158
F1 score:  0.21247581313922415
AUC score:  0.5593529265704016


In [66]:
#Model using Naive Bayes classifier, stemmed
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as sm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())])
text_mnb_stemmed = text_mnb_stemmed.fit(X_train, y_train)
predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)
print("Accuracy: ", text_mnb_stemmed.score(X_test, y_test))
print("F1 score: ", sm.f1_score(y_test,predicted_mnb_stemmed))
print("AUC score: ", sm.roc_auc_score(y_test,predicted_mnb_stemmed))

Accuracy:  0.8459133160565478
F1 score:  0.4842755862570442
AUC score:  0.6598039160172051


In [67]:
#Model using Random Forest classifier, stemmed
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as sm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_rf_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                             ('clf', RandomForestClassifier(n_estimators=10))])
text_rf_stemmed = text_rf_stemmed.fit(X_train, y_train)
predicted_rf_stemmed = text_rf_stemmed.predict(X_test)
print("Accuracy: ", text_rf_stemmed.score(X_test, y_test))
print("F1 score: ", sm.f1_score(y_test,predicted_rf_stemmed))
print("AUC score: ", sm.roc_auc_score(y_test,predicted_rf_stemmed))

Accuracy:  0.8613305193898294
F1 score:  0.5952898550724638
AUC score:  0.7163180718959544


In [68]:
#Model using AdaBoost classifier, stemmed
from sklearn.ensemble import AdaBoostClassifier
import sklearn.metrics as sm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_ada_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),
                             ('clf', AdaBoostClassifier(n_estimators=100))])
text_ada_stemmed = text_ada_stemmed.fit(X_train, y_train)
predicted_ada_stemmed = text_ada_stemmed.predict(X_test)
print("Accuracy: ", text_ada_stemmed.score(X_test, y_test))
print("F1 score: ", sm.f1_score(y_test,predicted_ada_stemmed))
print("AUC score: ", sm.roc_auc_score(y_test,predicted_ada_stemmed))

Accuracy:  0.809344982231809
F1 score:  0.2996636835204925
AUC score:  0.5866231127532767
