In [110]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.model_selection import  cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
import fasttext
from sklearn.model_selection import cross_val_predict
import pickle
import multiprocessing
from gensim.models import Word2Vec

In [111]:
#loading the csv file

df_tweets = pd.read_csv("tweet_preprocessed.csv", parse_dates=True,na_values=' ',encoding="ISO-8859-1")
df_tweets.set_index(['time','username'], inplace = True)

In [112]:
output_resources = pickle.load(open( "resources_pickled.p", "rb" ))
tweet_nostopwords= output_resources['tweet_nostopwords']
tweet_tokens_joined = output_resources['tweet_tokens_joined']

In [113]:
# using word2vec model to find similar words
# attempt to build neural network model

w2v_model = Word2Vec(min_count=5,
                     window=4,
                     size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)
w2v_model.build_vocab(tweet_nostopwords, progress_per=10000)
w2v_model.train(tweet_nostopwords, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
w2v_model.init_sims(replace=True)
w2v_model.wv.most_similar(topn=10,positive='militant')

[('entered', 0.9975644946098328),
 ('leaving', 0.9974784255027771),
 ('officials', 0.9973977208137512),
 ('trapped', 0.9973905086517334),
 ('airstrike', 0.9973317980766296),
 ('fleeing', 0.9972130060195923),
 ('anbar', 0.9972053170204163),
 ('province', 0.997189462184906),
 ('neighbourhood', 0.9971358180046082),
 ('confirmed', 0.9970868825912476)]

In [114]:
# using fasttext model to find similar words
# attempt to build neural network model

model = fasttext.train_unsupervised('tweets_clean.txt')
model.get_nearest_neighbors('militant')

[(0.8713728785514832, '"militant'),
 (0.8628817796707153, 'militant.'),
 (0.8179283142089844, 'eyeonmilitant'),
 (0.7054753303527832, 'militant"'),
 (0.6604416966438293, '155mm'),
 (0.6502629518508911, 'saraya'),
 (0.64931321144104, 'ingushetia'),
 (0.6468783020973206, 'enters'),
 (0.6227799654006958, 'yussuf'),
 (0.6176290512084961, 'dautant')]

In [115]:
#Building a Naive bayes classification model using nltk library

from sklearn.model_selection import train_test_split
X=df_tweets["tweet_nostopwords"]
y=df_tweets["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42, stratify=y)

X_train = X_train.tolist() 
X_test = X_test.tolist() 
y_train = y_train.tolist() 
y_test = y_test.tolist() 

In [116]:
def processed_tweets(X,y):
    tweets=[]
    for tweet, label in zip( X,y):
        tweets.append((tweet,label))
    return tweets

In [117]:
def buildVocabulary(train_data):
    all_words = []
    
    for (words, sentiment) in train_data:
        all_words.extend(words)
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    
    return word_features

In [118]:
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:        
        features['contains(%s)' % word] = (word in tweet_words)
    return features 


In [119]:
word_features = buildVocabulary(processed_tweets(X_train, y_train))
trainingFeatures = nltk.classify.apply_features(extract_features, processed_tweets(X_train, y_train))

In [120]:
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

In [121]:
pickle.dump( NBayesClassifier , open( "tweet_pickled.p", "wb" ) )

In [122]:
NBayesClassifier = pickle.load(open( "tweet_pickled.p", "rb" ))

In [123]:
NBResultLabels = [NBayesClassifier.classify(extract_features(tweet[0])) for tweet in processed_tweets(X_test,y_test)]

In [124]:
#checking the model performance

def model_performance():      
    print("Accuracy:",sklearn.metrics.accuracy_score(y_test, NBResultLabels))    
    print("F1 score:",sklearn.metrics.f1_score(y_test, NBResultLabels))
    print("Precision:",sklearn.metrics.precision_score(y_test, NBResultLabels))     
    print("roc-auc score:",sklearn.metrics.roc_auc_score(y_test, NBResultLabels))
    print("Recall",sklearn.metrics.recall_score(y_test, NBResultLabels))
    print("Confusion_matrix: \n",sklearn.metrics.confusion_matrix(y_test, NBResultLabels))
model_performance()

Accuracy: 0.7093624353819644
F1 score: 0.5541850220264317
Precision: 0.7075365579302587
roc-auc score: 0.6658582289527314
Recall 0.4554670528602462
Confusion_matrix: 
 [[1841  260]
 [ 752  629]]


In [125]:
vectorizer = CountVectorizer()
df_tweets.tweet_tokens_joined
X = vectorizer.fit_transform(tweet_tokens_joined)

#X = X.tocsc()  
#y = tweet.label.values.astype(np.int)

In [126]:
#Building a Naive bayes clssification model using scikit learn as nltk doesn't  support cross validation

def make_xy(tweet, vectorizer=None):    
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(tweet_tokens_joined)
    X = X.tocsc()  
    y = tweet.label.values.astype(np.int)
    return X,y
X,y= make_xy(df_tweets)

In [127]:
#Model performnce after cross validation

def model_performance_crossval():
    clf = MultinomialNB(alpha=.1)
    k_fold = KFold( n_splits=5, shuffle=True, random_state=0)
    print("Accuracy:", cross_val_score(clf, X, y, cv=k_fold, n_jobs=1,scoring='accuracy'))
    print("F1 score:",cross_val_score(clf, X, y, n_jobs=1, cv=k_fold,scoring='f1'))
    print("roc-auc score:",cross_val_score(clf, X, y, cv=k_fold, n_jobs=1,scoring='roc_auc'))
    print("precision:",cross_val_score(clf, X, y, cv=k_fold, n_jobs=1,scoring='precision'))
    print("recall:",cross_val_score(clf, X, y, cv=k_fold, n_jobs=1,scoring='recall'))
    y_pred = cross_val_predict(clf, X, y, cv=k_fold)
    conf_mat = confusion_matrix(y, y_pred)
    print("Confusion Matrix:\n", conf_mat)
model_performance_crossval()

Accuracy: [0.79063756 0.79293509 0.80097645 0.78144744 0.78977599]
F1 score: [0.70473876 0.70365804 0.72313224 0.69523428 0.7182448 ]
roc-auc score: [0.89980203 0.89810494 0.90790626 0.89300481 0.895485  ]
precision: [0.79963235 0.79259259 0.78832753 0.79052823 0.78867287]
recall: [0.62997828 0.63266814 0.66789668 0.62044317 0.65936396]
Confusion Matrix:
 [[9342 1165]
 [2471 4432]]


In [128]:
#Removing empty rows and rows with a single word 
def emptyrows_remove(tweet, nostopwords):    
    min_words = 1
    tweet = tweet[nostopwords.apply(lambda x: True  if len(x)>min_words else False)]      
    return tweet

df_tweets = emptyrows_remove(df_tweets, df_tweets['tweet_nostopwords'])

In [129]:
# Checking performance after removing empty rows and rows with a single word 

X, y = make_xy(df_tweets)
model_performance_crossval()

Accuracy: [0.79063756 0.79293509 0.80097645 0.78144744 0.78977599]
F1 score: [0.70473876 0.70365804 0.72313224 0.69523428 0.7182448 ]
roc-auc score: [0.89980203 0.89810494 0.90790626 0.89300481 0.895485  ]
precision: [0.79963235 0.79259259 0.78832753 0.79052823 0.78867287]
recall: [0.62997828 0.63266814 0.66789668 0.62044317 0.65936396]
Confusion Matrix:
 [[9342 1165]
 [2471 4432]]


In [130]:

df = pd.DataFrame(y,columns=['Output'])
df.to_excel(excel_writer = "test.xlsx",encoding="ISO-8859-1")
