In [47]:
#Notice: there is a pickle dump available.

#downloads:  
#!pip install regex
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download("words")

In [9]:
import pickle
import pandas as pd
import numpy as np
import string
import math
import nltk
import regex as re
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.util import mark_negation
from math import log
from sklearn.model_selection import train_test_split

In [3]:
url = "https://raw.githubusercontent.com/thanhtut/info284_lab/master/assignment1/twitter-airline-sentiment/Tweets.csv"

#Creating dataset of csv-file from url.
df = pd.read_csv(url)

#Create dataset with airline sentiment and text (notice I'm only using text as a feature).
dataset = df[["airline_sentiment","text"]]

#Pickle file with model data. Contains: [vocabulary, logprior, loglike]
#read_ml = open("ml-oblig1","rb")

In [4]:
#Ex 1: cleaning and splitting of data

def data_cleaner(dataframe):
    #unicodes for different emojis
    RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE) 
    #stopwords
    stopwords_helpset = set(stopwords.words("english"))
    #stemmer 
    stemmer = SnowballStemmer("english")
    #list for clean data
    clean_data = []
    
    #iterates the data
    for index, row in dataframe.iterrows():
        #all words in lowercase and bigger than or equal to 3
        filtered_word = [word.lower() for word in row.text.split() if len(word) >= 3]
        #words without @, http and is not RT. Word also has to start with a character
        cleaned_word = [word for word in filtered_word \
                        if word != "RT"\
                        and "@" not in word\
                        and "#" not in word\
                        and "http" not in word\
                        and word[0].isalpha()\
                        ]

        #list of words without emojis
        remove_smiley = [RE_EMOJI.sub(r'', word) for word in cleaned_word]
        #list of words with negation mark
        mark_neg = mark_negation(remove_smiley)
        #remove stopwords with nltk
        non_stopwords = [word for word in mark_neg if not word in stopwords_helpset]
        #stemming words
        stemmed_list = [stemmer.stem(word) for word in non_stopwords]
        #removing punctuation
        non_punct = [word.translate(str.maketrans('', '', string.punctuation)) for word in stemmed_list]
        #clean data:
        clean_data.append((non_punct, row.airline_sentiment))
        
        
    return clean_data 


def clean_frame(dataframe, set_test_size = 0.1):
    clean_list = data_cleaner(dataframe)
    clean_dataset = pd.DataFrame(clean_list, columns = ["text","sentiment"])
    
    #split dataset into training and testing, test is 10% of dataset
    data_train, data_test = train_test_split(clean_dataset, test_size = set_test_size)
    return data_train, data_test


data_train, data_test = clean_frame(dataset)

In [5]:
#Ex 2 (vocabularies) and 8(Laplace smoothing):
def word_counter2(data_train):
    
    #All positive, negative and then neutral words in dataset, will use this list for counting.
    flat_pos = [val for sublist in\
                  data_train[data_train["sentiment"]\
                             =="positive"]["text"].tolist()\
                  for val in sublist]

    flat_neg = [val for sublist in\
                      data_train[data_train["sentiment"]\
                                 =="negative"]["text"].tolist()\
                      for val in sublist]

    flat_neu = [val for sublist in\
                      data_train[data_train["sentiment"]\
                                 =="neutral"]["text"].tolist()\
                      for val in sublist]
    
    #vocabulary
    vocabulary = [val for sublist in\
                      data_train["text"].tolist()\
                      for val in sublist]
    
    #Dictonaries will have to form of i.e.: pos_dict["late"] = 2,
    #where "late" is the word and 2 is the number of occourance of word in flat_pos.
    #Also notice that I'm using Laplace smoothing
    pos_dict = dict(zip((set(vocabulary)), [flat_pos.count(word) + 1 for word in set(vocabulary)]))
    pos_dict["total_word"] = sum(value for value in pos_dict.values())
    
    neg_dict = dict(zip((set(vocabulary)), [flat_neg.count(word) + 1 for word in set(vocabulary)]))
    neg_dict["total_word"] = sum(value for value in neg_dict.values())
    
    neu_dict = dict(zip((set(vocabulary)), [flat_neu.count(word) + 1 for word in set(vocabulary)]))
    neu_dict["total_word"] = sum(value for value in neu_dict.values())
        
       
    return (vocabulary, pos_dict, neg_dict, neu_dict)

vocabulary, pos_dict, neg_dict, neu_dict = word_counter2(data_train)

#Ex 3: I do not use metadata.

In [6]:
#Ex 4 logprior:

def logprior(data_train):
    
    N_doc = len(data_train) #Number of docs in training dataset.
    N_c = data_train["sentiment"].value_counts() #Number of docs for each sentiment.
    
    #Making logpriors
    logprior = {}
    logprior["positive"]= math.log10(N_c["positive"]/N_doc)
    logprior["negative"]= math.log10(N_c["negative"]/N_doc)
    logprior["neutral"]= math.log10(N_c["neutral"]/N_doc)
    
    return logprior

logprior = logprior(data_train)

In [7]:
#Ex 5 loglikelihood:

def create_loglike(pos_dict,neg_dict,neu_dict):
    #I append the calculated loglikelihoods for each word to the list loglike.
    loglike = [] 
    for word in set(vocabulary):
        pos = math.log10((pos_dict[word])/(pos_dict["total_word"]))
        neg = math.log10((neg_dict[word])/(neg_dict["total_word"]))
        neu = math.log10((neu_dict[word])/(neu_dict["total_word"]))
        loglike.append((word,pos,neg,neu))
    
    return loglike

loglike = create_loglike(pos_dict,neg_dict,neu_dict)

In [19]:
#Ex 6 naive bayes prediction model:

def pred_nb(text, logprior, loglike, vocabulary):
    summ = logprior.copy()
    pred_classes = ["positive", "negative", "neutral"]
    
    for word in text:
        if word in set(vocabulary):
            value_list = [x for x in loglike if x[0] == word]
            
            i = 1
            for pred in pred_classes:
                summ[pred] += summ[pred] + value_list[0][i]
                i += 1
    
    maxi = max(summ["positive"],summ["negative"],summ["neutral"])
    
    if maxi == summ["neutral"]:
        return "neutral"
        
    elif maxi == summ["positive"]:
        return "positive"
    
    else:
        return "negative"

In [12]:
#Ex 7 evaluating the model:

def evaluate_nb(data_test, logprior, loglike, vocabulary):
    correct = 0
    total = len(data_test)


    for index, row in data_test.iterrows():
        result = pred_nb(row.text,logprior, loglike, vocabulary)

        if result==row.sentiment:
            correct += 1

    pred_rate = correct/total
    return pred_rate

evaluate_nb(data_test, logprior, loglike, vocabulary)

#result: 0.68

0.6584699453551912

In [20]:
def confusion_matrix(data_test, logprior, loglike, vocabulary):
    
    pos_result = {"pos":0,"neg":0,"neu":0}
    neg_result = {"pos":0,"neg":0,"neu":0}
    neu_result = {"pos":0,"neg":0,"neu":0}
    keys = ["pos","neg","neu"]
    
    for index, row in data_test.iterrows():
        result = pred_nb(row.text,logprior, loglike, vocabulary)
        
        if row.sentiment == "positive":
            if result == "positive": pos_result["pos"] += 1
            elif result == "negative": pos_result["neg"] += 1
            elif result == "neutral": pos_result["neu"] += 1
                
        elif row.sentiment == "negative":
            if result == "positive": neg_result["pos"] += 1
            elif result == "negative": neg_result["neg"] += 1
            elif result == "neutral": neg_result["neu"] += 1
                
        elif row.sentiment == "neutral":
            if result == "positive": neu_result["pos"] += 1
            elif result == "negative": neu_result["neg"] += 1
            elif result == "neutral": neu_result["neu"] += 1
                

    conf_matrix = np.matrix([[pos_result[i] for i in keys],\
                         [neg_result[i] for i in keys],\
                         [neu_result[i] for i in keys]])
    
    return conf_matrix

    
conf_mat = confusion_matrix(data_test, logprior, loglike, vocabulary)

#[[ 39 182   2]
# [  2 907   6]
# [  6 304  16]]

[[ 39 182   2]
 [  2 907   6]
 [  6 304  16]]


In [116]:
#Ex 10 explanation:

def explanation_nb(text, logprior, loglike, vocabulary):
    pred_classes = ["positive", "negative", "neutral"]
    summ = logprior.copy()

    expl = {pred: [] for pred in pred_classes}
    for pred in pred_classes:
        expl[pred].append(("logprior:",summ[pred]))

    for word in text:
        if word in set(vocabulary):
            value_list = [x for x in loglike if x[0] == word]
            
            tup_word = str(word)+":"
            i = 1
            for pred in pred_classes:
                expl[pred].append((tup_word,value_list[0][i]))
                i += 1
                
    pred = pred_nb(text,logprior, loglike, vocabulary)
    
    print("Prediction:", pred)
    print("List of words used in analysis after cleaning: ", text)
    
    for pred_class in pred_classes: 
        print("-----------------------------------------")
        print(pred_class.capitalize()+" score: {}".format(round(summ[pred_class],2)))
        print("Explanation:")
        
        for tup in expl[pred_class]:
            print(str(tup[0]),round(tup[1],3))
        if pred_class == "neutral":
            print("-----------------------------------------")
    
    return pred

In [1]:
#Ex 9 command line:

def commandline_test_nb(text, expl = None):
    text = text.split()
    filtered_word = [word.lower() for word in text if len(word) >= 3]
    #words without @, http and is not RT. Word also has to start with a character
    cleaned_word = [word for word in filtered_word \
                    if "@" not in word\
                    and "#" not in word\
                    and word != "RT"\
                    and word[0].isalpha()\
                    and "http" not in word\
                    ]
    
    #list of words without emojis
    remove_smiley = [RE_EMOJI.sub(r'', word) for word in cleaned_word]
    #list of words with negation mark
    mark_neg = mark_negation(remove_smiley)
    #remove stopwords with nltk
    non_stopwords = [word for word in mark_neg if not word in stopwords_helpset]
    #stemming words
    stemmed_list = [stemmer.stem(word) for word in non_stopwords]
    #removing punctuation
    pred_list = [word.translate(str.maketrans('', '', string.punctuation)) for word in stemmed_list]
    
    if expl == "y": 
        result = explanation_nb(pred_list,logprior, loglike, vocabulary)
        return result
    else: 
        result = pred_nb(pred_list,logprior, loglike, vocabulary)
        return result

def command_line():
    
    #Pickle dump.
    read_ml = open("ml-oblig1","rb")
    #Contains: [vocabulary, logprior, loglike].
    try:
        save_list1 = pickle.load(read_ml)
        print("Model data loaded.")
    except:
        print("Failed to load model data.")
        print("Make sure data dump is in correct directory.")
    
    vocabulary=save_list1[0]
    logprior=save_list1[1]
    loglike=save_list1[2]
    
    expl = str(input("Press y if you want an explanation, if else press enter.\n"))
    expl = expl.lower()
    print("Enter to quit in following sequence.")
    
    while True:
        tweet = str(input("Tweet to analyse: \n"))
        print("*****************************************")
        
        if tweet == "":
            #save_ml = open("ml-oblig1","ab")
            #save_list = [vocabulary, logprior, loglike]
            #pickle.dump(save_list,"ml-oblig1")
            print("Terminating")
            
            break
            
        
        if expl == "y": result = commandline_test_nb(tweet, "y")
        else: 
            result = commandline_test_nb(tweet)
            print("Predicted sentiment: ",result)
        print("*****************************************")
        
   
if __name__ == '__main__':
    import pickle
    import pandas as pd
    import string
    import math
    import nltk
    import regex as re
    
    from nltk import sent_tokenize, word_tokenize
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords
    from nltk.sentiment.util import mark_negation
    from math import log 

    command_line()

Model data loaded.


KeyboardInterrupt: 

In [None]:
#ex 11:

#Confusion matrix =
#      pos neg  neu
#[pos[ 39  182   2]
# neg[  2  907   6]
# neu[  6  304  16]]

#accuracy = 0.68

#The main problem of this model arises from the huge imbalance in the dataset. Most tweets are negative, this
#means that logprior most of the times will outweigh loglikelihood. The effect of this is a model that mostly
#predicts negative tweets. An accuracy of 68 is a little better than if the model would predict negative
#for all inputs. 
#The tweets the model predict correctly are longer positive tweets where the number of positive words
#(loglikelihood) outweigh negative to positive ratio (logprior) in the dataset.