In [1]:
from collections import Counter
import nltk
#nltk.download('all')
import pandas as pd
from emoticons import EmoticonDetector
import re as regex
import numpy as np
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from time import time
import gensim

# plotly configuration
plotly.offline.init_notebook_mode()


detected Windows; aliasing chunkize to chunkize_serial

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [2]:
class TwitterData_Initialize:
    """First pipeline stage: load tweets from a CSV file into a DataFrame.

    The later stages in this notebook subclass this class and are built from
    the previous stage's instance, so the class-level attributes below also
    serve as defaults for stages that never call ``initialize``.
    """

    # Frame as loaded (and filtered) from the CSV.
    data = []
    # Frame handed on to the following pipeline stages.
    processed_data = []
    # Vocabulary; populated by a later stage.
    wordlist = []

    # Feature matrix and labels; populated by a later stage or from a cache.
    data_model = None
    data_labels = None
    # True when the loaded CSV is an unlabelled test set.
    is_testing = False

    def initialize(self, csv_file, is_testing_set=False, from_cached=None):
        """Load the tweets, or a previously cached data model.

        Args:
            csv_file: Path of the CSV to read. The first line is treated as a
                header and replaced by the column names used in this class.
            is_testing_set: When False the file is read as
                ``tweet_id, sentiment, author, content`` and only rows whose
                sentiment is sadness, worry, happiness or anger are kept.
                When True the file is read as ``tweet_id, content`` (first
                4000 rows only) and rows with a missing id or text are dropped.
            from_cached: Optional path of a cached data model CSV. When given,
                it is loaded into ``data_model`` and nothing else is read.
        """
        # Record the flag first so that it is also set on the cached path.
        self.is_testing = is_testing_set

        if from_cached is not None:
            self.data_model = pd.read_csv(from_cached)
            return

        if not is_testing_set:
            raw = pd.read_csv(csv_file, header=0,
                              names=["tweet_id", "sentiment", "author", "content"])
            wanted = raw["sentiment"].isin(["sadness", "worry",
                                            "happiness", "anger"])
            # .copy() detaches the result from `raw`, so later stages that
            # assign into it do not write into a slice of a temporary frame.
            self.data = raw[wanted].copy()
        else:
            # dtype keys must match `names`; the previous "id"/"text" keys
            # matched no column and were ignored. tweet_id is left to type
            # inference because it may contain missing values (filtered below).
            raw = pd.read_csv(csv_file, header=0, names=["tweet_id", "content"],
                              dtype={"content": "str"}, nrows=4000)
            complete = pd.notnull(raw["tweet_id"]) & pd.notnull(raw["content"])
            self.data = raw.loc[complete, :].copy()

        self.processed_data = self.data
        self.wordlist = []
        self.data_model = None
        self.data_labels = None

In [3]:
# Load the labelled training CSV. initialize() keeps only the rows whose
# sentiment is sadness, worry, happiness or anger.
# NOTE(review): the path uses a Windows separator — confirm before running
# on another platform.
data = TwitterData_Initialize()
data.initialize("data\\text_emotion.csv")
data.processed_data.head(5)

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down


In [5]:
# Count how many tweets carry each of the four retained sentiments.
df = data.processed_data

sad, wor, hap, ang = (
    len(df[df["sentiment"] == label])
    for label in ("sadness", "worry", "happiness", "anger")
)

sad, wor, hap, ang

(5165, 8459, 5209, 110)

In [6]:
class TwitterCleanuper:
    """Collection of text clean-up steps applied to the ``content`` column.

    Every step takes a DataFrame with a ``content`` column and returns the
    cleaned DataFrame. Callers must use the returned frame; the input frame
    is not modified.
    """

    # Punctuation stripped by remove_special_chars. Removing '#' turns
    # hashtags into plain words.
    _SPECIAL_CHARS = (",", ":", "\"", "=", "&", ";", "%", "$",
                      "@", "^", "*", "(", ")", "{", "}",
                      "[", "]", "|", "/", "\\", ">", "<", "-",
                      "!", "?", ".", "'", "#")
    # One character class removes all of them in a single pass. The result is
    # the same as deleting them one by one, since each deletion is independent.
    _SPECIAL_CHARS_RE = regex.compile(
        "[" + "".join(map(regex.escape, _SPECIAL_CHARS)) + "]")

    def iterate(self):
        """Yield the clean-up methods in the order they must be applied."""
        yield from (self.remove_urls,
                    self.remove_usernames,
                    self.remove_na,
                    self.remove_special_chars,
                    self.remove_numbers)

    @staticmethod
    def remove_by_regex(tweets, regexp):
        """Return ``tweets`` with every match of ``regexp`` deleted from ``content``.

        The previous implementation called ``replace(..., inplace=True)`` on
        the temporary Series returned by ``tweets.loc[:, "content"]``; whether
        that reached the frame depended on the pandas version. Assigning the
        replaced column explicitly makes the result reliable.
        """
        cleaned = tweets["content"].replace(regexp, "", regex=True)
        return tweets.assign(content=cleaned)

    def remove_urls(self, tweets):
        """Delete URLs (and one trailing whitespace character) from ``content``."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

    def remove_na(self, tweets):
        """Drop rows whose ``content`` is the literal placeholder "Not Available"."""
        return tweets[tweets["content"] != "Not Available"]

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        """Delete punctuation and symbol characters from ``content``."""
        return TwitterCleanuper.remove_by_regex(tweets, TwitterCleanuper._SPECIAL_CHARS_RE)

    def remove_usernames(self, tweets):
        """Delete @mentions (and one trailing whitespace character) from ``content``."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

    def remove_numbers(self, tweets):
        """Delete digit runs (with an optional leading whitespace and decimal part)."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"\s?[0-9]+\.?[0-9]*"))

In [7]:
class TwitterData_Cleansing(TwitterData_Initialize):
    """Pipeline stage that runs a cleanuper's steps over the tweet text."""

    def __init__(self, previous):
        """Continue from ``previous``, the preceding pipeline stage.

        Args:
            previous: Object exposing ``processed_data`` (and, optionally,
                ``is_testing``).
        """
        self.processed_data = previous.processed_data
        # Carry the test-set flag forward. Without this, the attribute falls
        # back to the class default (False) and cleanup() can never skip
        # remove_na for a test set.
        self.is_testing = getattr(previous, "is_testing", False)

    def cleanup(self, cleanuper):
        """Apply every method yielded by ``cleanuper.iterate()`` in order.

        For a test set the ``remove_na`` step is skipped, so no rows are
        dropped by that step. The result replaces ``processed_data``.
        """
        cleaned = self.processed_data
        for cleanup_method in cleanuper.iterate():
            if self.is_testing and cleanup_method.__name__ == "remove_na":
                continue
            cleaned = cleanup_method(cleaned)

        self.processed_data = cleaned

In [8]:
# Wrap the loaded data in the cleansing stage and run every TwitterCleanuper
# step over the "content" column. `data` is rebound to the new stage object.
data = TwitterData_Cleansing(data)
data.cleanup(TwitterCleanuper())
data.processed_data.head(5)

Unnamed: 0,tweet_id,sentiment,author,content
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhhwaitin on y...
2,1956967696,sadness,coolfunky,Funeral ceremonygloomy friday
5,1956968477,worry,xxxPEACHESxxx,Repinging why didnt you go to prom BC my bf di...
6,1956968487,sadness,ShansBee,I should be sleep but im not thinking about an...
7,1956968636,worry,mcsleazy,Hmmm is down


In [9]:
class TwitterData_TokenStem(TwitterData_Cleansing):
    """Pipeline stage that tokenises the tweet text and stems the tokens."""

    def __init__(self, previous):
        """Continue from ``previous``, the preceding pipeline stage."""
        self.processed_data = previous.processed_data
        # Carry the test-set flag forward instead of silently falling back to
        # the class default.
        self.is_testing = getattr(previous, "is_testing", False)

    def stem(self, stemmer=nltk.PorterStemmer()):
        """Lower-case and stem every token in ``content``.

        Expects ``content`` to hold token sequences already, so call
        ``tokenize`` first.

        Args:
            stemmer: Object with a ``stem(str) -> str`` method. The default
                instance is created once, when the class is defined.
        """
        def stem_tokens(tokens):
            return [stemmer.stem(token.lower()) for token in tokens]

        # Work column-wise on a copy: only "content" changes, and the other
        # columns keep their dtypes.
        frame = self.processed_data.copy()
        frame["content"] = frame["content"].apply(stem_tokens)
        self.processed_data = frame

    def tokenize(self, tokenizer=nltk.word_tokenize):
        """Replace ``content`` with its tokens and keep a copy in ``tokenized_text``.

        Args:
            tokenizer: Callable mapping a string to a sequence of tokens.
        """
        frame = self.processed_data.copy()
        tokens = frame["content"].apply(tokenizer)
        frame["content"] = tokens
        # Independent list per row, so later in-place edits of "content" (for
        # example stemming) cannot alter the unstemmed tokens.
        frame["tokenized_text"] = tokens.apply(list)
        self.processed_data = frame


In [10]:
# Tokenise first, then stem: stem() iterates over the tokens that tokenize()
# stored in "content". The unstemmed tokens remain in "tokenized_text".
data = TwitterData_TokenStem(data)
data.tokenize()
data.stem()
data.processed_data.head(5)

Unnamed: 0,tweet_id,sentiment,author,content,tokenized_text
1,1956967666,sadness,wannamama,"[layin, n, bed, with, a, headach, ughhhhwaitin...","[Layin, n, bed, with, a, headache, ughhhhwaiti..."
2,1956967696,sadness,coolfunky,"[funer, ceremonygloomi, friday]","[Funeral, ceremonygloomy, friday]"
5,1956968477,worry,xxxPEACHESxxx,"[reping, whi, didnt, you, go, to, prom, bc, my...","[Repinging, why, didnt, you, go, to, prom, BC,..."
6,1956968487,sadness,ShansBee,"[i, should, be, sleep, but, im, not, think, ab...","[I, should, be, sleep, but, im, not, thinking,..."
7,1956968636,worry,mcsleazy,"[hmmm, is, down]","[Hmmm, is, down]"


In [11]:
# Tally every stemmed token across all tweets.
words = Counter(
    token
    for tokens in data.processed_data["content"]
    for token in tokens
)

words.most_common(5)

[('i', 10003), ('to', 7282), ('the', 6237), ('a', 4769), ('it', 4419)]

In [12]:
# Remove English stop words from the tally, but keep the negations in the
# whitelist.
stopwords = nltk.corpus.stopwords.words("english")
whitelist = ["n't", "not"]
for stop_word in stopwords:
    if stop_word in whitelist:
        continue
    # Deleting a key that a Counter does not hold is a no-op.
    del words[stop_word]
words.most_common(5)

[('im', 2465), ('go', 1773), ('day', 1642), ('not', 1435), ('get', 1427)]

In [13]:
class TwitterData_Wordlist(TwitterData_TokenStem):
    """Pipeline stage that builds the vocabulary used for the features."""

    # Stop words that are kept even though they appear in the stop-word list.
    whitelist = ["n't","not"]
    wordlist = []

    def __init__(self, previous):
        """Continue from ``previous``, the preceding pipeline stage."""
        self.processed_data = previous.processed_data
        # Carry the test-set flag forward instead of silently falling back to
        # the class default.
        self.is_testing = getattr(previous, "is_testing", False)

    def build_wordlist(self, min_occurrences=3, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"),
                       whitelist=None):
        """Populate ``self.wordlist`` with tokens of moderate frequency.

        If ``data\\wordlist.csv`` exists it is used as a cache. Otherwise the
        tokens in ``processed_data["content"]`` are counted, stop words are
        removed, and the result is written to that file.

        Args:
            min_occurrences: Exclusive lower bound on a word's count.
            max_occurences: Exclusive upper bound on a word's count.
            stopwords: Words removed from the tally (unless whitelisted).
            whitelist: Stop words to keep; defaults to the class attribute.
        """
        import os

        self.wordlist = []
        whitelist = self.whitelist if whitelist is None else whitelist
        cache_path = "data\\wordlist.csv"

        if os.path.isfile(cache_path):
            # keep_default_na=False: otherwise words such as "null" or "nan"
            # are parsed back as float NaN and never match any token.
            word_df = pd.read_csv(cache_path, keep_default_na=False)
            # Apply both bounds, as the uncached path does. Previously the
            # cached path ignored max_occurences.
            counts = word_df["occurrences"]
            in_range = (counts > min_occurrences) & (counts < max_occurences)
            self.wordlist = list(word_df.loc[in_range, "word"])
            return

        words = Counter()
        for tokens in self.processed_data["content"]:
            words.update(tokens)

        for stop_word in stopwords:
            if stop_word not in whitelist:
                del words[stop_word]

        # Rank and filter once, then reuse the result for the cache file and
        # for the word list.
        kept = [(word, count) for word, count in words.most_common()
                if min_occurrences < count < max_occurences]

        word_df = pd.DataFrame(kept, columns=["word", "occurrences"])
        word_df.to_csv(cache_path, index_label="idx")
        self.wordlist = [word for word, _ in kept]

In [14]:
# Build the vocabulary. build_wordlist() reuses data\wordlist.csv when that
# file already exists, so delete the file to force a rebuild.
data = TwitterData_Wordlist(data)
data.build_wordlist()

In [15]:
# Plot the head of the cached word list as a horizontal bar chart.
# Note: `words` is rebound here from the earlier Counter to a DataFrame.
words = pd.read_csv("data\\wordlist.csv")
# .loc slicing is label-based and inclusive, so 0:10 selects 11 rows.
x_words = list(words.loc[0:10,"word"])
# Reversed, presumably so the most frequent word is drawn at the top — confirm.
x_words.reverse()
y_occ = list(words.loc[0:10,"occurrences"])
y_occ.reverse()

# Horizontal bars: occurrence counts on the x axis, words on the y axis.
dist = [
    graph_objs.Bar(
        x=y_occ,
        y=x_words,
        orientation="h"
)]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Top words in built wordlist")})

In [16]:
class TwitterData_BagOfWords(TwitterData_Wordlist):
    """Pipeline stage that turns each tweet into a binary bag-of-words row."""

    def __init__(self, previous):
        """Continue from ``previous``, the preceding pipeline stage."""
        self.processed_data = previous.processed_data
        self.wordlist = previous.wordlist
        # Carry the test-set flag forward. Without it build_data_model()
        # always looks for a "sentiment" column, which a test set lacks.
        self.is_testing = getattr(previous, "is_testing", False)

    def build_data_model(self):
        """Build the feature matrix and the label series.

        Each row holds one 0/1 column per word in ``self.wordlist`` (named
        ``<word>_bow``), set to 1 when the word occurs among the tweet's
        tokens. For labelled data the first column, ``label``, holds the
        sentiment.

        Returns:
            ``(data_model, data_labels)``: the feature DataFrame and a Series
            of labels (empty for a test set). Both are also stored on
            ``self``.
        """
        with_label = not self.is_testing

        columns = (["label"] if with_label else []) + [
            str(word) + "_bow" for word in self.wordlist]

        labels = list(self.processed_data["sentiment"]) if with_label else []

        rows = []
        for position, content in enumerate(self.processed_data["content"]):
            # A set makes each membership test constant-time.
            tokens = set(content)
            features = [1 if word in tokens else 0 for word in self.wordlist]
            rows.append(([labels[position]] if with_label else []) + features)

        self.data_model = pd.DataFrame(rows, columns=columns)
        self.data_labels = pd.Series(labels)
        return self.data_model, self.data_labels

In [17]:
# Build the binary bag-of-words matrix. `bow` has the label in its first
# column, followed by one "<word>_bow" column per word in the word list.
data = TwitterData_BagOfWords(data)
bow, labels = data.build_data_model()
bow.head(5)

Unnamed: 0,label,school_bow,awesom_bow,play_bow,guy_bow,tweet_bow,hey_bow,call_bow,movi_bow,best_bow,...,primavera_bow,nonetheless_bow,flippin_bow,heidi_bow,jailbreak_bow,fianc_bow,quotgt_bow,counter_bow,switzerland_bow,foto_bow
0,sadness,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sadness,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,worry,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,sadness,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,worry,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
import random

# Fix the seed for reproducible runs. Both Python's and numpy's global RNGs
# are seeded, because scikit-learn falls back to numpy's global RNG when no
# random_state is passed.
seed = 666
random.seed(seed)
np.random.seed(seed)

In [20]:
def test_classifier(X_train, y_train, X_test, y_test, classifier):
    """Fit ``classifier``, predict on the test split and log the metrics.

    Logs the fit and predict wall-clock times, followed by per-class F1,
    precision and recall (classes ordered as the sorted labels found in
    ``y_train``) and the overall accuracy.

    Returns:
        ``(precision, recall, accuracy, f1)``.
    """
    separator = "==============================================="

    log("")
    log(separator)
    log("Testing " + str(type(classifier).__name__))

    started = time()
    list_of_labels = sorted(set(y_train))
    model = classifier.fit(X_train, y_train)
    log("Learing time {0}s".format(time() - started))

    started = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - started))

    # Shared keyword arguments for the per-class metrics.
    per_class = dict(average=None, pos_label=None, labels=list_of_labels)
    precision = precision_score(y_test, predictions, **per_class)
    recall = recall_score(y_test, predictions, **per_class)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, **per_class)

    log("=================== Results ===================")
    log("            Anger     Happiness     Sadness      Worry")
    for caption, value in (("F1       ", f1),
                           ("Precision", precision),
                           ("Recall   ", recall),
                           ("Accuracy ", accuracy)):
        log(caption + str(value))
    log(separator)

    return precision, recall, accuracy, f1

def log(x):
    """Emit one log line.

    Currently prints to standard output; this is the single place to change
    if the notebook should write to a log file instead.
    """
    print(x)

In [21]:
from sklearn.naive_bayes import BernoulliNB

# Column 0 of `bow` is the label; the remaining columns are the features.
# train_size and test_size are both given: the split is still 70/30, but it
# no longer relies on the implicit complement that triggers scikit-learn's
# FutureWarning.
X_train, X_test, y_train, y_test = train_test_split(bow.iloc[:, 1:], bow.iloc[:, 0],
                                                    train_size=0.7, test_size=0.3,
                                                    stratify=bow.iloc[:, 0],
                                                    random_state=seed)
precision, recall, accuracy, f1 = test_classifier(X_train, y_train, X_test, y_test, BernoulliNB())


From version 0.21, test_size will always complement train_size unless both are specified.




Testing BernoulliNB
Learing time 4.4743218421936035s
Predicting time 1.7126679420471191s
            Anger     Happiness     Sadness      Worry
F1       [0.         0.50793651 0.28394552 0.59884774]
Precision[0.         0.56603774 0.39359268 0.51427764]
Recall   [0.         0.46065259 0.22207876 0.71670607]
Accuracy 0.5073024810839345



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.

