In [1]:
import pickle

import gensim
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import twint
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier


In [2]:
import os

dir = './emotions_prediction'
os.chdir("PycharmProjects")
os.chdir("SocialMediaAnalysis")
print(os.getcwd())

with open("{}/data/dataset.raw.pickle".format(dir), "rb") as dataset_file:
    dataset = pickle.load(dataset_file, encoding='latin1')


/Users/r.makowiecki/PycharmProjects/SocialMediaAnalysis


In [3]:
labels = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']

# print dataset type and sample
print(type(dataset))
print(dataset['info'][:10])
print(dataset['texts'][:10])

In [4]:
X = dataset['texts']
Y = [item['label'] for item in dataset['info']]
    
#print labels sample
print(Y[:10])

[array([1., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0.]), array([0., 0., 0., 0., 0., 1., 0.]), array([0., 0., 0., 0., 0., 0., 1.]), array([1., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 1., 0., 0., 0., 0.])]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [6]:
# model taken from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM. Place in appropriate directory
model = KeyedVectors.load_word2vec_format('./emotions_prediction/model/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [7]:
def get_english_stopwords():
    file_path = './emotions_prediction/data/nltk_english_stopwords'
    with open(file_path, 'r') as stop_words_file:
        return stop_words_file.read().splitlines()


In [8]:

stop_words_list = set('for a of the and to in'.split())

def vectorize_sentence(model_w2v, sentence, stop_words=stop_words_list):
    words = gensim.utils.simple_preprocess(sentence)

    sentence_vector = []
    for word in words:
        if word not in stop_words and word in model_w2v.vocab:
            sentence_vector.append(model_w2v[word])

    sentence_vector = np.stack(sentence_vector, axis=0)

    return np.mean(sentence_vector, axis=0)

In [9]:
print(X_train[:10])
X_train_vectorized = [vectorize_sentence(model, sentence) for sentence in X_train]
X_test_vectorized = [vectorize_sentence(model, sentence) for sentence in X_test]

y_test = np.stack(y_test, axis=0)
print(y_test)


["When dropping a carton of boiling soup onto a co-worker's leg - causing great pain, and for me, severe embarassment at my own ineptitude, and shame that another person suffered as a result.", "When a colleague was rude with me because she didn't understand the subject about which I was discussing.", 'A holiday at a whitesand deserted beach. A cool evening near the end of summer and we had a fire in the sand - toasted marshmellows, cuddled and drank champagne.', 'I lost a close friend.', 'This is a kind of confession - my mum found me stealing meat from the pot.', "When I found my boyfriend, who I had been dating for a year, at another girl's house. We had had previous conflicts concerning the girl, but he had always claimed they were just friends.", 'I was skiing with friends and they wanted to ski-jump. I did not want to be left behind, so I braced myself and followed them.', 'Heard about someone telling lies about me to my best friend.', 'Got a big fish in fishing.', 'The time I le

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
def save_binarized_model(model, name):
    with open(name + '.pkl', 'wb') as save_file:
        pickle.dump(model, save_file, pickle.HIGHEST_PROTOCOL)

def load_binarized_model(name):
    with open(name + '.pkl', 'rb') as load_file:
        return pickle.load(load_file)

def decode_one_hot(labels):
    return [np.where(label == 1)[0][0] for label in labels]

print(y_test)
print(decode_one_hot(y_test))


In [11]:
#learn the model against various architectures
best_accuracy = .0
best_mlp = None

for hidden_layers_size in [(250,), (220,), (200,), (180,), (150,), (100,), (200, 100), (200, 50), (220, 30), (200, 10)]:
    print('Learning MLP with hidden layers of size {}'.format(hidden_layers_size))

    mlp_classifier = MLPClassifier(hidden_layer_sizes=hidden_layers_size, alpha=0.05, max_iter=300, random_state=1337)
    mlp_classifier.fit(X_train_vectorized, decode_one_hot(y_train))

    predictions = mlp_classifier.predict(X_test_vectorized)
    predictions = np.stack(predictions, axis=0)

    current_accuracy = accuracy_score(decode_one_hot(y_test), predictions)
    print('Achieved accuracy {}'.format(current_accuracy))
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_mlp = mlp_classifier

save_binarized_model(best_mlp, './emotions_prediction/model/mlp_model')

Learning MLP with hidden layers of size (250,)




Achieved accuracy 0.5421122994652406
Learning MLP with hidden layers of size (220,)




Achieved accuracy 0.5327540106951871
Learning MLP with hidden layers of size (200,)




Achieved accuracy 0.5548128342245989
Learning MLP with hidden layers of size (180,)




Achieved accuracy 0.5508021390374331
Learning MLP with hidden layers of size (150,)




Achieved accuracy 0.5407754010695187
Learning MLP with hidden layers of size (100,)




Achieved accuracy 0.5401069518716578
Learning MLP with hidden layers of size (200, 100)


Achieved accuracy 0.5414438502673797
Learning MLP with hidden layers of size (200, 50)


Achieved accuracy 0.5441176470588235
Learning MLP with hidden layers of size (220, 30)


Achieved accuracy 0.5300802139037433
Learning MLP with hidden layers of size (200, 10)


Achieved accuracy 0.5193850267379679




In [17]:
tweets_path = './emotions_prediction/data/tweets_on_trump.json'

c = twint.Config()
c.Search = '#trump'
c.Limit = 100
c.Lang = 'en'
c.Output = tweets_path
c.Store_json = True
twint.run.Search(c)

RuntimeError: This event loop is already running

In [28]:
for i in range(len(labels)):
    path = './data/' + labels[i] + '.json'
    
    if i == 0:
        emotion_data = pd.read_json(path, lines=True)
        emotion_data['label'] = i
    else:
        df = pd.read_json(path, lines=True)
        df['label'] = i
        emotion_data = pd.concat([emotion_data, df])

emotion_data.reset_index(inplace=True)
emotion_data = emotion_data[['tweet', 'label']]