In [1]:
import pandas as pd
import numpy as np
import spacy
import gensim

from nltk.tokenize import word_tokenize

In [2]:
data = pd.read_csv('data/emotion_data_merged.csv')

In [3]:
data

Unnamed: 0,Sentences,Emotions
0,Dorian Gray with Rainbow Scarf #LoveWins (from...,happiness
1,@SelectShowcase @Tate_StIves ... Replace with ...,happiness
2,@Sofabsports thank you for following me back. ...,happiness
3,@britishmuseum @TudorHistory What a beautiful ...,happiness
4,@NationalGallery @ThePoldarkian I have always ...,happiness
...,...,...
433308,A reminder that >![NAME]!< unironically likes ...,happiness
433309,Hopefully they got a chance to see [NAME].,happiness
433310,Perhaps you are right and the stereotype that ...,disgust
433311,I just called the Capitol Police. They are not...,anger


In [4]:
sentences = data.Sentences.apply(gensim.utils.simple_preprocess)
sentences

0         [dorian, gray, with, rainbow, scarf, lovewins,...
1         [selectshowcase, tate_stives, replace, with, y...
2         [sofabsports, thank, you, for, following, me, ...
3         [britishmuseum, tudorhistory, what, beautiful,...
4         [nationalgallery, thepoldarkian, have, always,...
                                ...                        
433308    [reminder, that, name, unironically, likes, th...
433309        [hopefully, they, got, chance, to, see, name]
433310    [perhaps, you, are, right, and, the, stereotyp...
433311    [just, called, the, capitol, police, they, are...
433312    [couldnt, find, this, one, in, google, where, ...
Name: Sentences, Length: 433313, dtype: object

In [5]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

In [6]:
model.build_vocab(sentences, progress_per=1000)

In [7]:
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(26454205, 36029725)

In [11]:
def word_embeddings(sentence):
    words = word_tokenize(sentence.lower())
    # Initialize an empty vector
    total_vector = np.zeros(model.vector_size)
    for word in words:
        if word in model.wv:
            total_vector += model.wv[word]
    # Average the vectors
    if len(words) > 0:
        total_vector /= len(words)
    return total_vector

In [12]:
def add_embeddings_to_dataset(dataset, column):
    embeddings = []
    for sentence in dataset[column]:
        embeddings.append(word_embeddings(sentence))
    dataset['embedding'] = embeddings

In [13]:
add_embeddings_to_dataset(data, 'Sentences')

In [14]:
data

Unnamed: 0,Sentences,Emotions,embedding
0,Dorian Gray with Rainbow Scarf #LoveWins (from...,happiness,"[-0.2156420982339316, -0.39359256914920276, -0..."
1,@SelectShowcase @Tate_StIves ... Replace with ...,happiness,"[-0.24448896430078007, -0.17164118366227263, -..."
2,@Sofabsports thank you for following me back. ...,happiness,"[-0.4075755175823967, 0.047608294000383466, -0..."
3,@britishmuseum @TudorHistory What a beautiful ...,happiness,"[0.04615092230960727, 0.3647947927054606, -0.2..."
4,@NationalGallery @ThePoldarkian I have always ...,happiness,"[-0.005486067723144184, 0.2538980638439005, 0...."
...,...,...,...
433308,A reminder that >![NAME]!< unironically likes ...,happiness,"[-0.24532882845960557, -0.06553940010053338, -..."
433309,Hopefully they got a chance to see [NAME].,happiness,"[-0.0942016565664248, -0.7372868054292419, -0...."
433310,Perhaps you are right and the stereotype that ...,disgust,"[0.08553577214479446, -0.11938284923830493, 0...."
433311,I just called the Capitol Police. They are not...,anger,"[-0.31818711827509105, -0.4231173995261391, -0..."
