In [1]:
import pandas as pd

In [2]:
# Load the raw tweets. The file has no header row, so pandas assigns integer column labels.
df = pd.read_csv('tweets.csv', encoding='latin-1', header=None)

# Shuffle the rows. A fixed seed makes the shuffle (and every result derived from it)
# reproducible after "Restart Kernel -> Run All".
SEED = 42
df = df.sample(frac=1, random_state=SEED)

In [3]:
# Give the integer-labelled columns descriptive names (label i -> COLUMN_NAMES[i]).
COLUMN_NAMES = ['target', 'id', 'date', 'query', 'username', 'content']
df = df.rename(columns=dict(enumerate(COLUMN_NAMES)))

In [4]:
# Remove the metadata columns that the rest of the notebook does not need.
df = df.drop(columns=['id', 'date', 'query', 'username'])

In [5]:
### Tokenize the tweets. The token lists form the corpus for a Word2Vec model ###
### whose vectors are later used to build the Keras embedding layer.         ###
import gensim  # NLP library
tweet_text = df['content']
tokenized_tweets = tweet_text.apply(gensim.utils.simple_preprocess)
print(tokenized_tweets.head())

452826     [forwardadam, know, election, via, twitter, sa...
1158876              [regalaffair, hey, you, are, up, early]
21945                              [in, bed, wid, bad, cold]
617015     [the, zoo, was, amazing, minus, the, giraffes,...
1220931                                           [it, over]
Name: content, dtype: object


In [6]:
# Word2Vec hyper-parameters.
CONTEXT_WINDOW = 7  # maximum distance between the target word and a context word
MIN_WORD_COUNT = 2  # words that occur fewer than this many times in the corpus are ignored
N_WORKERS = 4       # number of worker threads used for training

word2vec_model = gensim.models.Word2Vec(
    window=CONTEXT_WINDOW,
    min_count=MIN_WORD_COUNT,
    workers=N_WORKERS,
)

In [7]:
# Build the vocabulary from the tokenized tweets; progress_per controls how often
# progress is reported while the corpus is scanned.
VOCAB_PROGRESS_EVERY = 1000
word2vec_model.build_vocab(tokenized_tweets, progress_per=VOCAB_PROGRESS_EVERY)

# Number of training passes over the corpus (the gensim default is 5).
N_EPOCHS = 3
word2vec_model.epochs = N_EPOCHS

In [8]:
import os

# exist_ok=True makes this cell safe to re-run and removes the check-then-create race
# of a separate os.path.exists() test.
os.makedirs("gensim_model", exist_ok=True)

# corpus_count = total number of sentences (tweets) seen by build_vocab.
word2vec_model.train(
    tokenized_tweets,
    total_examples=word2vec_model.corpus_count,
    epochs=word2vec_model.epochs,
)

# Save the model so it can be reloaded (or trained further) later.
# NOTE(review): the backslash separator is Windows-specific. On POSIX systems this writes a
# file literally named 'gensim_model\VectorizedTweets.model' in the working directory rather
# than inside the folder created above. It is kept unchanged here so it keeps matching the
# path used when the model is loaded; change both together (e.g. with os.path.join).
word2vec_model.save('gensim_model\\VectorizedTweets.model')

In [9]:
### Reload the trained model from disk ###
from gensim.models import Word2Vec
saved_model_path = "gensim_model\\VectorizedTweets.model"
word2vec_model = Word2Vec.load(saved_model_path)

## Test the model

In [10]:
# Words whose vectors are most similar to "happy", with their similarity scores.
happy_neighbours = word2vec_model.wv.most_similar("happy")
print(happy_neighbours)

[('celebrating', 0.6401989459991455), ('janell', 0.5958845615386963), ('present', 0.5867543816566467), ('implore', 0.5770038366317749), ('blessed', 0.5501918196678162), ('pleased', 0.5448803305625916), ('presents', 0.5438183546066284), ('yuky', 0.5434587597846985), ('outragous', 0.5378299951553345), ('bash', 0.5368425846099854)]


In [11]:
# Similarity score between the vectors of "happy" and "sad".
happy_sad_similarity = word2vec_model.wv.similarity("happy", "sad")
print(happy_sad_similarity)

0.53194535


In [12]:
# Look up the learned vector for "happy" and display it.
happy_vector = word2vec_model.wv["happy"]
happy_vector

array([ 7.7209848e-01,  2.9098938e+00, -2.2881694e+00,  1.1564962e+00,
       -7.0474006e-02,  1.9252822e+00,  2.8932507e+00, -2.0322993e+00,
        5.2276370e-04, -1.2602391e+00, -1.0535733e+00,  1.2958065e+00,
        2.4510903e+00,  1.4463763e+00,  2.2114964e-01,  1.3051102e+00,
       -1.9796247e+00, -1.0180608e+00, -3.7736886e+00, -4.1302692e-02,
       -3.5985131e+00, -9.0370959e-01,  1.3302295e+00, -2.0401471e+00,
       -2.0792520e+00,  7.6381743e-01, -8.2763106e-01, -7.6384658e-01,
        1.3098241e+00, -2.0987318e+00, -9.6480125e-01,  3.7115054e+00,
       -6.6492420e-01,  1.1560215e+00, -2.0045030e+00,  9.9720895e-01,
       -2.7270319e+00, -3.6406973e-01, -3.1315205e+00,  5.2508420e-01,
        2.0887537e-01,  2.4835315e+00, -9.7937882e-01, -3.6917013e-01,
        1.8493687e+00, -2.1091148e-01, -7.5071257e-01,  1.3299061e+00,
        8.9197558e-01,  9.8119003e-01, -1.4542979e-02, -4.9188328e-01,
       -6.3787031e-01, -3.6916361e+00, -1.0722351e+00, -2.9612443e+00,
      

In [14]:
# Print the 3 most common words (the first three entries of index_to_key).
for position in range(3):
    print(word2vec_model.wv.index_to_key[position])

to
the
my


In [15]:
# Print the 3 least common words (the last three entries of index_to_key, last one first).
for position in (-1, -2, -3):
    print(word2vec_model.wv.index_to_key[position])

kenno
jay_rachinea
splashin


In [20]:
# Vocabulary index assigned to "happy".
happy_index = word2vec_model.wv.key_to_index["happy"]
happy_index

0

In [22]:
# Ask the model which word does not belong with the others.
candidate_words = ["green", "blue", "red", "zebra"]
odd_word_out = word2vec_model.wv.doesnt_match(candidate_words)
odd_word_out

'zebra'

## Word2Vec as an Embedding layer

In [25]:
# Convert the trained word vectors into a NumPy matrix that can be used to
# initialise an embedding layer in TensorFlow / Keras.
import numpy as np

# Row i of wv.vectors is the vector of wv.index_to_key[i], so the whole matrix can be
# copied in one step instead of looking up every word in a Python loop. (A lookup such as
# wv[word] raises KeyError for unknown words and never returns None, so no None check is
# needed.) astype(np.float64) returns an independent copy with the same dtype that
# np.zeros() would have produced.
# NOTE(review): no row is reserved for padding or out-of-vocabulary tokens - confirm that
# the sequences fed to the embedding layer only contain these vocabulary indices.
embedding_matrix = word2vec_model.wv.vectors.astype(np.float64)
embedding_matrix

array([[ 2.40618587e+00, -4.43708986e-01, -5.81877947e-01, ...,
        -1.88972306e+00,  4.82971877e-01, -1.30342066e+00],
       [ 2.72394753e+00,  9.42519784e-01,  1.34451652e+00, ...,
        -1.26569605e+00,  1.64840174e+00, -3.77694488e+00],
       [ 3.01912665e+00, -4.37624514e-01, -2.86220026e+00, ...,
        -2.50975728e+00,  2.63309591e-02, -4.38553762e+00],
       ...,
       [-1.85065977e-02,  3.52748968e-02,  2.82073170e-02, ...,
        -1.62989236e-02, -5.30586578e-03, -1.60317123e-02],
       [-1.05304271e-02, -4.01071506e-03, -1.05484780e-02, ...,
        -8.12941231e-03, -1.15895560e-02,  2.96911894e-04],
       [-2.13941233e-03,  1.18429000e-02, -1.47233577e-02, ...,
        -3.04355994e-02, -2.11580135e-02, -3.12503800e-02]])

In [27]:

def getMaxWordLength():
    """Return the number of characters in the longest word of the Word2Vec vocabulary.

    Reads the module-level ``word2vec_model`` and returns 0 when its vocabulary is empty.
    """
    vocabulary = word2vec_model.wv.key_to_index.keys()
    return max((len(word) for word in vocabulary), default=0)

In [29]:
# Dimensionality of each word vector.
embedding_dim = word2vec_model.wv.vector_size
embedding_dim

100

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input

# The model input is a sequence of word indices, so its length is the number of tokens in
# the longest tokenized tweet. getMaxWordLength() is not suitable here: it measures the
# number of characters in the longest vocabulary word.
MAX_SEQUENCE_LENGTH = int(tokenized_tweets.map(len).max())

# Frozen embedding layer initialised with the Word2Vec vectors.
# - input_dim (vocabulary size) and output_dim (vector size) are both passed by keyword;
#   passing the vector size positionally together with input_dim=... raises
#   "TypeError: got multiple values for argument 'input_dim'".
# - The pretrained matrix is supplied through a Constant initializer rather than the
#   legacy `weights=` argument; the sequence length is already fixed by the Input below,
#   so the deprecated `input_length=` argument is not needed.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=word2vec_model.wv.vector_size,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Note: after this line `embedding_layer` refers to the embedded output tensor, not the layer.
embedding_layer = embedding_layer(sequence_input)