# Word Embeddings

## Importing libraries 

In [1]:
"""
Import necessary libraries
"""
import itertools
import random
import pandas as pd
import numpy as np
import gensim
import gensim.downloader as api
import keras                                                            #pip install keras #pip install tensorflow
from tqdm import tqdm
from scipy import spatial
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec                                   #pip install gensim
from gensim.models.word2vec import Text8Corpus
from gensim import utils
from glove import Corpus, Glove                                      #pip install glove-python-binary 
from joblib import dump, load

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8" and later
# dropped the old names; prefer the new name and fall back for older releases.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
%matplotlib inline

In [2]:
"""
Read the positive and negative tweet files line by line and build the
train set as the positive lines followed by the negative lines.
"""
PATH_TRAIN_POS = '../Resources/preprocessing_pos_fp_full.txt'
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_fp_full.txt'

# errors='ignore' drops undecodable bytes instead of raising while reading.
with open(PATH_TRAIN_POS, errors='ignore') as f:
    train_pos = f.read().splitlines()

with open(PATH_TRAIN_NEG, errors='ignore') as f:
    train_neg = f.read().splitlines()

# Order matters downstream: positives first, negatives second.
train_set = [*train_pos, *train_neg]

## Count Vectorizer 

In [3]:
def we_count_vectorize(train_set):
    """
    Basic and naive word embedding: raw token counts per tweet.

    INPUT :
        train_set : list         - the tweets to vectorize
    OUTPUT :
        voc                      - the fitted vectorizer's ``vocabulary_``
        text_counts              - result of ``fit_transform`` on train_set
    """
    count_vec = CountVectorizer(lowercase=True)
    counts = count_vec.fit_transform(train_set)

    # vocabulary_ only exists once the vectorizer has been fitted above.
    return count_vec.vocabulary_, counts

In [5]:
# Fit the count vectorizer on the whole train set; `voc` is the learned
# vocabulary and `text_counts` is whatever fit_transform returned.
"Sparse matrix : would require way too much space (around 38GB)"
voc, text_counts = we_count_vectorize(train_set)

'Sparse matrix : would require way too much space (around 38GB)'

## Tf-idf

In [6]:
def we_tfIdf(train_set,n_min,n_max):
    """
    TF-IDF weighted n-gram representation of the tweets.

    INPUT :
        train_set : list         - contains all the positive and negative tweets
        n_min : int              - lower bound of the n-gram range (1 keeps single words)
        n_max : int              - upper bound of the n-gram range: 3 for up to 3-grams, 4 for up to 4-grams...
    OUTPUT :
        voc                      - the fitted vectorizer's ``vocabulary_``
        text_tfIdf               - result of ``fit_transform`` on train_set
    """
    ngram_bounds = (n_min, n_max)
    tfidf_vec = TfidfVectorizer(use_idf=True, ngram_range=ngram_bounds)
    weighted = tfidf_vec.fit_transform(train_set)

    # vocabulary_ only exists once the vectorizer has been fitted above.
    return tfidf_vec.vocabulary_, weighted

In [7]:
# TF-IDF over the whole train set with an n-gram range of (1, 3).
voc_tf, tf_idf = we_tfIdf(train_set,1,3)

## Word2Vector

In [130]:
class MyCorpus(object):
    """
    Word2Vec corpus: an iterable that yields sentences (lists of str).

    Every line of the positive file, then every line of the negative file,
    is passed through ``utils.simple_preprocess`` and yielded. The object can
    be iterated several times because the files are reopened on each pass.
    """

    def __init__(self, positive_corpus, negative_corpus):
        """
        INPUT :
            positive_corpus : string   - path of the positive text file
            negative_corpus : string   - path of the negative text file
        """
        self.positive_corpus = positive_corpus
        self.negative_corpus = negative_corpus

    def __iter__(self):
        for path in (self.positive_corpus, self.negative_corpus):
            # The context manager closes the file once it is exhausted or the
            # generator is closed, so repeated passes do not leak file handles.
            with open(path) as handle:
                for line in handle:
                    yield utils.simple_preprocess(line)

#### Do not run this cell again: it retrains the model. Run the next cell to load the saved model instead.

In [29]:
def we_word2vector_create_model(path_train_pos, path_train_neg):
    """
    Neural embedding model: train, save and return a Word2Vec model.

    INPUT :
        path_train_pos: string     - path of the positive text file
        path_train_neg: string     - path of the negative text file
    OUTPUT :
        returns the Word2Vec model trained on the corpus

    Side effects: writes the full model to 'word2vec_saved_model.joblib' and
    its word vectors to 'word2vec.wordvectors' in the working directory.
    """
    w2v = Word2Vec(sentences=MyCorpus(path_train_pos, path_train_neg))

    # Persist both the full model and the stand-alone vectors.
    w2v.save('word2vec_saved_model.joblib')
    w2v.wv.save("word2vec.wordvectors")
    return w2v

#### load the model 

In [131]:
def we_word2vector_load_model():
    """
    Load the Word2Vec model previously saved as 'word2vec_saved_model.joblib'.

    NOTE(review): earlier notes on this cell mention ..\\Resources\\train_pos.txt
    and ..\\Resources\\train_neg.txt as the training paths - confirm which
    corpus the saved model was actually built from.
    """
    saved_model_path = 'word2vec_saved_model.joblib'
    return Word2Vec.load(saved_model_path)

## GloVe

In [195]:
def create_corpus(train_set):
    """
    Create the GloVe corpus from the training set.

    Each tweet is tokenized with ``word_tokenize``; only purely alphabetic
    tokens are kept, lower-cased. Tweets of at most one character, and tweets
    left with no token, are dropped, so the result can be shorter than
    ``train_set``.

    INPUT :
        train_set : list         - the tweets (strings)
    OUTPUT :
        corpus : list            - one non-empty list of tokens per kept tweet
    """
    corpus = []
    for tweet in train_set:
        # The length check does not depend on the word, so test it once per
        # tweet; such tweets always produced an empty (discarded) entry.
        if len(tweet) <= 1:
            continue
        words = [word.lower() for word in word_tokenize(tweet) if word.isalpha()]
        # Keep only non-empty entries here instead of re-filtering the whole
        # corpus after every tweet (which was quadratic in the corpus size).
        if words:
            corpus.append(words)

    return corpus

In [196]:
def create_pretrained(path='../Resources/twitter_dict/glove.twitter.27B.200d.txt'):
    """
    Create the embedding dictionary from a pretrained GloVe text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by whitespace. By default the 200-dimensional
    Twitter file is read.

    INPUT :
        path : string            - location of the GloVe text file
    OUTPUT :
        embedding_dict : dict    - token (str) -> float32 numpy vector
    """
    embedding_dict = {}
    # Read as bytes so that split() only breaks on ASCII whitespace; the token
    # is decoded separately below.
    with open(path, 'rb') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            embedding_dict[values[0].decode()] = np.asarray(values[1:], 'float32')

    return embedding_dict

In [197]:
def create_glove_emb(train_set):
    """
    Train GloVe embeddings on our own tweets and save them as 'glove.model'.

    The tweets are split on whitespace, a co-occurrence matrix is built with a
    window of 5, and a 200-component GloVe model is fitted for 50 epochs.

    INPUT :
        train_set : list         - the tweets (strings)
    OUTPUT :
        embedding_dict : dict    - token -> numpy vector of the fitted model
    """
    cooccurrences = Corpus()
    train_splitted = [tweet.split() for tweet in train_set]
    cooccurrences.fit(train_splitted, window=5)

    glove = Glove(no_components=200, learning_rate=0.05)
    glove.fit(cooccurrences.matrix, epochs=50)
    # Attach the token -> row mapping so the saved model can resolve words.
    glove.add_dictionary(cooccurrences.dictionary)
    glove.save('glove.model')

    return {w: np.array(glove.word_vectors[id_]) for w, id_ in glove.dictionary.items()}

In [198]:
def load_embedding_dict(train_set, use_pretrained=True):
    """
    Return an embedding dictionary from one of the two available sources.

    INPUT :
        train_set : list         - the tweets; only used when use_pretrained is False
        use_pretrained : bool    - True: build from the pretrained GloVe file
                                   False: train GloVe on train_set
    OUTPUT :
        embedding_dict : dict    - as returned by the chosen builder
    """
    # Only the selected builder is called; the other one is never evaluated.
    return create_pretrained() if use_pretrained else create_glove_emb(train_set)

In [199]:
def create_sequence(corpus, max_length=None):
    """
    Tokenize the corpus into integer sequences and pad them.

    Sequences are padded (and truncated) at the end with zeros so they all
    have the same length (max_length). Labels are built as 0 for the first
    half of the corpus and 1 for the second half, then sequences and labels
    are shuffled with the same random permutation.

    NOTE(review): the labels assume the corpus has an even size and is split
    exactly half/half between the two classes - confirm against the caller,
    since an odd size makes the label array one element short.

    OUTPUT :
        tweet_pad                - padded sequences, shuffled
        word_index : dict        - the tokenizer's word -> index mapping
        nb_words : int           - number of entries in word_index
        y                        - labels, shuffled like tweet_pad
    """
    tok = Tokenizer()
    tok.fit_on_texts(corpus)
    vocab_index = tok.word_index

    encoded = tok.texts_to_sequences(corpus)
    padded = pad_sequences(encoded, maxlen=max_length, truncating='post', padding='post')

    half = int(len(corpus) / 2)
    labels = np.array(half * [0] + half * [1])

    # One shared permutation keeps every sequence aligned with its label.
    order = np.arange(padded.shape[0])
    np.random.shuffle(order)

    return padded[order], vocab_index, len(vocab_index), labels[order]

In [200]:
def we_glove(train_set,use_pretrained = True, max_length = None):
    """
    Build padded sequences and the matching GloVe embedding matrix.

    INPUT :
        train_set : list         - the tweets (strings)
        use_pretrained : bool    - True: pretrained Twitter GloVe vectors
                                   False: GloVe vectors trained on train_set
        max_length : int or None - forwarded to create_sequence as the padded length
    OUTPUT :
        sequence                 - padded, shuffled integer sequences
        glove_matrix             - (nb_words + 1, 200) array; row i holds the vector
                                   of the word with index i, zeros when unknown
        embedding_dict : dict    - the embedding dictionary that was used
        nb_words : int           - number of distinct words seen by the tokenizer
        y                        - labels aligned with sequence

    NOTE(review): the in-house embeddings are keyed by whitespace-split raw
    tokens while word_index comes from the tokenized/lower-cased corpus, so
    some words may not be found - confirm whether that is intended.
    """
    embedding_dim = 200  # both embedding sources produce 200-component vectors

    corpus = create_corpus(train_set)
    sequence, word_index, nb_words, y = create_sequence(corpus, max_length)
    embedding_dict = load_embedding_dict(train_set, use_pretrained)

    # One extra row: Keras word indices start at 1, index 0 is the padding value.
    glove_matrix = np.zeros((nb_words + 1, embedding_dim))

    for word, i in word_index.items():
        emb_vec = embedding_dict.get(word)
        # Words missing from the embedding dictionary keep their all-zero row.
        if emb_vec is not None:
            glove_matrix[i] = emb_vec

    return sequence, glove_matrix, embedding_dict, nb_words, y

## Test both pretrained and in-house GloVe word embeddings

In [201]:
# Pretrained GloVe vectors (use_pretrained defaults to True).
sequence, glove_matrix, embedding_dict, nb_words, y  = we_glove(train_set)

In [202]:
# GloVe vectors trained on train_set itself (use_pretrained=False).
sequence1, glove_matrix1, embedding_dict1, nb_words1, y1  = we_glove(train_set,False)

In [206]:
def find_similar_word(embedding_dict, emmbedes):
    """
    Rank every word of the dictionary by closeness to a reference vector.

    INPUT :
        embedding_dict : dict    - word -> embedding vector
        emmbedes                 - the reference vector to compare against
    OUTPUT :
        list of all the words, sorted by increasing Euclidean distance
        between their vector and emmbedes
    """
    def distance_to_reference(word):
        return spatial.distance.euclidean(embedding_dict[word], emmbedes)

    return sorted(embedding_dict, key=distance_to_reference)

In [207]:
# Ten nearest words to 'love' (Euclidean distance) in the pretrained embeddings.
find_similar_word(embedding_dict, embedding_dict['love'])[0:10]

['love',
 'you',
 'much',
 'always',
 'know',
 'loves',
 'miss',
 'loving',
 'true',
 'life']

In [209]:
# Ten nearest words to 'love' (Euclidean distance) in the in-house embeddings.
find_similar_word(embedding_dict1, embedding_dict1['love'])[0:10]

['love',
 'kiss","positive',
 'fan',
 'beautiful',
 'know","personally',
 'love","positive',
 'rain","positive',
 'justin","wear',
 'kt","really',
 'youu']