# Word Embeddings

## Importing libraries 

In [1]:
import itertools
import random
import pandas as pd
import numpy as np
import gensim
import keras                                                            #pip install keras #pip install tensorflow
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec                                   #pip install gensim
from gensim.models.word2vec import Text8Corpus
import gensim.downloader as api
from glove import Corpus, Glove                                      #pip install glove-py
from joblib import dump, load
from gensim import utils
import matplotlib.pyplot as plt
plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
# Locations of the pre-processed training tweets (relative to the notebook).
PATH_TRAIN_NEG = '../Resources/train_neg_processed.txt'
PATH_TRAIN_POS = '../Resources/train_pos_processed.txt'

# Read both files in full; each line becomes one entry of the lists.
with open(PATH_TRAIN_POS) as pos_file, open(PATH_TRAIN_NEG) as neg_file:
    train_pos = pos_file.read().splitlines()
    train_neg = neg_file.read().splitlines()

# Full training set: positive entries first, negative entries after.
train_set = [*train_pos, *train_neg]

## Count Vectorizer 

In [3]:
"basic and naive method for word embedding"
def we_count_vectorize(train_set):
    """Build a bag-of-words count representation of ``train_set``.

    A lower-casing ``CountVectorizer`` is fitted on ``train_set`` and used to
    transform it in a single ``fit_transform`` call.

    Args:
        train_set: Iterable of raw text documents.

    Returns:
        A ``(voc, text_counts)`` tuple: the fitted vectorizer's
        ``vocabulary_`` and the matrix returned by ``fit_transform``.
    """
    count_model = CountVectorizer(lowercase=True)
    counts = count_model.fit_transform(train_set)
    return count_model.vocabulary_, counts

In [4]:
"Sparse matrix : would require way too much space (around 38GB)"
# Fit the count vectorizer on the whole training set and keep both the
# vocabulary and the resulting count matrix.
voc, text_counts = we_count_vectorize(train_set)

## Tf-idf

In [5]:
def we_tfIdf(train_set, n_min, n_max):
    """Build a TF-IDF representation of ``train_set`` over word n-grams.

    Args:
        train_set: Iterable of raw text documents.
        n_min: Lower bound of the n-gram range passed to the vectorizer.
        n_max: Upper bound of the n-gram range passed to the vectorizer.

    Returns:
        A ``(voc, text_tfIdf)`` tuple: the fitted vectorizer's
        ``vocabulary_`` and the matrix returned by ``fit_transform``.
    """
    ngram_bounds = (n_min, n_max)
    tfidf_model = TfidfVectorizer(use_idf=True, ngram_range=ngram_bounds)
    weights = tfidf_model.fit_transform(train_set)
    return tfidf_model.vocabulary_, weights

## Word2Vec

In [6]:
class MyCorpus(object):
    """An iterable that yields sentences (lists of str) from two text files.

    Every line of the positive file, then every line of the negative file,
    is passed through ``utils.simple_preprocess`` and yielded. Each call to
    ``iter()`` re-opens the files, so the corpus can be traversed several
    times.

    Args:
        positive_corpus: Path of the file holding the positive examples.
        negative_corpus: Path of the file holding the negative examples.
    """

    def __init__(self, positive_corpus, negative_corpus):
        self.positive_corpus = positive_corpus
        self.negative_corpus = negative_corpus

    def __iter__(self):
        """Yield one pre-processed token list per line, positives first."""
        for path in (self.positive_corpus, self.negative_corpus):
            # `with` guarantees the handle is closed once the file has been
            # consumed (or the generator is discarded); the bare
            # `for line in open(...)` form left that to the garbage collector.
            with open(path) as corpus_file:
                for line in corpus_file:
                    yield utils.simple_preprocess(line)

#### Do not run this cell again: run the next one to load the saved model instead

In [29]:
"""
neural embedding model
"""
def we_word2vector_create_model(path_train_pos, path_train_neg):
    """Train a Word2Vec model on the two tweet files and persist it.

    The sentences are streamed through ``MyCorpus``. The full model is saved
    to ``word2vec_saved_model.joblib`` and its word vectors to
    ``word2vec.wordvectors``, both in the current working directory.

    Args:
        path_train_pos: Path of the positive training file.
        path_train_neg: Path of the negative training file.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentence_stream = MyCorpus(path_train_pos, path_train_neg)
    w2v = Word2Vec(sentences=sentence_stream)

    # Persist the whole model first, then only its keyed vectors.
    w2v.save('word2vec_saved_model.joblib')
    w2v.wv.save("word2vec.wordvectors")
    return w2v

#### Load the model

In [7]:
# NOTE(review): presumably the saved model was created with
#   path_train_pos = ..\\Resources\\train_pos.txt
#   path_train_neg = ..\\Resources\\train_neg.txt
# -- confirm against the training run.
def we_word2vector_load_model():
    """Load the Word2Vec model saved as ``word2vec_saved_model.joblib``.

    Returns:
        The model returned by ``gensim.models.Word2Vec.load``.
    """
    saved_model_path = 'word2vec_saved_model.joblib'
    return gensim.models.Word2Vec.load(saved_model_path)

## GloVe

In [7]:
def create_corpus(train_set):
    """Tokenize every tweet into a list of lower-cased alphabetic words.

    Each tweet is split with ``word_tokenize``; only tokens for which
    ``str.isalpha()`` is true are kept, lower-cased. Tweets that end up with
    no token at all are left out of the result, so the returned list may be
    shorter than ``train_set``.

    Args:
        train_set: Iterable of tweet strings.

    Returns:
        A list of non-empty word lists, in the order of ``train_set``.
    """
    corpus = []
    for tweet in train_set:
        # Tweets of at most one character never contribute any word.
        # NOTE(review): the original per-word condition tested len(tweet),
        # not len(word) -- confirm whether one-letter words were meant to be
        # dropped instead. The behavior is kept unchanged here.
        if len(tweet) <= 1:
            continue
        words = [word.lower() for word in word_tokenize(tweet) if word.isalpha()]
        # Skip empty results right away rather than re-filtering the whole
        # corpus on every iteration (which was quadratic in the corpus size).
        if words:
            corpus.append(words)
    return corpus

In [8]:
# GloVe pretrained twitter dataset
def load_pretrained(path='../Resources/twitter_dict/glove.twitter.27B.200d.txt'):
    """Load pretrained GloVe vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by its vector
    components. The file is read in binary mode, so the dictionary keys are
    ``bytes`` objects: callers must look words up with ``word.encode()``.

    Args:
        path: Location of the GloVe file. Defaults to the 200-dimensional
            Twitter vectors used by this notebook.

    Returns:
        A dict mapping each token (``bytes``) to a ``float32`` numpy array.
    """
    embedding_dict = {}
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'rb') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank line has no token; values[0] would raise IndexError.
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], 'float32')
    return embedding_dict

In [9]:
# Load the pretrained GloVe vectors once at notebook level.
# NOTE(review): the variable is spelled `enbedding_dict` (sic); the name is
# kept because other cells may reference it -- confirm before renaming.
enbedding_dict = load_pretrained()

In [10]:
def create_sequence(corpus, max_length=None):
    """Turn a corpus into shuffled, padded index sequences with labels.

    A Keras ``Tokenizer`` is fitted on ``corpus``; the resulting integer
    sequences are padded and truncated at the end (``'post'``) to
    ``max_length``. Labels are generated positionally: the first half of the
    corpus gets ``0`` and the remainder gets ``1``. Sequences and labels are
    then shuffled together with ``np.random.shuffle``.

    NOTE(review): the labels assume the corpus is two equally sized classes
    in order. ``train_set`` is built as positives followed by negatives, so
    positives receive label 0, and ``create_corpus`` drops empty tweets, which
    can shift the class boundary -- confirm this is intended.

    Args:
        corpus: Texts accepted by ``Tokenizer.fit_on_texts`` (here,
            presumably the word lists produced by ``create_corpus``).
        max_length: ``maxlen`` forwarded to ``pad_sequences``; ``None`` pads
            to the longest sequence.

    Returns:
        A ``(tweet_pad, word_index, y)`` tuple: the shuffled padded
        sequences, the tokenizer's word-to-index mapping, and the labels in
        the same shuffled order.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(corpus)
    tweet_pad = pad_sequences(sequences, maxlen=max_length, truncating='post', padding='post')

    corpus_size = len(corpus)
    half = corpus_size // 2
    # Give the second class the remainder so that y always has exactly one
    # label per sequence; with an odd corpus size the former
    # `2 * int(corpus_size / 2)` labels made `y[indices]` raise IndexError.
    y = np.array(half * [0] + (corpus_size - half) * [1])

    # Apply one shared permutation to keep sequences and labels aligned.
    indices = np.arange(tweet_pad.shape[0])
    np.random.shuffle(indices)
    tweet_pad = tweet_pad[indices]
    y = y[indices]

    return tweet_pad, word_index, y

In [15]:
# Tokenize the training tweets, then build the padded sequences (seq), the
# tokenizer's word index (wi) and the labels (y).
corpus = create_corpus(train_set)
seq, wi,y = create_sequence(corpus)

In [41]:
def we_glove(train_set, word_index, embedding_dict=None):
    """Build the GloVe embedding matrix for the words of ``word_index``.

    Row ``i`` of the matrix holds the pretrained vector of the word whose
    index is ``i``; rows of words without a pretrained vector (and row 0,
    which no word uses when indices start at 1) stay all-zero.

    NOTE(review): the original returned a name ``word_embedding`` that was
    never defined in the function, which raises NameError unless a global of
    that name happens to exist. It is now built here as a ``{word: vector}``
    dict of the words found in GloVe -- confirm this matches the intended
    meaning.

    Args:
        train_set: Unused; kept so existing callers keep working.
        word_index: Mapping from word (``str``) to its integer index.
        embedding_dict: Optional mapping from encoded word (``bytes``) to
            vector, as returned by ``load_pretrained``. When ``None`` the
            pretrained file is loaded, which is slow; pass an already loaded
            dictionary to avoid reading it again.

    Returns:
        A ``(glove_matrix, word_embedding)`` tuple: an array of shape
        ``(len(word_index) + 1, 200)`` and the dict described above.
    """
    if embedding_dict is None:
        embedding_dict = load_pretrained()

    num_words = len(word_index) + 1
    glove_matrix = np.zeros((num_words, 200))
    word_embedding = {}

    for word, i in word_index.items():
        # Defensive: ignore indices that would fall outside the matrix.
        if i >= num_words:
            continue

        # Keys of the pretrained dictionary are bytes (file read in 'rb').
        emb_vec = embedding_dict.get(word.encode())
        if emb_vec is not None:
            glove_matrix[i] = emb_vec
            word_embedding[word] = emb_vec

    return glove_matrix, word_embedding

In [42]:
# Build the GloVe matrix for the tokenizer's word index.
glove_matrix, word_embedding = we_glove(train_set, wi)
# NOTE(review): this is len(wi), whereas we_glove sizes the matrix with
# len(word_index) + 1 rows -- confirm which value later cells expect.
num_words = len(wi)

In [43]:
# Display the dimensions of the embedding matrix.
glove_matrix.shape

(69234, 200)