# Word Embeddings

## Importing libraries 

In [77]:
"""
Import necessary libraries
"""
import csv
import itertools
import random
import pandas as pd
import numpy as np
import gensim
import gensim.downloader as api
import keras                                                            #pip install keras #pip install tensorflow
from tqdm import tqdm
from scipy import spatial
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Conv1D, Dropout, Flatten, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec                                   #pip install gensim
from gensim.models.word2vec import Text8Corpus
from gensim import utils
from glove import Corpus, Glove                                      #pip install glove-python-binary 
from joblib import dump, load
from pre_processing import get_pre_process_data_test
import matplotlib.pyplot as plt
plt.style.use(style='seaborn')
%matplotlib inline

In [39]:
"""
Load full train data processed and build train set 
"""
PATH_TRAIN_POS = '../Resources/preprocessing_pos_fp_full.txt'
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_fp_full.txt'

with open(PATH_TRAIN_POS,errors = 'ignore') as f:
    train_pos = f.read().splitlines()
with open(PATH_TRAIN_NEG,errors = 'ignore') as f:
    train_neg = f.read().splitlines()

train_set = train_pos + train_neg

## Count Vectorizer 

In [3]:
"""
basic and naive method for word embedding
"""
def we_count_vectorize(train_set) :
    
    vectorizer = CountVectorizer(lowercase=True)
    text_counts = vectorizer.fit_transform(train_set)
    voc = vectorizer.vocabulary_
    
    #df = pd.DataFrame(tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["count vectorize"])
    #df = df.sort_values('Count frequency', ascending=False)
    #print(df.head(25))
    
    return voc, text_counts

In [5]:
"Sparse matrix : would require way too much space (around 38GB)"
voc, text_counts = we_count_vectorize(train_set)

'Sparse matrix : would require way too much space (around 38GB)'

## Tf-idf

In [6]:
"""
INPUT : 
    train_set : list         - contains all the positive and negative tweets
    n_min : int              - the minimal number or words in the word representation of the vocabulary (set to one)
    n_max : int              - maximal number of words : chose 3 for 3-gram, 4 for 4-gram...
"""
def we_tfIdf(train_set,n_min,n_max) : 
    
    vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(n_min, n_max))
    text_tfIdf = vectorizer.fit_transform(train_set)
    voc = vectorizer.vocabulary_

    #df = pd.DataFrame(text_tfIdf[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
    #df = df.sort_values('TF-IDF', ascending=False)
    #print (df.head(25))
    
    return voc, text_tfIdf

In [7]:
voc_tf, tf_idf = we_tfIdf(train_set,1,3)

## Word2Vector

In [130]:
"""
Create Word2vector corpus 
"""
class MyCorpus(object):
    
    def __init__(self, positive_corpus,negative_corpus):
        
        self.positive_corpus = positive_corpus
        self.negative_corpus = negative_corpus
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        
        for line in open(self.positive_corpus):            
            yield utils.simple_preprocess(line)
        for line in open(self.negative_corpus):
            yield utils.simple_preprocess(line)

#### do not run this cell again : run the next one to load the model

In [29]:
"""
neural embedding model : saves and return Word2vector model
    INPUT : 
        path_train_pos: string     
        path_train_neg: string
    OUTPUT : 
        returns the Word2vector model on the corpus 
"""
def we_word2vector_create_model(path_train_pos, path_train_neg) :
    
    corpus = MyCorpus(path_train_pos, path_train_neg)
    model = Word2Vec(sentences = corpus)
    model.save('word2vec_saved_model.joblib')
    model.wv.save("word2vec.wordvectors")

    return model

#### load the model 

In [131]:
"""
load the Word2Vector model previously saved
"""
# for path_train_pos = ..\\Resources\\train_pos.txt
# for path_train_heg = ..\\Resources\\train_neg.txt
def we_word2vector_load_model() :
    return gensim.models.Word2Vec.load('word2vec_saved_model.joblib')

## GloVe

### load 2nd best processing files

In [49]:
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_full.txt'
PATH_TRAIN_POS = '../Resources/preprocessing_pos_full.txt'

# Load the preprocessed datasets already computed

def get_input() :
    with open(PATH_TRAIN_POS) as f:
        train_pos = f.read().splitlines()
    with open(PATH_TRAIN_NEG) as f:
        train_neg = f.read().splitlines()

    train_set = train_pos + train_neg

    y = np.array(len(train_pos) * [1] + len(train_neg) * [0])

    test_set = get_pre_process_data_test(save_file_name='test_data_process.txt')

    return train_set, y, test_set

### load 1st best processing files 

In [63]:
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_fp_full.txt'
PATH_TRAIN_POS = '../Resources/preprocessing_pos_fp_full.txt'

# Load the preprocessed datasets already computed

def get_input_fp() :
    with open(PATH_TRAIN_POS) as f:
        train_pos = f.read().splitlines()
    with open(PATH_TRAIN_NEG) as f:
        train_neg = f.read().splitlines()

    train_set = train_pos + train_neg

    y = np.array(len(train_pos) * [1] + len(train_neg) * [0])

    test_set = get_pre_process_data_test(save_file_name='preprocessing_test_fp.txt')

    return train_set, y, test_set

In [64]:
"""
Create glove corpus from training set 
"""
def create_corpus(train_set):
    
    corpus=[]
    for tweet in train_set:
        words=[word.lower() for word in word_tokenize(tweet) if(len(tweet)>1 and word.isalpha())]
        corpus.append(words)
        corpus = [element for element in corpus if len(element)>0]
        
    return corpus

In [65]:
"""
Create embedding dictionary from GloVe pretrained twitter dataset
here we use the one of dimension 50
    OUTPUTS :
         The embedded matrix from the glove database
"""
def create_pretrained() : 
    
    vocabulary_size = 20000
    embedding_dict = dict()
    with open('../Resources/twitter_dict/glove.twitter.27B.50d.txt','rb') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_dict[word.decode()] = coefs
    f.close()    
    return embedding_dict, vocabulary_size

In [111]:
"""
Create our embedded matrix from our train dataset
    INPUTS : 
        train_set : list of tweets
    OUTPUTS :
       The embedded matrix made of our train set
"""
def create_glove_emb(train_set) :
    
    model = Corpus()
    train_splitted = [tweet.split() for tweet in train_set]
    model.fit(train_splitted, window = 5)
    
    glove = Glove(no_components=50, learning_rate=0.05)
    glove.fit(model.matrix, epochs=50)
    glove.add_dictionary(model.dictionary)
    glove.save('glove.model')
    
    embedding_dict = {}
    for w, id_ in glove.dictionary.items():
        embedding_dict[w] = np.array(glove.word_vectors[id_])

    return embedding_dict, len(train_set) + 1

In [67]:
"""
    INPUTS :
        train_set : list of tweets
        use_pretrained : True to use the glove pre trained
    OUTPUTS :
        the embedded dictionary
"""
def load_embedding_dict(train_set, use_pretrained=True):

    if (use_pretrained) : 
        return create_pretrained()
    else : 
        return create_glove_emb(train_set)

In [68]:
"""
convert the words of the tweets to vector, they will be used in this form during the training of the model
     INPUTS :
        train_set : list of tweets
        y : sentiments (positive or negative) of the train_set
        vector_size : dimension of the vector words
    OUTPUTS :
        The train_set in a vector form and shuffled
"""
def create_sequence(train_set, y, vector_size, tokenizer) : 
   
    tokenizer.fit_on_texts(train_set)
    sequences = tokenizer.texts_to_sequences(train_set)
    
    tweet_pad = pad_sequences(sequences,maxlen=vector_size,truncating='post',padding='post')

    indices = np.arange(tweet_pad.shape[0])
    np.random.shuffle(indices)
    tweet_pad = tweet_pad[indices]
    y = y[indices]

    return tweet_pad , y 

In [69]:
"""
Compute the embedded matrix that will be used to make the model
    INPUTS :
        train_set : list of tweets
        y : sentiments (positive or negative) of the train_set
            vector_size : dimension of the vector words
    OUTPUTS :
        returns the vectorized train_set, the embedded matrix, 
            and the sentiment shuffled accordingly
"""
def we_glove(train_set, y,use_pretrained, vector_size):
   
    embedding_dict, vocabulary_size = load_embedding_dict(train_set, use_pretrained)
    tokenizer = Tokenizer(num_words=vocabulary_size)
    sequence,y = create_sequence(train_set,y, vector_size, tokenizer)

    embedding_matrix = np.zeros((vocabulary_size, vector_size))
    for word, index in tokenizer.word_index.items():
        if index > vocabulary_size - 1:
            break
        else:
            embedding_vector = embedding_dict.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector
            
    return sequence, embedding_matrix, y, vocabulary_size, tokenizer

### Test both pretrained and in house glove word embeddings

In [115]:
sequence, glove_matrix, y, vocabuliary_size, tokenizer  = we_glove(train_set)

In [202]:
sequence1, glove_matrix1, embedding_dict1, nb_words1, y1  = we_glove(train_set,False)

In [206]:
def find_similar_word(embedding_dict, emmbedes):
    nearest = sorted(embedding_dict.keys(), key=lambda word: spatial.distance.euclidean(embedding_dict[word], emmbedes))
    return nearest

In [207]:
find_similar_word(embedding_dict, embedding_dict['love'])[0:10]

['love',
 'you',
 'much',
 'always',
 'know',
 'loves',
 'miss',
 'loving',
 'true',
 'life']

In [209]:
find_similar_word(embedding_dict1, embedding_dict1['love'])[0:10]

['love',
 'kiss","positive',
 'fan',
 'beautiful',
 'know","personally',
 'love","positive',
 'rain","positive',
 'justin","wear',
 'kt","really',
 'youu']

### train on glove

In [70]:
def get_model(embedding_matrix, vocabulary_size, vector_dimension) :
    '''
        compute the model using 3 hidden layers and a sigmoid activation

        INPUTS : 
            embedding_matrix : the glove embedded matrix of size vocabulary_size * vector_dimension
        OUTPUTS :
            The model ready to be trained
    '''
    model = Sequential()
    model.add(
        Embedding(
            vocabulary_size,
            vector_dimension,
            input_length=50,
            weights=[embedding_matrix],
            trainable=False))
    model.add(
        Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(
        Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(
        Conv1D(filters=32, kernel_size=7, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Dropout(0.2))
    model.add(
        Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='mean_squared_error', optimizer='Adam', metrics=["acc"])
    model.summary()
    return model

In [78]:
def train_model(model, X, y) :
    '''
        Train the model with the sentiments of the train_set
        INPUTS :
            model : model to be trained
            X : the vectorize form of the train set
            y : the sentiment of each tweet in X

        OUTPUTS :
            the trained model
    '''
    model.fit(
        X,
        y,
        batch_size=200,
        verbose=1,
        validation_split=0.2,
        epochs=5,
        callbacks=[
                ModelCheckpoint(
                filepath='Embeddings_best_weights.hdf5',
                monitor='val_acc',
                verbose=1,
                save_best_only=True,
                mode='max'),
                EarlyStopping(
                    monitor='val_acc', patience=3, mode='max')
                ])
    return model

In [79]:
def make_predictions(model, test, tokenizer):
    '''
        Put the test tweets in vector forms and predict them with the model
        OUTPUTS :
            the predictions, each predictions is in the range [0,1]
    '''
    test_sequences = tokenizer.texts_to_sequences(test)
    test = pad_sequences(test_sequences, maxlen=50)
    return model.predict(test)

In [80]:
def make_submission(predictions) :
    '''
        write the predictions in the glove_results file
        INPUTS :
            prediction : 10 000 sentiments of the test tweets in range [0,1]
    '''
    predictions =list(zip(range(1, 10001),predictions))
    with open('../Resources/glove_result.csv', 'w') as out:
        writer = csv.writer(out)
        writer.writerow(["Id", "Prediction"])
        for a,b in predictions:
            if b < 0.5:
                writer.writerow([a, -1])
            else:
                writer.writerow([a, 1])

In [113]:
def run_glove(vector_size = 50, use_pretrained=True) :

    # load the pre processed input : 2nd best preprocessing
    X, y, test = get_input()
    
    # load the pre processed input :  best preprocessing
    #X, y, test = get_input_fp()

    # compute the embedded matrix
    sequence, glove_matrix, y, vocabulary_size, tokenizer  = we_glove(X,y,use_pretrained, vector_size)

    # create a model and train it with our train dataset
    model = get_model(glove_matrix, vocabulary_size,vector_size)
    model_trained = train_model(model,sequence,y)

    # make the predictions of our test dataset with our model
    predictions = make_predictions(model_trained, test, tokenizer)
    make_submission(predictions)
    return model_trained, predictions

## Run models with different preprocessing 

### 2nd best preprocessing with pretrained word embedding

In [84]:
model_trained, predictions = run_glove()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 50, 50)            1000000   
                                                                 
 conv1d_24 (Conv1D)          (None, 50, 128)           32128     
                                                                 
 max_pooling1d_24 (MaxPoolin  (None, 25, 128)          0         
 g1D)                                                            
                                                                 
 dropout_24 (Dropout)        (None, 25, 128)           0         
                                                                 
 conv1d_25 (Conv1D)          (None, 25, 64)            49216     
                                                                 
 max_pooling1d_25 (MaxPoolin  (None, 12, 64)           0         
 g1D)                                                 

### best preprocessing with pretrained word embedding

In [107]:
model_trained_fp, predictions_fp = run_glove()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 50)            1000000   
                                                                 
 conv1d_28 (Conv1D)          (None, 50, 128)           32128     
                                                                 
 max_pooling1d_28 (MaxPoolin  (None, 25, 128)          0         
 g1D)                                                            
                                                                 
 dropout_28 (Dropout)        (None, 25, 128)           0         
                                                                 
 conv1d_29 (Conv1D)          (None, 25, 64)            49216     
                                                                 
 max_pooling1d_29 (MaxPoolin  (None, 12, 64)           0         
 g1D)                                                 

### run 2nd best preprocessing with our own glove embedding

In [114]:
model_trained1, predictions1 = run_glove(vector_size = 50, use_pretrained=False)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 50, 50)            113057450 
                                                                 
 conv1d_36 (Conv1D)          (None, 50, 128)           32128     
                                                                 
 max_pooling1d_36 (MaxPoolin  (None, 25, 128)          0         
 g1D)                                                            
                                                                 
 dropout_36 (Dropout)        (None, 25, 128)           0         
                                                                 
 conv1d_37 (Conv1D)          (None, 25, 64)            49216     
                                                                 
 max_pooling1d_37 (MaxPoolin  (None, 12, 64)           0         
 g1D)                                                

### run best preprocessing with our own glove embedding

In [112]:
model_trained2, predictions2 = run_glove(vector_size = 50, use_pretrained=False)

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 50, 50)            113057450 
                                                                 
 conv1d_32 (Conv1D)          (None, 50, 128)           32128     
                                                                 
 max_pooling1d_32 (MaxPoolin  (None, 25, 128)          0         
 g1D)                                                            
                                                                 
 dropout_32 (Dropout)        (None, 25, 128)           0         
                                                                 
 conv1d_33 (Conv1D)          (None, 25, 64)            49216     
                                                                 
 max_pooling1d_33 (MaxPoolin  (None, 12, 64)           0         
 g1D)                                                 