In [21]:
import numpy as np
import csv

from glove import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, Flatten, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

from pre_processing import get_pre_process_data_test

from gensim.models import Word2Vec


In [22]:
# File of preprocessed negative training tweets; get_input reads it line by line and labels each line 0.
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_full'
# File of preprocessed positive training tweets; get_input reads it line by line and labels each line 1.
PATH_TRAIN_POS = '../Resources/preprocessing_pos_full'

# Load the preprocessed datasets already computed

def get_input() :
    '''
        Read the preprocessed training tweets and build the matching labels
        OUTPUTS :
            train_set : list of tweets, the positive ones first then the negative ones
            y : numpy array of labels, 1 for each positive tweet and 0 for each negative one
            test_set : value returned by get_pre_process_data_test
    '''
    def read_lines(path) :
        # one list entry per line of the file, line endings removed
        with open(path) as handle:
            return handle.read().splitlines()

    positives = read_lines(PATH_TRAIN_POS)
    negatives = read_lines(PATH_TRAIN_NEG)

    # labels follow the same order as the concatenated tweets
    labels = [1] * len(positives) + [0] * len(negatives)

    test_set = get_pre_process_data_test(save_file_name='test_data_process.txt')

    return positives + negatives, np.array(labels), test_set

In [23]:

def create_pretrained(path='../Resources/glove.twitter.27B.50d.txt', vocabulary_size=20000) :
    '''
        Load pre-trained glove word vectors from a text file
        INPUTS :
            path : text file holding, on each line, a word followed by its
                coefficients, separated by whitespace
            vocabulary_size : vocabulary size handed back unchanged to the caller
        OUTPUTS :
            embedding_dict : dictionary mapping each word to a float32 numpy vector
            vocabulary_size : the vocabulary_size argument
    '''
    embedding_dict = dict()
    # The context manager closes the file even when a line fails to parse, and
    # the explicit encoding keeps non-ASCII tokens independent of the platform default.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line : there is no word to index
                continue
            embedding_dict[values[0]] = np.asarray(values[1:], dtype='float32')

    return embedding_dict, vocabulary_size

In [24]:
def create_glove_emb(train_set, no_components=200) :
    '''
        Fit a glove model on our train dataset and expose it as a dictionary
        INPUTS : 
            train_set : list of tweets, each one is split on whitespace
            no_components : dimension of the learned word vectors (defaults to
                200, the value that used to be hard-coded)
        OUTPUTS :
            embedding_dict : dictionary mapping each word to its numpy vector
            vocabulary_size : number of learned words + 1

        The fitted model is also saved to the file 'glove.model'.
    '''
    corpus = Corpus()
    corpus.fit([tweet.split() for tweet in train_set], window = 5)
    
    glove = Glove(no_components=no_components, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=50)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    
    embedding_dict = {
        w: np.array(glove.word_vectors[id_])
        for w, id_ in glove.dictionary.items()
    }

    # The vocabulary size is the number of distinct words, not the number of
    # tweets. The extra slot is for index 0, which the Keras Tokenizer never
    # assigns to a word.
    return embedding_dict, len(embedding_dict) + 1

In [25]:
def load_embedding_dict(train_set, use_pretrained=True):
    '''
        Pick where the word embeddings come from
        INPUTS :
            train_set : list of tweets, only used when use_pretrained is falsy
            use_pretrained : truthy to load the pre-trained glove vectors,
                falsy to fit glove on train_set
        OUTPUTS :
            the value returned by the selected loader
            (create_pretrained or create_glove_emb)
    '''
    if not use_pretrained :
        return create_glove_emb(train_set)
    return create_pretrained()

In [26]:
def create_sequence(train_set, y, vector_size, tokenizer) : 
    '''
        Turn the tweets into padded sequences of word indices and shuffle them
        together with their labels
        INPUTS :
            train_set : list of tweets
            y : numpy array with the sentiment of each tweet of train_set
            vector_size : length every sequence is padded or truncated to
                (passed to pad_sequences as maxlen)
            tokenizer : tokenizer to fit on train_set, it is modified in place
        OUTPUTS :
            The padded sequences and the sentiments, shuffled with the same
            random permutation
    '''
    tokenizer.fit_on_texts(train_set)
    padded = pad_sequences(
        tokenizer.texts_to_sequences(train_set),
        maxlen=vector_size,
        padding='post',
        truncating='post')

    # one shared permutation keeps every tweet aligned with its label
    order = np.arange(padded.shape[0])
    np.random.shuffle(order)

    return padded[order], y[order]

In [27]:
def we_glove(train_set, y,use_pretrained, vector_size):
    '''
        Build the padded sequences and the embedding matrix used by the model
        INPUTS :
            train_set : list of tweets
            y : sentiments (positive or negative) of the train_set
            use_pretrained : forwarded to load_embedding_dict
            vector_size : used both as the padded length of the sequences and
                as the number of columns of the embedding matrix
        OUTPUTS :
            the padded and shuffled sequences, the embedding matrix,
            the sentiments shuffled accordingly, the vocabulary size
            and the fitted tokenizer
    '''
    embedding_dict, vocabulary_size = load_embedding_dict(train_set, use_pretrained)
    tokenizer = Tokenizer(num_words=vocabulary_size)
    sequence, y = create_sequence(train_set, y, vector_size, tokenizer)

    # rows of words missing from embedding_dict stay at zero
    embedding_matrix = np.zeros((vocabulary_size, vector_size))
    for word, index in tokenizer.word_index.items():
        if index >= vocabulary_size:
            # stop at the first index that does not fit in the matrix
            break
        vector = embedding_dict.get(word)
        if vector is None:
            continue
        embedding_matrix[index] = vector

    return sequence, embedding_matrix, y, vocabulary_size, tokenizer

In [28]:
def get_model(embedding_matrix, vocabulary_size, vector_dimension, input_length=50) :
    '''
        Build and compile the network : a frozen embedding layer, four
        convolution blocks (Conv1D + MaxPooling1D + Dropout) and a single
        sigmoid output

        INPUTS : 
            embedding_matrix : the glove embedded matrix of size vocabulary_size * vector_dimension
            vocabulary_size : number of rows of the embedding layer
            vector_dimension : dimension of the word vectors
            input_length : length of the padded input sequences (defaults to
                50, the value that used to be hard-coded)
        OUTPUTS :
            The compiled model ready to be trained, its summary is printed
    '''
    # (filters, kernel_size, pool_size) of each convolution block, in order
    conv_blocks = [(128, 5, 2), (64, 6, 2), (32, 7, 1), (32, 8, 1)]

    model = Sequential()
    model.add(
        Embedding(
            vocabulary_size,
            vector_dimension,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False))
    for filters, kernel_size, pool_size in conv_blocks:
        model.add(
            Conv1D(filters=filters, kernel_size=kernel_size, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))
        model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # NOTE(review): the loss is left as it was so existing results stay
    # comparable; binary_crossentropy is the usual choice for one sigmoid output.
    model.compile(loss='mean_squared_error', optimizer='Adam', metrics=["acc"])
    model.summary()
    return model

In [29]:
def train_model(model, X, y) :
    '''
        Train the model with the sentiments of the train_set
        INPUTS :
            model : compiled model to be trained
            X : the vectorize form of the train set
            y : the sentiment of each tweet in X

        OUTPUTS :
            the trained model (the same object, fitted in place)

        20% of the data is held out for validation. The weights with the best
        validation accuracy are saved to 'Embeddings_best_weights.hdf5', and
        training stops after 3 epochs without improvement.
    '''
    callbacks = [
        ModelCheckpoint(
            filepath='Embeddings_best_weights.hdf5',
            monitor='val_acc',
            verbose=1,
            save_best_only=True,
            mode='max'),
        # Without restore_best_weights the model keeps the weights of the last
        # epoch run, i.e. after the patience window, not the best ones.
        EarlyStopping(
            monitor='val_acc',
            patience=3,
            mode='max',
            restore_best_weights=True),
    ]
    model.fit(
        X,
        y,
        batch_size=200,
        verbose=1,
        validation_split=0.2,
        epochs=100,
        callbacks=callbacks)
    return model

In [30]:
def make_predictions(model, test, tokenizer):
    '''
        Put the test tweets in vector forms and predict them with the model
        INPUTS :
            model : trained model
            test : list of test tweets
            tokenizer : tokenizer already fitted on the train set
        OUTPUTS :
            the predictions, each predictions is in the range [0,1]
    '''
    test_sequences = tokenizer.texts_to_sequences(test)
    # Pad and truncate at the end, exactly like create_sequence does for the
    # training data; the Keras defaults ('pre') would lay the words out
    # differently from what the network was trained on.
    test_pad = pad_sequences(
        test_sequences, maxlen=50, truncating='post', padding='post')
    return model.predict(test_pad)

In [31]:
def make_submission(predictions, path='../Resources/glove_result.csv') :
    '''
        Write the predictions to a csv file with the columns Id and Prediction
        INPUTS :
            predictions : sentiments of the test tweets in range [0,1], as a
                flat sequence or as an array with one value per row
            path : file to write (defaults to the glove_result file)

        Ids start at 1 and follow the order of the predictions. A score below
        0.5 is written as -1, any other score as 1. Every prediction gets a
        row; the count is no longer capped at 10 000.
    '''
    # newline='' is required by the csv module, otherwise every row is
    # followed by a blank line on platforms that translate line endings.
    with open(path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(["Id", "Prediction"])
        # ravel turns the (n, 1) output of model.predict into n scalars
        for tweet_id, score in enumerate(np.ravel(predictions), start=1):
            writer.writerow([tweet_id, -1 if score < 0.5 else 1])

In [32]:
def run_glove(vector_size = 50, use_pretrained=True) :
    '''
        Run the whole pipeline : load the data, build the embeddings, train
        the model, predict the test tweets and write the submission file
        INPUTS :
            vector_size : forwarded to we_glove and to get_model
            use_pretrained : forwarded to we_glove
    '''
    # preprocessed tweets, their labels and the test tweets
    tweets, labels, test_tweets = get_input()

    # padded sequences plus the embedding matrix that goes with them
    padded, embedding_matrix, labels, vocab_size, tokenizer = we_glove(
        tweets, labels, use_pretrained, vector_size)

    # build the network and fit it on the training sequences
    model = get_model(embedding_matrix, vocab_size, vector_size)
    train_model(model, padded, labels)

    # score the test tweets and write them out
    make_submission(make_predictions(model, test_tweets, tokenizer))

In [33]:
# Run the whole pipeline with the default arguments (vector_size=50, use_pretrained=True).
run_glove()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 50)            1000000   
                                                                 
 conv1d_4 (Conv1D)           (None, 50, 128)           32128     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 25, 128)          0         
 1D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 25, 128)           0         
                                                                 
 conv1d_5 (Conv1D)           (None, 25, 64)            49216     
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 12, 64)           0         
 1D)                                                  