In [37]:
from pre_processing import get_pre_process_data_test

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Flatten, MaxPooling1D, GRU, SpatialDropout1D, Bidirectional
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping

import pandas as pd

import numpy as np

import csv

In [38]:
# Preprocessed training file read line by line in get_input(); its lines get label 0.
PATH_TRAIN_NEG = '../Resources/preprocessing_neg_full.txt'
# Preprocessed training file read line by line in get_input(); its lines get label 1.
PATH_TRAIN_POS = '../Resources/preprocessing_pos_full.txt'

# Load the preprocessed datasets already computed

def get_input():
    '''
        Load the training texts, their labels and the test set.

        The positive file (PATH_TRAIN_POS) is read first, then the negative
        file (PATH_TRAIN_NEG); every line of a file is one sample.

        OUTPUTS :
            train_set : list of lines, all positive samples followed by all negative ones
            y : numpy array of labels aligned with train_set (1 = positive, 0 = negative)
            test_set : whatever get_pre_process_data_test returns for 'test_data_process.txt'
    '''

    def _read_lines(path):
        # Read the whole file and split on line boundaries (no trailing newlines kept)
        with open(path) as handle:
            return handle.read().splitlines()

    positives = _read_lines(PATH_TRAIN_POS)
    negatives = _read_lines(PATH_TRAIN_NEG)

    # Labels follow the same ordering as the concatenated samples below
    labels = np.array([1] * len(positives) + [0] * len(negatives))

    test_samples = get_pre_process_data_test(save_file_name='test_data_process.txt')

    return positives + negatives, labels, test_samples

In [39]:
def get_model_cnn_lstm(vocabulary_size, max_length):
    '''
        Build and compile the CNN + LSTM binary classifier.

        Layer stack, in order :
            Embedding (200 dimensions) -> Dropout (0.2) -> Conv1D (64 filters,
            kernel size 5, relu) -> MaxPooling1D (pool size 4) -> LSTM (200 units)
            -> Dense (1 unit, sigmoid) so the output lies in the range [0,1]

        INPUTS :
            vocabulary_size : input dimension of the embedding layer (number of word indices)
            max_length : length of each padded input sequence
        OUTPUTS :
            The compiled model (binary cross-entropy loss, adam optimizer,
            accuracy metric), ready to be trained
    '''

    stack = [
        Embedding(vocabulary_size, 200, input_length=max_length),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(200),
        Dense(1, activation='sigmoid'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network
    

In [40]:
def train_model(model, X, y, vocabulary_size, max_length, epochs=100, **fit_kwargs):
    '''
        Fit a tokenizer on the training texts, turn them into padded index
        sequences and train the model on them.

        INPUTS :
            model : compiled model to be trained
            X : the training texts (raw strings, one per sample); they are
                tokenized here, so they must NOT be vectorized beforehand
            y : the label of each text in X
            vocabulary_size : passed to the Tokenizer as num_words
            max_length : length every sequence is padded / truncated to
            epochs : number of training epochs
            fit_kwargs : optional extra keyword arguments forwarded unchanged
                to model.fit (e.g. batch_size=1024, callbacks=[...]); with none
                given the call is exactly model.fit(X, y, epochs=epochs)

        OUTPUTS :
            (model, tokenizer) : the trained model and the fitted tokenizer,
            which must be reused to encode any text given to the model later
    '''

    tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(X)

    # Keep the caller's texts under their own name instead of rebinding X
    sequences = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(sequences, maxlen=max_length)

    model.fit(padded, y, epochs=epochs, **fit_kwargs)
    return model, tokenizer

In [41]:
def make_predictions(model, test_dataset, tokenizer, max_length):
    '''
        Encode the test texts with the given tokenizer, pad them to
        max_length and run them through the model.

        INPUTS :
            model : trained model exposing predict
            test_dataset : the test texts
            tokenizer : tokenizer used to convert the texts to index sequences
            max_length : length every sequence is padded / truncated to
        OUTPUTS :
            whatever model.predict returns for the padded sequences
            (for a sigmoid output, values in the range [0,1])
    '''
    padded = pad_sequences(
        tokenizer.texts_to_sequences(test_dataset), maxlen=max_length)
    return model.predict(padded)

In [42]:
def make_submission(predictions, path='../Resources/cnn_lstm.csv'):
    '''
        Write the predictions to a CSV submission file with the header
        "Id,Prediction".

        Rows are numbered from 1 in the order of the predictions. A score
        below 0.5 is written as -1, anything else as 1.

        INPUTS :
            predictions : iterable of scores in the range [0,1], one per test sample
            path : destination file (defaults to '../Resources/cnn_lstm.csv')
    '''
    # newline='' is required by the csv module: it writes its own line endings,
    # and without it blank rows appear on platforms that translate '\n'.
    with open(path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(["Id", "Prediction"])
        # enumerate keeps one row per prediction instead of silently cutting
        # the output at a hard-coded number of ids.
        for sample_id, score in enumerate(predictions, start=1):
            writer.writerow([sample_id, -1 if score < 0.5 else 1])

In [43]:
def run_cnn_lstm(max_length=32, vocabulary_size=100000, epochs=100):
    '''
        Run the whole pipeline : load the data, build and train the model,
        predict the test set and write the submission file.

        INPUTS :
            max_length : length every tokenized text is padded / truncated to
            vocabulary_size : vocabulary size used by the tokenizer and the embedding layer
            epochs : number of training epochs given to train_model

        Called with no argument it uses max_length=32, vocabulary_size=100000
        and epochs=100. Pass a smaller epochs value for a quick trial run.
    '''
    # load the datasets
    X, y, test = get_input()

    # make and train the model
    model = get_model_cnn_lstm(
        vocabulary_size=vocabulary_size, max_length=max_length)
    model, tokenizer = train_model(
        model, X, y,
        vocabulary_size=vocabulary_size, max_length=max_length, epochs=epochs)

    # predict the test set with the tokenizer fitted during training,
    # then write the submission file
    predictions = make_predictions(model, test, tokenizer, max_length)
    make_submission(predictions)

In [44]:
# Run the full pipeline: load the data, train the model, predict the test set
# and write the submission CSV.
run_cnn_lstm()

Epoch 1/100
 5142/64597 [=>............................] - ETA: 134:49:23 - loss: 0.4542 - accuracy: 0.7784