# Predictions with a BiLSTM model using Keras

## 0. Imports

In [None]:
# import useful libraries
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras import layers
from keras import Input

## 1. Pre-process the data into trainable input vectors

In [None]:
# Load the raw training tweets, one tweet per line.
# The files are read inside `with` blocks so the handles are closed as soon as
# the lists are built. Only the line terminator is stripped: slicing with
# [:-1] would also remove the last character of a final line that does not
# end with a newline.
with open('./train_neg.txt') as neg_file:
    train_neg = [tweet.rstrip('\n') for tweet in neg_file]
with open('./train_pos.txt') as pos_file:
    train_pos = [tweet.rstrip('\n') for tweet in pos_file]

In [None]:
# Negative tweets first, then positive ones; the label vector below relies on
# this order.
all_tweets = train_neg + train_pos

# Define the tokenizer and fit it on our tweets.
# `num_words` restricts the indices emitted by `texts_to_sequences` to the most
# frequent words (indices 1..9999). It must stay equal to `max_features`, the
# input dimension of the Embedding layer used when the model is built; without
# the cap, rarer words get indices that fall outside the embedding table.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(all_tweets)

# Each tweet becomes a sequence of integers, where every integer is the index
# of a word in the tokenizer's dictionary.
seq = tokenizer.texts_to_sequences(all_tweets)

# Pad the sequences so that all tokenized tweets in X have the same length.
X = pad_sequences(seq)

# Label vector: 0 for every negative tweet, 1 for every positive tweet.
# The counts come from the loaded lists rather than a hard-coded sample size,
# so X and y always have the same number of rows.
y = np.concatenate([
    np.zeros(len(train_neg), dtype=int),
    np.ones(len(train_pos), dtype=int),
])

# Hold out 10% of the data; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## 2. Create and train the BiLSTM model

In [None]:
# Hyper-parameters of the network.
max_features = 10000          # input dimension of the embedding layer
maxlen = x_train.shape[1]     # length of the padded training sequences
embedding_dim = 128           # size of each word vector
num_filters = 200

# --- Layers -----------------------------------------------------------------

# Integer word indices go in and are mapped to dense word vectors.
text_input = Input(shape=(None,), dtype='int32', name='text')
embedded_text = layers.Embedding(
    input_dim=max_features,
    output_dim=embedding_dim,
    input_length=maxlen,
)(text_input)

# Bidirectional LSTM: the sequence is read in both directions, so the
# classifier can use the context before and after each word of a text.
# Only the final output of each direction is kept (return_sequences=False).
x = layers.Bidirectional(
    layers.LSTM(
        units=100,
        activation='tanh',
        return_sequences=False,
        dropout=0.5,
        recurrent_dropout=0.1,
    )
)(embedded_text)

# Dropout on the recurrent features.
x = layers.Dropout(rate=0.5)(x)

# Small fully connected layer.
x = layers.Dense(units=30, activation='relu')(x)

# Single sigmoid unit for the binary classification.
output = layers.Dense(units=1, activation='sigmoid')(x)

# Assemble the layers into a model and print its structure.
model_biLSTM = Model(inputs=text_input, outputs=output)
model_biLSTM.summary()



Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text (InputLayer)           [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         1280000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              183200    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 30)                6030      
                                                                 
 dense_3 (Dense)             (None, 1)                 31        
                                                           

In [None]:
# Compile and fit the model with its operations placed on the first GPU.
# The held-out split is passed as validation data so its loss and accuracy are
# reported after every epoch.
with tf.device('/gpu:0'):
    model_biLSTM.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    history_biLSTM = model_biLSTM.fit(
        x_train,
        y_train,
        epochs=3,
        batch_size=64,
        validation_data=(x_test, y_test),
    )

Epoch 1/3
Epoch 2/3
Epoch 3/3


## 3. Submitting predictions

In [None]:
# Our validation accuracy seems pretty good, so let's score the test tweets
# for a submission on AIcrowd.
# The file is read inside a `with` block so the handle is closed, and only the
# line terminator is stripped from each tweet (slicing with [:-1] would also
# remove the last character of a final line without a newline).
with open('./test_data.txt') as test_file:
    test_tweets = [tweet.rstrip('\n') for tweet in test_file]

# Pad (or truncate) to the training sequence length `maxlen`, so the model
# sees test inputs laid out the same way as the sequences it was trained on,
# instead of being padded to the longest test tweet.
test_data = pad_sequences(tokenizer.texts_to_sequences(test_tweets), maxlen=maxlen)
prediction = model_biLSTM.predict(test_data)



In [None]:
# Convert the predictions into a CSV file ready for submission.
# Probabilities above 0.5 become label 1, all others become -1. The array is
# flattened with reshape(-1) and the Ids are numbered from its length, so the
# cell works for any number of test tweets instead of exactly 10000.
predictions = (2 * (prediction > 0.5).astype(int) - 1).reshape(-1).tolist()
pd.DataFrame(
    {'Id': range(1, len(predictions) + 1), 'Prediction': predictions}
).to_csv('submissions_dd.csv', index=False)