# Neural net with pretrained word embeddings

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D,MaxPooling1D,GRU
from keras.layers import Conv1D
from keras.layers import LSTM
from keras.layers import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

from numpy import asarray
from numpy import zeros

import matplotlib.pyplot as plt

#from utils import create_csv_submission




## Loading the dataset

In [3]:
# Run this cell to work with the full dataset.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted "tweet.pkl".
df_full = (
    pd.read_pickle("tweet.pkl")
    # shuffle every row (fixed seed) to mix the labels, then renumber the index
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

## Convert tweets to vectors

In [4]:
def tok_and_pad(df,maxlen, tokenizer):
    """
    Turn raw tweets into fixed-length integer sequences.

    The texts in `df` are converted to integer sequences by `tokenizer`
    (one integer per known word), then handed to `pad_sequences` with
    padding='post' and the given `maxlen`, so every row of the result has
    `maxlen` entries.
    """
    sequences = tokenizer.texts_to_sequences(df)
    return pad_sequences(sequences, maxlen=maxlen, padding='post')
    
    



def create_embedding_matrix(path_file, vocab_size, tokenizer, embedding_dim=200):
    """
    Creates the embedding matrix from the file that contains the pre-computed embedding vectors

    Each line of `path_file` is read as a word followed by whitespace-separated
    floats. Row `i` of the returned matrix holds the vector of the word whose
    index in `tokenizer.word_index` is `i`; rows for words without a usable
    vector stay all-zero.

    Parameters
    ----------
    path_file : path of the UTF-8 text file holding the embedding vectors.
    vocab_size : number of rows of the returned matrix.
    tokenizer : object exposing a `word_index` mapping (word -> integer index).
    embedding_dim : number of columns of the returned matrix (default 200).

    Returns
    -------
    numpy array of shape (vocab_size, embedding_dim).

    Notes
    -----
    Blank lines and lines whose vector does not have exactly `embedding_dim`
    entries (for instance a header line) are skipped. Words whose index is
    >= `vocab_size` are skipped as well.
    """
    embeddings_dictionary = dict()
    # `with` guarantees the file is closed even if a line fails to parse
    with open(path_file, encoding="utf8") as embedding_file:
        for line in embedding_file:
            records = line.split()
            if not records:
                continue  # blank line: nothing to read
            if len(records) - 1 != embedding_dim:
                # A vector of the wrong width would either raise on assignment or,
                # for a single value, be silently broadcast over the whole row.
                continue
            embeddings_dictionary[records[0]] = asarray(records[1:], dtype='float32')

    embedding_matrix = zeros((vocab_size, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index >= vocab_size:
            # the caller asked for fewer rows than the tokenizer has words
            continue
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix


In [5]:

# Hold out 5% of the tweets as a test set (fixed seed).
# NOTE(review): the models below use a sigmoid output with binary_crossentropy,
# which presumably expects labels in {0, 1} — confirm the encoding of df_full['label'].
X_train, X_test, y_train, y_test = train_test_split(
    df_full['tweet'],
    df_full['label'],
    test_size=0.05,
    random_state=42,
)

#create word dictionary (fitted on the training tweets only)
#it will keep only the top num_words words
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(X_train)

# Adding 1 because of reserved 0 index
vocab_size = 1 + len(tokenizer.word_index)
print("vocab_size:", vocab_size)

# every tweet is converted to exactly maxlen integers
maxlen = 100
X_train = tok_and_pad(X_train, maxlen, tokenizer)
X_test = tok_and_pad(X_test, maxlen, tokenizer)

vocab_size: 404041


In [6]:
# One row per tokenizer word index, filled from the pre-computed vectors file
embedding_matrix = create_embedding_matrix(
    path_file='w2v_full_w20_min4.txt',
    vocab_size=vocab_size,
    tokenizer=tokenizer,
)

## First model : simple neural network

In [9]:
#first model : simple neural network
model = Sequential()
# Embedding initialised from embedding_matrix; trainable set to False bc we use the pre-computed vectors as-is
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

# Flatten the per-word embeddings and feed them to a single sigmoid unit
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print() added a stray "None" line
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 200)          80808200  
                                                                 
 flatten_1 (Flatten)         (None, 20000)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 20001     
                                                                 
Total params: 80828201 (308.34 MB)
Trainable params: 20001 (78.13 KB)
Non-trainable params: 80808200 (308.26 MB)
_________________________________________________________________
None


In [10]:
# Train for 6 epochs, keeping 10% of the training data aside for validation
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=6,
    validation_split=0.1,
)

# score[0] is the loss, score[1] the 'acc' metric requested in compile()
score = model.evaluate(X_test, y_test)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test Score: 0.5221220254898071
Test Accuracy: 0.7532438039779663


## 2nd model : convolutional neural network

In [None]:
#model 2 : convolutional neural network
model_2 = Sequential()

# Embedding initialised from embedding_matrix and frozen (trainable=False)
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_2.add(embedding_layer)

# 32 convolution filters of width 5 over the word axis, max-pooled over the whole sequence
model_2.add(Conv1D(32, 5, activation='relu'))
model_2.add(GlobalMaxPooling1D())
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; print() around it would add a stray "None" line
model_2.summary()

In [None]:
# Train for 6 epochs, keeping 20% of the training data aside for validation
history_2 = model_2.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

# score_2[0] is the loss, score_2[1] the 'acc' metric requested in compile()
score_2 = model_2.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score_2[0])
print("Test Accuracy:", score_2[1])

## 3rd model : recurrent neural network

In [17]:
#3rd model : recurrent neural net
model_3 = Sequential()
# Embedding initialised from embedding_matrix and frozen (trainable=False)
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_3.add(embedding_layer)
# single LSTM layer with 128 units, followed by one sigmoid output unit
model_3.add(LSTM(128))

model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print() added a stray "None" line
model_3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 200)          104704000 
                                                                 
 lstm (LSTM)                 (None, 128)               168448    
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 104872577 (400.06 MB)
Trainable params: 168577 (658.50 KB)
Non-trainable params: 104704000 (399.41 MB)
_________________________________________________________________
None


In [18]:
# Train for 6 epochs, keeping 20% of the training data aside for validation
history_3 = model_3.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

# score_3[0] is the loss, score_3[1] the 'acc' metric requested in compile()
score_3 = model_3.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_3[0])
print("Test Accuracy:", score_3[1])

Epoch 1/6
  436/13434 [..............................] - ETA: 24:10 - loss: -0.1786 - acc: 4.1213e-04

KeyboardInterrupt: 

## Fourth model : GRU

In [22]:
#4th model : convolution + GRU
model_4 = Sequential()
# Embedding initialised from embedding_matrix; trainable set to False bc we use the pre-computed vectors as-is
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model_4.add(embedding_layer)
# convolution (padding='same') followed by pooling with pool_size 2
model_4.add(Conv1D(64, kernel_size = 3, padding='same', activation='relu'))
model_4.add(MaxPooling1D(pool_size = 2))
model_4.add(Dropout(0.25))
# return_sequences=True keeps one GRU output per time step; they are flattened below
model_4.add(GRU(128, return_sequences=True))
model_4.add(Dropout(0.3))
model_4.add(Flatten())
model_4.add(Dense(128, activation='relu'))
model_4.add(Dropout(0.5))
model_4.add(Dense(1,activation='sigmoid'))
model_4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# summary() prints the table itself and returns None; print() around it would add a stray "None" line
model_4.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 200)          16477600  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           38464     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 64)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 50, 128)           74112     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 6400)             

In [23]:
# Train for 6 epochs, keeping 20% of the training data aside for validation
history_4 = model_4.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=6,
    verbose=1,
    validation_split=0.2,
)

# score_4[0] is the loss, score_4[1] the 'acc' metric requested in compile()
score_4 = model_4.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_4[0])
print("Test Accuracy:", score_4[1])

Train on 137803 samples, validate on 34451 samples
Epoch 1/6
 13056/137803 [=>............................] - ETA: 3:54 - loss: 0.5556 - acc: 0.6951

KeyboardInterrupt: 

## Computing predictions

This cell computes predictions on the test set (preprocessed with lemmatization in the preprocessing notebook) using the fourth model (`model_4`) defined above. The model cell and its training cell must be run before this one.

In [21]:
# Load the cleaned test tweets and shift every index value by 1
to_predict = pd.read_csv("./data/test_cleaned.csv")
to_predict.index += 1

# keep only the text column, cast to str before tokenizing
to_predict = to_predict['tweet']
to_predict = to_predict.astype(str)

# tok_and_pad takes the fitted tokenizer as its third argument; omitting it raises a TypeError
to_predict= tok_and_pad(to_predict,maxlen,tokenizer)

result_test = model_4.predict(to_predict)

#it returns values between [0,1] (since sigmoid is used)
result_test[result_test < 0.5] = -1 #replace values < 0.5 to -1
# the -1 entries written above are not >= 0.5, so only the remaining scores become 1
result_test[result_test >= 0.5] = 1


NameError: name 'model_4' is not defined

In [None]:
#create_csv_submission(result_test,"xxx.csv")