# ECE NLP

## Overview

In this challenge you will be building a model that automatically determines logical entailment between two sentences.  
The model for this task we chose is a Bidirectionnial LSTM.

## Imports

In [132]:
import numpy as np
import pandas as pd
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Bidirectional, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from collections import Counter

## Data import

#### Import train and test csv files

In [3]:
train_df = pd.read_csv("Data/dataset_train.csv",sep='\t', index_col='index')
test_df = pd.read_csv("Data/dataset_test_no_labels.csv",sep='\t',index_col='index')
labels = train_df['label']

In [4]:
print(train_df.isna().sum())
print(test_df.isna().sum())
print(labels)

sentence_1    0
sentence_2    0
label         0
dtype: int64
sentence_1    0
sentence_2    0
dtype: int64
index
0               neutral
1            entailment
2            entailment
3            entailment
4               neutral
              ...      
392657    contradiction
392658          neutral
392659       entailment
392660          neutral
392661          neutral
Name: label, Length: 392662, dtype: object


In [5]:
train_df.values[0]

array(['Conceptually cream skimming has two basic dimensions - product and geography.',
       'Product and geography are what make cream skimming work. ',
       'neutral'], dtype=object)

In [6]:
print(train_df['sentence_1'].values)

['Conceptually cream skimming has two basic dimensions - product and geography.'
 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him'
 'One of our number will carry out your instructions minutely.' ...
 'Houseboats are a beautifully preserved tradition of the heyday of the British Raj.'
 'Obituaries fondly recalled his on-air debates and two thumbs up salutes with fellow reviewer Roger Ebert on their eponymous syndicated TV show.'
 'in that other you know uh that i should do it or that or just to think about doing it rat her than having someone  tell him to do it i know that was a big thing in our house for a long time was that if i wanted my husband to do something to help']


## Data Transformation

In [115]:
max_sequence_length = 100
max_vocabulary_size=1000

In [154]:
sentence_1 = train_df['sentence_1'].values
sentence_1_words = [line.split() for line in sentence_1]
assert( len(sentence_1_words) == len(sentence_1))

sentence_2 = train_df['sentence_2'].values
sentence_2_words = [line.split() for line in sentence_2]
assert( len(sentence_2_words) == len(sentence_2))

In [156]:
len(sentence_)

392662

In [117]:
flat_sentence_1_words = [item for sublist in sentence_1_words for item in sublist]
flat_sentence_2_words = [item for sublist in sentence_2_words for item in sublist]
flat_words = flat_sentence_1_words + flat_sentence_2_words
assert(len(flat_words) == len(flat_sentence_1_words)+len(flat_sentence_2_words))

In [None]:
words = sorted(set(flat_words))
word_index = dict((c, i) for i, c in enumerate(words))
word_index_inversed = dict((i, c) for i, c in enumerate(words))

In [None]:
def limit_vocabulary(corpus):
    counter = Counter(corpus)
    allowed_words = set([item[0] for item in counter.most_common(max_vocabulary_size)])
    return [word for word in corpus if word in allowed_words]
words = limit_vocabulary(words)

We convert the input sentences to integer sequences using tf tokenizer.  
We also need to pad the sequences to have them all at the same length

In [134]:
tokenizer = Tokenizer(num_words=max_vocabulary_size, char_level=False)
tokenizer.fit_on_texts(words)
sentence_1 =tokenizer.texts_to_sequences(train_df["sentence_1"])
sentence_2 = tokenizer.texts_to_sequences(train_df["sentence_2"])
sentence_1_seq = sequence.pad_sequences(sentence_1, maxlen=max_sequence_length, value=0,truncating="post",padding="post")
sentence_2_seq  = sequence.pad_sequences( sentence_2,maxlen=max_sequence_length, value=0,truncating="post",padding="post")

In [135]:
tmp = []
tmp.append(sentence_1_seq)
tmp.append(sentence_2_seq)
X = np.array(tmp)

In [186]:
def translate_labels(labels,inverse=False):
    convert_dict = {
      'entailment': 0,
      'neutral': 1,
      'contradiction': 2
    }
    convert_dict_inverse = {
      0: 'entailment',
      1: 'neutral',
      2: 'contradiction'
    }
    new_labels=[]
    if inverse:
        new_labels.append(convert_dict_inverse[labels])
    else:
        for label in labels:
            new_labels.append(convert_dict[label])
    
    return new_labels

In [137]:
num_labels = translate_labels(labels)

## Model Definition

In [157]:
vocabulary_size = len(words)
vector_size = 50
batch_size = 1024
embedding_size = 64
hidden_size = 64
epochs = 10


In [148]:
vocabulary_size*embedding_size

64000

This next cell is useful to reset the model and to not get any errors during training

In [149]:
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.keras.backend import set_session
tf.keras.backend.clear_session()  # For easy reset of notebook state.

config_proto = tf.ConfigProto()
off = rewriter_config_pb2.RewriterConfig.OFF
config_proto.graph_options.rewrite_options.arithmetic_optimization = off
session = tf.Session(config=config_proto)
set_session(session)

In [150]:
model = Sequential()

sentence_1 = Sequential()
sentence_1.add(Embedding(vocabulary_size,embedding_size))

               
sentence_2 = Sequential()
sentence_2.add(Embedding(vocabulary_size,embedding_size))




model_concat = concatenate([sentence_1.output, sentence_2.output])
model_concat = Bidirectional(tf.keras.layers.LSTM(hidden_size))(model_concat)
model_concat = Dense(hidden_size, activation='relu')(model_concat)
model_concat = Dense(3, activation='softmax')(model_concat)
model = Model(inputs=[sentence_1.input, sentence_2.input], outputs=model_concat)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [151]:
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_input (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1_input (InputLayer)  [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 64)     64000       embedding_input[0][0]            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 64)     64000       embedding_1_input[0][0]          
______________________________________________________________________________________________

## Model Training/Evaluation

In [158]:
model.fit(
           [X[0],X[1]],num_labels, validation_split=0.2, batch_size=batch_size, epochs=epochs,verbose=True
        )

Train on 314129 samples, validate on 78533 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29bdfb3a1d0>

## Model Inference

In [159]:
test_sentence_1 = test_df["sentence_1"].values
test_sentence_2 = test_df["sentence_2"].values

test_sentence_1 =tokenizer.texts_to_sequences(test_df["sentence_1"])
test_sentence_2 = tokenizer.texts_to_sequences(test_df["sentence_2"])
test_sentence_1_seq = sequence.pad_sequences(test_sentence_1, maxlen=max_sequence_length, value=0,truncating="post",padding="post")
test_sentence_2_seq  = sequence.pad_sequences(test_sentence_2,maxlen=max_sequence_length, value=0,truncating="post",padding="post")



In [160]:
tmp = []
tmp.append(test_sentence_1_seq)
tmp.append(test_sentence_2_seq)

X_test = np.array(tmp)

In [163]:
predict = model.predict([X_test[0],X_test[1]],verbose=True)



In [194]:
pred_labels=pd.DataFrame(columns=["index","label"])

for index,pred in enumerate(predict):
    real_label = translate_labels(np.round(np.argmax(pred)),inverse=True)
    real_label = str(real_label).strip('[]').replace("'","")
    pred_labels = pred_labels.append({'index':index, 'label':real_label},ignore_index=True)

In [195]:
pred_labels

Unnamed: 0,index,label
0,0,entailment
1,1,entailment
2,2,neutral
3,3,contradiction
4,4,contradiction
...,...,...
19642,19642,contradiction
19643,19643,neutral
19644,19644,neutral
19645,19645,neutral


## Export results

In [196]:
pred_labels.to_csv('submission.csv', index = False)

In [197]:
TableBIS = pd.read_csv("submission.csv", sep="\t")
print(TableBIS)

               index,label
0             0,entailment
1             1,entailment
2                2,neutral
3          3,contradiction
4          4,contradiction
...                    ...
19642  19642,contradiction
19643        19643,neutral
19644        19644,neutral
19645        19645,neutral
19646     19646,entailment

[19647 rows x 1 columns]
