In [26]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Embedding,CuDNNLSTM,Bidirectional, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, MaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# to access data on Google Drive from colab
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Data Preparation

In [11]:
def prep_data(data):
  data = data[data['language']=='English']
  del data['language']
  del data['id']
  del data['lang_abv']
  data['prem_hyp'] = data[['premise', 'hypothesis']].apply(lambda x: ' [SEP] '.join(x), axis=1)

  data['label_0'] = np.where(data['label']==0, 1, 0)
  data['label_1'] = np.where(data['label']==1, 1, 0)
  data['label_2'] = np.where(data['label']==2, 1, 0)
  
  # sentences are related if label = 0 (entailment) or 2 (contradiction)
  data['related'] = np.where((data['label']==0) | (data['label']==2), 1, 0)
  data['related_0'] = np.where((data['related']==0), 1, 0)
  data['related_1'] = np.where((data['related']==1), 1, 0)

  
  data = data.sample(frac=1)
  train_size = int(0.7 * len(data))
  train_set = data[:train_size]
  test_set = data[train_size:]
  
  return train_set, test_set


def tokenize_x(x_train, x_test, size_vector):
  token = Tokenizer()
  premise_hypothesis = pd.concat([x_train['prem_hyp'], x_test['prem_hyp']], axis=0).values
  token.fit_on_texts(premise_hypothesis)

  premise_hypothesis_train = x_train['prem_hyp'].values
  sequences_train = token.texts_to_sequences(premise_hypothesis_train)
  padding_train = pad_sequences(sequences_train,maxlen=size_vector)

  premise_hypothesis_test = x_test['prem_hyp'].values
  sequences_test = token.texts_to_sequences(premise_hypothesis_test)
  padding_test = pad_sequences(sequences_test,maxlen=size_vector)

  vocabulary_size = len(token.word_index)+1

  return token, padding_train, padding_test, vocabulary_size



def get_embedding_matrix(token, path_glove, vocabulary_size, embedding_dim = 300):
  embedding_vector = {}
  
  # get dictionnary from GloVe
  f = open (os.path.join(path_glove % embedding_dim))
  for line in tqdm(f):
      value = line.split(' ')
      word = value[0]
      coefficient = np.array(value[1:],dtype = 'float32')
      embedding_vector[word] = coefficient
  
  embedding_matrix = np.zeros((vocabulary_size,300))
  for word,i in tqdm(token.word_index.items()):
      embedding_value = embedding_vector.get(word)
      if embedding_value is not None:
          embedding_matrix[i] = embedding_value
  return embedding_matrix



# LSTM model

In [None]:
def lstm(vocabulary_size, embedding_matrix, embedding_dim=300, size_vector=300, metric='accuracy'):
  model = Sequential()
  model.add(Embedding(vocabulary_size,embedding_dim,weights = [embedding_matrix],input_length=300,trainable = False))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dropout(0.3))
  model.add(Dense(2, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[metric])
  return model

# Experiment : 2 steps LSTM

In [None]:
path_glove = "/content/drive/MyDrive/GloVe/glove.6B.%sd.txt"
data = pd.read_csv("/content/drive/MyDrive/dataset_watson/train.csv")
embedding_dim = 300
size_vector = 300

In [12]:
train_set, test_set = prep_data(data)


token_related, padding_train_related, padding_test_related, vocabulary_size_related = tokenize_x(train_set, test_set, size_vector)

embedding_matrix_related = get_embedding_matrix(token_related, path_glove, vocabulary_size_related, embedding_dim)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

In [15]:
model_related = lstm(vocabulary_size_related, embedding_matrix_related)
possible_labels = ['related_0', 'related_1']
targets_related_train = train_set[possible_labels].values

history_related = model_related.fit(padding_train_related,targets_related_train,epochs = 5,batch_size= 64,validation_split=0.2)
model_related.summary()

# get predictions for related
predictions_related=model_related.predict(padding_test_related) 
predictions_related=np.argmax(predictions_related,axis=1)
test_set['prediction_related'] = predictions_related
accuracy_related = accuracy_score(test_set['related'], predictions_related)

# get index where predictions are 1 (related)
index_related_values = np.where(predictions_related==1)

# extract subdataset for labels = 1
subdataset_related_sentences_train = train_set[train_set['related']==1]
subdataset_related_sentences_test = test_set.iloc[index_related_values]


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 300, 300)          3870600   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 4,057,738
Trainable params: 187,138
Non-trainable params: 3,870,600
_________________________________________________________________


In [16]:
print(accuracy_related)

0.6787967006307618


In [19]:
# classification for entailment or contradictory

token_entailment, padding_train_entailment, padding_test_entailment, vocabulary_size_entailment = tokenize_x(subdataset_related_sentences_train, subdataset_related_sentences_test, size_vector)

embedding_matrix_entailment = get_embedding_matrix(token_entailment, path_glove, vocabulary_size_entailment, embedding_dim)



400000it [00:36, 11014.81it/s]
100%|██████████| 11883/11883 [00:00<00:00, 351896.88it/s]


In [22]:
model_entailment =  lstm(vocabulary_size_entailment,embedding_matrix_entailment)
possible_labels = ['label_0', 'label_2']
targets_classification_train = subdataset_related_sentences_train[possible_labels].values
history_entailment = model_entailment.fit(padding_train_entailment,targets_classification_train,epochs = 5,batch_size= 64,validation_split=0.2)
model_entailment.summary()

predictions_entailment = model_entailment.predict(padding_test_entailment) 
predictions_entailment = np.argmax(predictions_entailment,axis=1)
accuracy_entailment = accuracy_score(subdataset_related_sentences_test['label'], predictions_entailment)

# In original dataset, replace values by 0, 1 and 2
dictionary_index_predictions = dict(zip(index_related_values[0].tolist(), predictions_entailment))

# replace all 0 in original dataset (not related) by 1 (neutral)
test_set['predictions'] = np.where(test_set['prediction_related']==0, 1, 0)
for key in dictionary_index_predictions.keys():
  test_set['predictions'][key] = dictionary_index_predictions[key]

# get final accuracy
accuracy= accuracy_score(test_set['label'], test_set['predictions'])

print("Accuracy related",accuracy_related,"Accuracy entailment", 
      accuracy_entailment, "Final accuracy", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 300, 300)          3565200   
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 2)                 258       
                                                                 
Total params: 3,752,338
Trainable params: 187,138
Non-trainable params: 3,565,200
_________________________________________________________________


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Accuracy related 0.6787967006307618 Accuracy entailment 0.3525852585258526 Final accuracy 0.36147501213003397


# Classify related sentence only

In [None]:
path_glove = "/content/drive/MyDrive/GloVe/glove.6B.%sd.txt"
data = pd.read_csv("/content/drive/MyDrive/dataset_watson/train.csv")
embedding_dim = 300
size_vector = 300

In [23]:
def prep_data_related(data):
  data = data[data['language']=='English']
  del data['language']
  del data['id']
  del data['lang_abv']
  data['prem_hyp'] = data[['premise', 'hypothesis']].apply(lambda x: ' [SEP] '.join(x), axis=1)
  
  # sentences are related if label = 0 (entailment) or 2 (contradiction)
  data['related'] = np.where((data['label']==0) | (data['label']==2), 1, 0)
  data = data[data['related']==1]
  del data['related']

  data['class0'] = np.where((data['label']==0), 1, 0)
  data['class2'] = np.where((data['label']==2), 1, 0)

  data = data.sample(frac=1)
  train_size = int(0.7 * len(data))
  train_set = data[:train_size]
  test_set = data[train_size:]
  
  return train_set, test_set

path_glove = "/content/drive/MyDrive/GloVe/glove.6B.%sd.txt"
data = pd.read_csv("/content/drive/MyDrive/dataset_watson/train.csv")
embedding_dim = 300



In [24]:
train_set, test_set = prep_data_related(data)

token, padding_train, padding_test, vocabulary_size = tokenize_x(train_set, test_set, size_vector)

embedding_matrix= get_embedding_matrix(token, path_glove, vocabulary_size, embedding_dim)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
400000it [00:31, 12759.55it/s]
100%|██████████| 11675/11675 [00:00<00:00, 413264.18it/s]


In [25]:
model = lstm(vocabulary_size,embedding_matrix)
possible_labels = ['class0', 'class2']
targets_train = train_set[possible_labels].values
history = model.fit(padding_train,targets_train,epochs = 10,batch_size=256,validation_split=0.2)
model.summary()

predictions = model.predict(padding_test) 
predictions= np.argmax(predictions,axis=1)
accuracy = accuracy_score(test_set['label'], predictions)

print(accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 300, 300)          3502800   
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 2)                 258       
                                                                 
Total params: 3,689,938
Trainable params: 187,138
Non-trainable params: 3,502,800
___________________________________________________________

In [30]:
print(confusion_matrix(test_set['label'], predictions).ravel())

[572 164   0   0   0   0 398 278   0]
