In [None]:
%matplotlib inline

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
import zipfile
import pandas as pd
from keras_preprocessing.sequence import pad_sequences

In [None]:
""" Use the drive """
# Mount Google Drive at /content/drive; the dataset and checkpoint paths
# used further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

## Useful functions 

In [None]:
def unzip_single_file(zip_file_name, output_file_name):
    """
        Extract a single member of a zip archive to ``output_file_name``.

        If the output file already exists, nothing is done.
        Otherwise the first archive member whose name contains
        ``output_file_name`` is written to that path.

        Args:
            zip_file_name: path of the zip archive to read.
            output_file_name: path of the file to create; it is also the
                substring used to select the archive member.

        Raises:
            FileNotFoundError: if the archive does not exist or contains no
                member matching ``output_file_name``.
    """
    if os.path.isfile(output_file_name):
        return

    with zipfile.ZipFile(zip_file_name) as zipped:
        # Find the member BEFORE touching the output path: creating the file
        # first would leave an empty file behind when the archive or the
        # member is missing, and the isfile() guard above would then treat
        # that empty file as a finished extraction on every later call.
        member = next((info for info in zipped.infolist()
                       if output_file_name in info.filename), None)
        if member is None:
            raise FileNotFoundError(
                "No member matching %r in archive %r"
                % (output_file_name, zip_file_name))
        with zipped.open(member) as requested_file:
            payload = requested_file.read()

    with open(output_file_name, 'wb') as out_file:
        out_file.write(payload)

In [None]:
def create_glove_wordmap(glove_zip_file = "glove.6B.zip",
                         glove_vectors_file = "glove.6B.50d.txt"
                         ):
    """
        Build a {word: vector} dictionary from a GloVe text file.

        The archive is downloaded only when neither the zip nor the
        extracted vectors file is present, then the vectors file is
        extracted (if needed) and parsed line by line.

        Args:
            glove_zip_file: local path of the GloVe zip archive.
            glove_vectors_file: name of the vectors file to extract and read.

        Returns:
            dict mapping each word (first field of a line) to a numpy array
            parsed from the remaining space-separated fields.
    """
    from urllib.request import urlretrieve
    # large file - 862 MB
    if (not os.path.isfile(glove_zip_file) and
        not os.path.isfile(glove_vectors_file)):
        urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip",
                    glove_zip_file)

    unzip_single_file(glove_zip_file, glove_vectors_file)

    glove_wordmap = {}

    # Explicit UTF-8: the vocabulary contains non-ASCII tokens, and the
    # default encoding of open() depends on the platform locale.
    with open(glove_vectors_file, "r", encoding="utf-8") as glove:
        for line in glove:
            name, vector = line.split(" ", 1)
            glove_wordmap[name] = np.fromstring(vector, sep=" ")

    return glove_wordmap

In [None]:
def sentence2sequence(sentence, wordmap, visualize=False):
    """
    Map a sentence to the list of wordmap entries of its tokens.

    The sentence is lower-cased and split on single spaces. Each token is
    consumed greedily: the longest prefix present in ``wordmap`` is emitted,
    then the search restarts on what is left of the token. As soon as no
    prefix of the remainder is known, the remainder is dropped.

    Returns the list of matched entries, or ``(entries, matched_words)``
    when ``visualize`` is true.
    """
    vectors = []
    matched = []
    for remaining in sentence.lower().split(" "):
        end = len(remaining)
        # Shrink the candidate prefix from the right until it is a known word.
        while remaining and end > 0:
            candidate = remaining[:end]
            if candidate not in wordmap:
                end -= 1
                continue
            vectors.append(wordmap[candidate])
            matched.append(candidate)
            # Restart the greedy search on the unconsumed tail.
            remaining = remaining[end:]
            end = len(remaining)

    return (vectors, matched) if visualize else vectors

In [None]:
def visualize(sentence, wordmap):
    """
        Visualize GloVe Embeddings in a sentence

        Draws one row per matched word (see sentence2sequence) as a colour
        matrix, with the matched words as y-axis labels.

        Raises:
            ValueError: if no token of the sentence is found in the wordmap.
    """
    rows, words = sentence2sequence(sentence, wordmap, visualize=True)
    if not rows:
        raise ValueError("No token of the sentence was found in the wordmap")
    mat = np.vstack(rows)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    shown = ax.matshow(mat, aspect="auto")
    # Pin one tick per row, then label it. Labelling ticks produced by a
    # locator (and padding the list with "") only lines up when the locator
    # happens to emit exactly one extra tick before row 0.
    ax.set_yticks(range(len(words)))
    ax.set_yticklabels(words)
    fig.colorbar(shown)

    plt.show()

In [None]:
def gen_csv(predicted, label_map, verbosity=False):
    """
        Generate the results CSV from the output of a model's predict()
        and trigger its download from Colab.

        Args:
            predicted: 2-D array of per-class scores, one row per sample;
                the predicted class is the argmax of each row.
            label_map: mapping dictionary {int: 'value'} from class id to
                label name.
            verbosity: when True, print the mapping and the rows written.

        Writes 'results.csv' with columns 'index' and 'label'. If the file
        cannot be written, "I/O error" is printed and no download is started.
    """
    import csv
    from google.colab import files

    predicted_results = np.argmax(predicted, axis=1)
    if verbosity: print(label_map)
    dict_data = [{'index': i, 'label': label_map[v]}
                 for i, v in enumerate(predicted_results)]
    if verbosity: print(dict_data)

    csv_file = 'results.csv'
    try:
        # newline='' is what the csv module expects; without it extra blank
        # rows appear on platforms that translate line endings.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['index', 'label'])
            writer.writeheader()
            writer.writerows(dict_data)
    except IOError:
        print("I/O error")
        # Do not offer a stale or missing file for download.
        return
    files.download(csv_file)

In [None]:
def data_from_csv(path, dev_mode=True, n_dev = 5000, random_state=None):
    """
        Load a tab-separated file into a pandas.DataFrame.

        Args:
            path: location of the tab-separated file.
            dev_mode: when True, return only a random sample of the rows
                (fewer items to speed up computations).
            n_dev: number of rows to sample in dev mode; capped at the
                number of rows available.
            random_state: optional seed forwarded to DataFrame.sample so the
                dev subset can be reproduced. None keeps it random.

        Returns:
            The full DataFrame, or the sampled subset in dev mode.
    """
    # Entire dataset for training/validation
    dataset = pd.read_csv(path, sep="\t")

    if dev_mode:
        # sample() raises when asked for more rows than exist, so cap n.
        dataset = dataset.sample(n=min(n_dev, len(dataset)),
                                 random_state=random_state)

    return dataset

In [None]:
def process_dataset(dataset, training=True, map_dict={'neutral':2, 'entailment':1, 'contradiction':0},
                    wordmap=None):
    """
        Return a preprocessed copy of the dataset sent in args.
        If the dataset doesn't have a label column set training to False.

        For 'sentence_1' and 'sentence_2', punctuation is removed, the text
        is lower-cased, and each sentence is replaced by the numpy array of
        its word vectors (see sentence2sequence).

        Args:
            dataset: DataFrame with string columns 'sentence_1' and
                'sentence_2' (and 'label' when training is True).
            training: when True, add a 'target' column holding the labels
                translated through map_dict.
            map_dict: label name -> integer class id (read only).
            wordmap: word -> vector mapping; defaults to the notebook-level
                ``glove_wordmap``.

        Returns:
            A new DataFrame; the input is not modified.
    """
    import string # to get rid of the punctuation

    if wordmap is None:
        wordmap = glove_wordmap

    strip_punctuation = str.maketrans('', '', string.punctuation)
    n_dataset = dataset.copy()

    for column in ('sentence_1', 'sentence_2'):
        cleaned = [x.translate(strip_punctuation).lower()
                   for x in dataset[column].values]
        # Embed the CLEANED text. Embedding the raw column here would
        # discard the punctuation removal done just above.
        n_dataset[column] = [np.array(sentence2sequence(x, wordmap))
                             for x in cleaned]

    if training:
        n_dataset['target'] = n_dataset['label'].replace(map_dict)

    return n_dataset

## Model implementation

In [None]:
# Import `Sequential` from `keras.models`
from keras.models import Sequential
# Import `Dense` from `keras.layers`
import keras
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Flatten, \
                          Bidirectional, Concatenate, GlobalAveragePooling1D

from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import StandardScaler


### Model 1

#### Preprocess Dataset

In [None]:
# Retrieve the GloVe word -> vector dictionary using the default file names
# (glove.6B.zip / glove.6B.50d.txt). process_dataset reads this global.
glove_wordmap = create_glove_wordmap()

In [None]:
# Retrieve Dataset from CSV
# Both files live on the mounted Drive and are read as tab-separated files.
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

# dev_mode=False loads every row; n_dev is only used when dev_mode is True.
dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [None]:
# Label name -> integer class id; the export cell inverts this mapping to
# turn predicted class ids back into label names.
map_dict={'neutral':2, 'entailment':1, 'contradiction':0}
# Replace each sentence by its array of word vectors and add the 'target' column.
dataset_processed = process_dataset(dataset, map_dict=map_dict)
print("Processed ! ")

In [None]:
# Preview the first rows of the processed frame.
dataset_processed.head()

In [None]:
# Pad here; otherwise RAM blows up when every row of the CSV is used.
# dtype must be a float type: pad_sequences defaults to dtype='int32', which
# casts the word vectors to integers and destroys the embedding values.
pad1 = pad_sequences(dataset_processed['sentence_1'].values, maxlen=80, dtype='float32')
pad2 = pad_sequences(dataset_processed['sentence_2'].values, maxlen=80, dtype='float32')

X = [pad1, pad2]
y = dataset_processed['target'].values

# (Nb inputs, nb words, nb dim)
assert X[0][0].shape == (80, 50), "Check the shape of your data"

#### Model fitting

In [None]:
# One input per sentence: 80 time steps x 50 features, the shape asserted
# on the padded data before this cell.
inputs1 = Input(shape=(80,50))
inputs2 = Input(shape=(80,50))

# Every layer object below is applied to both branches, so the two
# sentences are encoded with shared weights.
lstm_layer_1 = Bidirectional(LSTM(256, return_sequences=True))
x1 = lstm_layer_1(inputs1)
x2 = lstm_layer_1(inputs2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Second recurrent layer, created without return_sequences.
lstm_layer_2 = LSTM(128)
x1 = lstm_layer_2(x1)
x2 = lstm_layer_2(x2)

# A new Dropout instance rebinds the name; it is reused after the dense layer.
dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Join both sentence encodings and classify into the 3 labels of map_dict.
x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_1 = Model(inputs=[inputs1, inputs2], outputs=predictions)
# The fit cell feeds to_categorical(y) as targets for this categorical loss;
# the 'acc' metric is what the checkpoint monitors as 'val_acc'.
model_1.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_1.summary()

In [None]:
# Checkpoint file for model_1: written by ModelCheckpoint during fit and read back by load_weights.
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_1.hdf5"

In [None]:
# Save to `filepath` only when validation accuracy improves (save_best_only, mode='max').
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Integer targets are one-hot encoded; 25% of the data is held out for validation.
model_1.fit(X, to_categorical(y), epochs=1, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

In [None]:
# Restore the checkpointed weights (the best epoch by val_acc) before predicting.
model_1.load_weights(filepath, by_name=False)

#### Predict results and generate CSV

In [None]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

# training=False: the test file has no 'label' column.
data_test_processed = process_dataset(data_test, training=False)
print(data_test_processed.head())
# dtype='float32' keeps the word vectors intact; the pad_sequences default
# (int32) would truncate them to integers.
X = [pad_sequences(data_test_processed['sentence_1'].values, maxlen=80, dtype='float32'),
     pad_sequences(data_test_processed['sentence_2'].values, maxlen=80, dtype='float32')]

assert X[0][0].shape == (80, 50), "Check shape of your inputs"

In [None]:
# Per-class scores for every test pair; the bare name displays the array.
predicted = model_1.predict(X)
predicted

In [None]:
# Build the class id -> label name mapping under its own name. Overwriting
# map_dict with its inverse made this cell non-idempotent: a second run
# flipped the mapping back and gen_csv failed with a KeyError.
inverse_map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, inverse_map_dict)

### Model 2

#### Pre-process dataset

In [None]:
# Retrieve the GloVe word -> vector dictionary using the default file names
# (glove.6B.zip / glove.6B.50d.txt). process_dataset reads this global.
glove_wordmap = create_glove_wordmap()

In [None]:
# Retrieve Dataset from CSV
# Both files live on the mounted Drive and are read as tab-separated files.
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

# dev_mode=True: only a random sample of n_dev rows is kept.
# NOTE(review): the other model sections load the full file (dev_mode=False) - confirm this is intended.
dataset = data_from_csv(path_train, n_dev=5000, dev_mode=True)

In [None]:
# Label name -> integer class id; the export cell inverts this mapping to
# turn predicted class ids back into label names.
map_dict={'neutral':2, 'entailment':1, 'contradiction':0}
# Replace each sentence by its array of word vectors and add the 'target' column.
dataset_processed = process_dataset(dataset, map_dict=map_dict)
print("Processed ! ")

In [None]:
# Preview the first rows of the processed frame.
dataset_processed.head()

In [None]:
# Pad here; otherwise RAM blows up when every row of the CSV is used.
# dtype must be a float type: pad_sequences defaults to dtype='int32', which
# casts the word vectors to integers and destroys the embedding values.
X = [pad_sequences(dataset_processed['sentence_1'].values, maxlen=80, dtype='float32'),
     pad_sequences(dataset_processed['sentence_2'].values, maxlen=80, dtype='float32')]
y = dataset_processed['target'].values

# (Nb inputs, nb words, nb dim)
assert X[0][0].shape == (80, 50), "Check the shape of your data"

#### Model fitting

In [None]:
# One input per sentence: 80 time steps x 50 features, the shape asserted
# on the padded data before this cell.
inputs1 = Input(shape=(80,50))
inputs2 = Input(shape=(80,50))

# Single bidirectional LSTM (no return_sequences), shared by both branches.
lstm_layer_1 = Bidirectional(LSTM(256))
x1 = lstm_layer_1(inputs1)
x2 = lstm_layer_1(inputs2)

# The same Dropout instance is applied here and again after the dense layer.
dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Join both sentence encodings and classify into the 3 labels of map_dict.
x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_2 = Model(inputs=[inputs1, inputs2], outputs=predictions)
# The fit cell feeds to_categorical(y) as targets for this categorical loss;
# the 'acc' metric is what the checkpoint monitors as 'val_acc'.
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_2.summary()

In [None]:
# Checkpoint file for model_2.
# NOTE(review): unlike the other model sections, no cell reloads this checkpoint before predicting - confirm this is intended.
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_2.hdf5"
# Save to `filepath` only when validation accuracy improves (save_best_only, mode='max').
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Integer targets are one-hot encoded; 25% of the data is held out for validation.
model_2.fit(X, to_categorical(y), epochs=1, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

#### Predict results and generate CSV

In [None]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

# training=False: the test file has no 'label' column.
data_test_processed = process_dataset(data_test, training=False)

# dtype='float32' keeps the word vectors intact; the pad_sequences default
# (int32) would truncate them to integers.
X = [pad_sequences(data_test_processed['sentence_1'].values, maxlen=80, dtype='float32'),
     pad_sequences(data_test_processed['sentence_2'].values, maxlen=80, dtype='float32')]

assert X[0][0].shape == (80, 50), "Check shape of your inputs"

In [None]:
# Per-class scores for every test pair; the bare name displays the array.
predicted = model_2.predict(X)
predicted

In [None]:
# Build the class id -> label name mapping under its own name. Overwriting
# map_dict with its inverse made this cell non-idempotent: a second run
# flipped the mapping back and gen_csv failed with a KeyError.
inverse_map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, inverse_map_dict)

### Model 3

#### Data Preprocessing

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D

def preprocess_model_3(dataset, tokenizer,
                       vocab_size = 50000, max_length = 80, 
                       training=True, 
                       map_dict={'neutral':2, 'entailment':1, 'contradiction':0}):
    """
        Preprocess data for model_3

        Converts both sentence columns to integer sequences with
        ``tokenizer`` and pads them to ``max_length``.

        Args:
            dataset: DataFrame with 'sentence_1' and 'sentence_2' columns
                (and a 'label' column when training is True).
            tokenizer: object providing fit_on_texts and texts_to_sequences.
            vocab_size: not used by this function; kept for compatibility.
            max_length: length passed to pad_sequences as maxlen.
            training: when True, fit the tokenizer on this dataset and also
                return the targets.
            map_dict: label name -> integer class id (read only).

        Returns:
            (X, y) when training is True, otherwise X, where X is
            [padded sentence_1 sequences, padded sentence_2 sequences].
    """
    sentences_1 = dataset.sentence_1.values
    sentences_2 = dataset.sentence_2.values

    if training:
        # Fit on both columns as separate texts. Adding the two arrays
        # concatenates the strings element-wise with no separator, which
        # glues the last word of sentence 1 to the first word of sentence 2.
        tokenizer.fit_on_texts(list(sentences_1) + list(sentences_2))

    sentence1_seq_padded = pad_sequences(tokenizer.texts_to_sequences(sentences_1),
                                         maxlen=max_length)
    sentence2_seq_padded = pad_sequences(tokenizer.texts_to_sequences(sentences_2),
                                         maxlen=max_length)

    X = [sentence1_seq_padded, sentence2_seq_padded]

    if training:
        y = dataset['label'].replace(map_dict)
        return X, y
    return X

In [None]:
# Retrieve Dataset from CSV
# Both files live on the mounted Drive and are read as tab-separated files.
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

# dev_mode=False loads every row; n_dev is only used when dev_mode is True.
dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [None]:
# vocab_size is used both as the Tokenizer's num_words and as the
# Embedding layer's input size in the model cell.
vocab_size = 50000
# Label name -> integer class id; inverted in the export cell.
map_dict = {'neutral':2, 'entailment':1, 'contradiction':0}
tokenizer = Tokenizer(num_words=vocab_size)
# Padded sequence length; also the model's input length.
max_length = 80

# training defaults to True: the tokenizer is fitted here and y is returned.
X, y = preprocess_model_3(dataset, tokenizer, max_length=max_length)

In [None]:
# Each sample of the first input must be a vector of max_length token ids.
assert X[0][0].shape == (max_length,), "Check your input"

#### Model fitting

In [None]:
# One input per sentence: a sequence of max_length token ids.
inputs1 = Input(shape=(max_length,))
inputs2 = Input(shape=(max_length,))

# Embedding learned from scratch (128 dimensions); like every layer below
# it is applied to both branches, so the weights are shared.
embedding_layer = Embedding(vocab_size, 128, input_length=max_length)
emb_out1 = embedding_layer(inputs1)
emb_out2 = embedding_layer(inputs2)

lstm_layer = LSTM(128)
x1 = lstm_layer(emb_out1)
x2 = lstm_layer(emb_out2)

dropout_layer = Dropout(0.3)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

# A second Dropout instance with a lower rate rebinds the name.
dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Join both sentence encodings and classify into the 3 labels of map_dict.
x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_3 = Model(inputs=[inputs1, inputs2], outputs=predictions)
# The fit cell feeds to_categorical(y) as targets for this categorical loss;
# the 'acc' metric is what the checkpoint monitors as 'val_acc'.
model_3.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_3.summary()

In [None]:
# Checkpoint file for model_3: written by ModelCheckpoint during fit and read back by load_weights.
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_3.hdf5"

In [None]:
# Save to `filepath` only when validation accuracy improves (save_best_only, mode='max').
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Integer targets are one-hot encoded; 25% of the data is held out for validation.
model_3.fit(X, to_categorical(y), epochs=2, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

In [None]:
# Restore the checkpointed weights (the best epoch by val_acc) before predicting.
model_3.load_weights(filepath, by_name=False)

#### Predict results and generate CSV

In [None]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

# training=False reuses the tokenizer fitted on the training set.
# max_length is forwarded explicitly so the test data is padded to the same
# length as the training data even if max_length is changed from 80.
X = preprocess_model_3(data_test, tokenizer, max_length=max_length, training=False)

assert X[0][0].shape == (max_length,) and X[1][0].shape == (max_length,), "Check X dimensions"

In [None]:
# Per-class scores for every test pair; the bare name displays the array.
predicted = model_3.predict(X)
predicted

In [None]:
# Build the class id -> label name mapping under its own name. Overwriting
# map_dict with its inverse made this cell non-idempotent: a second run
# flipped the mapping back and gen_csv failed with a KeyError.
inverse_map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, inverse_map_dict)

### Model 4

#### Data Preprocessing

In [None]:
def preprocess_model_4(dataset, tokenizer,
                       vocab_size = 50000, max_length = 80, 
                       training=True, 
                       map_dict={'neutral':2, 'entailment':1, 'contradiction':0}):
    """
        Preprocess data for model_4

        Converts both sentence columns to integer sequences with
        ``tokenizer`` and pads them to ``max_length``.

        Args:
            dataset: DataFrame with 'sentence_1' and 'sentence_2' columns
                (and a 'label' column when training is True).
            tokenizer: object providing fit_on_texts and texts_to_sequences.
            vocab_size: not used by this function; kept for compatibility.
            max_length: length passed to pad_sequences as maxlen.
            training: when True, fit the tokenizer on this dataset and also
                return the targets.
            map_dict: label name -> integer class id (read only).

        Returns:
            (X, y) when training is True, otherwise X, where X is
            [padded sentence_1 sequences, padded sentence_2 sequences].
    """
    sentences_1 = dataset.sentence_1.values
    sentences_2 = dataset.sentence_2.values

    if training:
        # Fit on both columns as separate texts. Adding the two arrays
        # concatenates the strings element-wise with no separator, which
        # glues the last word of sentence 1 to the first word of sentence 2.
        tokenizer.fit_on_texts(list(sentences_1) + list(sentences_2))

    sentence1_seq_padded = pad_sequences(tokenizer.texts_to_sequences(sentences_1),
                                         maxlen=max_length)
    sentence2_seq_padded = pad_sequences(tokenizer.texts_to_sequences(sentences_2),
                                         maxlen=max_length)

    X = [sentence1_seq_padded, sentence2_seq_padded]

    if training:
        y = dataset['label'].replace(map_dict)
        return X, y
    return X

In [None]:
# Retrieve Dataset from CSV
# Both files live on the mounted Drive and are read as tab-separated files.
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

# dev_mode=False loads every row; n_dev is only used when dev_mode is True.
dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [None]:
# vocab_size is used both as the Tokenizer's num_words and as the
# Embedding layer's input size in the model cell.
vocab_size = 50000
# Label name -> integer class id; inverted in the export cell.
map_dict = {'neutral':2, 'entailment':1, 'contradiction':0}
tokenizer = Tokenizer(num_words=vocab_size)
# Padded sequence length; also the model's input length.
max_length = 80

# training defaults to True: the tokenizer is fitted here and y is returned.
X, y = preprocess_model_4(dataset, tokenizer, max_length=max_length)

In [None]:
# Each sample of the first input must be a vector of max_length token ids.
assert X[0][0].shape == (max_length,), "Check your input"

#### Model fitting

In [None]:
# One input per sentence: a sequence of max_length token ids.
inputs1 = Input(shape=(max_length,))
inputs2 = Input(shape=(max_length,))

# Embedding learned from scratch (128 dimensions); like every layer below
# it is applied to both branches, so the weights are shared.
embedding_layer = Embedding(vocab_size, 128, input_length=max_length)
emb_out1 = embedding_layer(inputs1)
emb_out2 = embedding_layer(inputs2)

# Same layout as model_3, with the LSTM wrapped in Bidirectional.
lstm_layer = Bidirectional(LSTM(128))
x1 = lstm_layer(emb_out1)
x2 = lstm_layer(emb_out2)

dropout_layer = Dropout(0.3)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

# A second Dropout instance with a lower rate rebinds the name.
dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

# Join both sentence encodings and classify into the 3 labels of map_dict.
x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_4 = Model(inputs=[inputs1, inputs2], outputs=predictions)
# The fit cell feeds to_categorical(y) as targets for this categorical loss;
# the 'acc' metric is what the checkpoint monitors as 'val_acc'.
model_4.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_4.summary()

In [None]:
# Checkpoint file for model_4: written by ModelCheckpoint during fit and read back by load_weights.
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_4.hdf5"

In [None]:
# Save to `filepath` only when validation accuracy improves (save_best_only, mode='max').
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Integer targets are one-hot encoded; 25% of the data is held out for validation.
model_4.fit(X, to_categorical(y), epochs=2, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

In [None]:
# Restore the checkpointed weights (the best epoch by val_acc) before predicting.
model_4.load_weights(filepath, by_name=False)

#### Predict results and generate CSV

In [None]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

# Use this section's own preprocessing function: calling preprocess_model_3
# here only worked when the Model 3 cells had been run first.
# training=False reuses the tokenizer fitted on the training set, and
# max_length is forwarded so the test data is padded like the training data.
X = preprocess_model_4(data_test, tokenizer, max_length=max_length, training=False)

assert X[0][0].shape == (max_length,) and X[1][0].shape == (max_length,), "Check X dimensions"

In [None]:
# Per-class scores for every test pair; the bare name displays the array.
predicted = model_4.predict(X)
predicted

In [None]:
# Build the class id -> label name mapping under its own name. Overwriting
# map_dict with its inverse made this cell non-idempotent: a second run
# flipped the mapping back and gen_csv failed with a KeyError.
inverse_map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, inverse_map_dict)

### Model 5 - RoBERTa

In [None]:
# NOTE(review): fairseq is installed without a version pin, so the version
# pulled in can differ from one run of the notebook to the next.
!pip install fairseq

In [None]:
# Retrieve Dataset from CSV
# Only the unlabeled test file is loaded in this section; path_train is
# defined but not read here.
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

# dev_mode=False loads every row; n_dev is only used when dev_mode is True.
data_test = data_from_csv(path_test_no_label, n_dev=5000, dev_mode=False)

In [None]:
import torch
from fairseq.data.data_utils import collate_tokens

# Load the 'roberta.large.mnli' entry of the pytorch/fairseq hub repository.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
# Switch to evaluation mode - presumably disables dropout, the usual effect of eval() on a torch module; verify.
roberta.eval()

In [None]:
# Pair the two sentence columns row by row. zip() walks the rows in order,
# whereas Series[i] is a label lookup that only matches the i-th row while
# the frame still has its default 0..n-1 index (it breaks on a sampled or
# filtered frame).
batch_of_pairs = [[sentence_1, sentence_2]
                  for sentence_1, sentence_2
                  in zip(data_test.sentence_1, data_test.sentence_2)]

In [None]:
# Collects one integer class id per sentence pair, in input order.
predictions = list()
# Inference only: gradient tracking is switched off for the whole loop.
with torch.no_grad():
    # Encode a pair of sentences and make a prediction
    for i, pair in enumerate(batch_of_pairs):
      # Progress trace: one line per pair.
      print(i, "/", len(batch_of_pairs))
      tokens = roberta.encode(pair[0], pair[1])
      # argmax of the 'mnli' head output, converted to a plain Python int.
      prediction = roberta.predict('mnli', tokens).argmax().item()
      predictions.append(prediction)

In [None]:
import csv
from google.colab import files

# Class id -> label name for the RoBERTa predictions.
# NOTE(review): this order differs from map_dict used by the Keras models;
# it is assumed to match the 'mnli' head of roberta.large.mnli - confirm.
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
dict_data = [{'index': i, 'label': label_map[v]}
             for i, v in enumerate(predictions)]

csv_file = 'results_roberta.csv'
try:
    # newline='' is what the csv module expects; without it extra blank
    # rows appear on platforms that translate line endings.
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['index', 'label'])
        writer.writeheader()
        writer.writerows(dict_data)
except IOError:
    print("I/O error")
else:
    # Only offer the file for download when it was written successfully.
    files.download(csv_file)