In [1]:
%matplotlib inline

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import urllib
import sys
import os
import zipfile
import pandas as pd
from keras_preprocessing.sequence import pad_sequences

In [4]:
""" Use the drive """
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Useful functions 

In [0]:
def unzip_single_file(zip_file_name, output_file_name):
    """
        If the outFile is already created, don't recreate
        If the outFile does not exist, create it from the zipFile
    """
    if not os.path.isfile(output_file_name):
        with open(output_file_name, 'wb') as out_file:
            with zipfile.ZipFile(zip_file_name) as zipped:
                for info in zipped.infolist():
                    if output_file_name in info.filename:
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            return

In [0]:
def create_glove_wordmap(glove_zip_file = "glove.6B.zip",
                         glove_vectors_file = "glove.6B.50d.txt"
                         ):
  
    from urllib.request import urlretrieve
    #large file - 862 MB
    if (not os.path.isfile(glove_zip_file) and
        not os.path.isfile(glove_vectors_file)):
        urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip", 
                    glove_zip_file)
        
    unzip_single_file(glove_zip_file, glove_vectors_file)

    glove_wordmap = {}

    with open(glove_vectors_file, "r") as glove:
        for line in glove:
            name, vector = tuple(line.split(" ", 1))
            glove_wordmap[name] = np.fromstring(vector, sep=" ")

    return glove_wordmap

In [0]:
def sentence2sequence(sentence, wordmap, visualize=False):
    """
    
    Turns an input sentence into an (n,d) matrix, 
        where n is the number of tokens in the sentence
        and d is the number of dimensions each word vector has.
        
    """
    tokens = sentence.lower().split(" ")
    rows = []
    words = []
    #Greedy search for tokens
    for token in tokens:
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in wordmap:
                rows.append(wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                i = i-1
    
    if visualize: return rows, words
    else: return rows

In [0]:
def visualize(sentence, wordmap):
    """
        Visualize GloVe Embeddings in a sentence
    """
    rows, words = sentence2sequence(sentence, wordmap, visualize=True)
    mat = np.vstack(rows)
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    shown = ax.matshow(mat, aspect="auto")
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    fig.colorbar(shown)
    
    ax.set_yticklabels([""]+words)
    plt.show()

In [0]:
def gen_csv(predicted, label_map, verbosity=False):
    """
        Generate CSV with predicted results and
        required form from the return of predict function and 
        the maping dictionnary {int: 'value'} 
    """
    import csv
    from google.colab import files
    predicted_results = np.argmax(predicted, axis=1)
    if verbosity: print(label_map)
    dict_data=[]
    for i, v in enumerate(predicted_results): 
      d={'index':i, 'label':label_map[v]}
      dict_data.append(d)
    if verbosity: print(dict_data)
    csv_file='results.csv'
    try:
        with open(csv_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['index', 'label'])
            writer.writeheader()
            for data in dict_data:
                writer.writerow(data)
    except IOError:
        print("I/O error")
    files.download('results.csv')

In [0]:
def data_from_csv(path, dev_mode=True, n_dev = 5000):
    """ 
        retrieve data from CSV in a pandas.Dataframe
    """
    # Entire dataset for training/validation
    dataset = pd.read_csv(path, sep="\t")

    # Dataset for dev
    # less items to speed up computations
    if dev_mode: 
      dataset = dataset.sample(n=n_dev)
    
    return dataset

In [0]:
def process_dataset(dataset, training=True, map_dict={'neutral':2, 'entailment':1, 'contradiction':0}):
    """
        Return a preprocessed dataframe from the dataset sent in args
        If the dataset doesn't have a label column set training to False
    """
    import string # to get rid of the punctuation

    n_dataset = dataset.copy()

    n_dataset['sentence_1'] = [x.translate(str.maketrans('', '', string.punctuation)).lower()
                             for x in dataset.sentence_1.values]

    n_dataset['sentence_2'] = [x.translate(str.maketrans('', '', string.punctuation)).lower()
                             for x in dataset.sentence_2.values]    

    n_dataset['sentence_1'] = [np.array(sentence2sequence(x, glove_wordmap)) \
                             for x in dataset.sentence_1.values]

    n_dataset['sentence_2'] = [np.array(sentence2sequence(x, glove_wordmap)) \
                             for x in dataset.sentence_2.values]

    if training:
        n_dataset['target'] = n_dataset['label'].replace(map_dict)

    return n_dataset

## Model implementation

In [10]:
# Import `Sequential` from `keras.models`
from keras.models import Sequential
# Import `Dense` from `keras.layers`
import keras
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Flatten, \
                          Bidirectional, Concatenate, GlobalAveragePooling1D

from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import StandardScaler


Using TensorFlow backend.


### Model 1

#### Preprocess Dataset

In [0]:
# Retrieve Embedded Vectors from Glove
glove_wordmap = create_glove_wordmap()

In [0]:
# Retrieve Dataset from CSV 
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [13]:
map_dict={'neutral':2, 'entailment':1, 'contradiction':0}
dataset_processed = process_dataset(dataset, map_dict=map_dict)
print("Processed ! ")

Processed ! 


In [0]:
dataset_processed.head()

Unnamed: 0,index,sentence_1,sentence_2,label,target
0,0,"[[1.0997, -1.0101, -0.44778, -0.014276, 0.2596...","[[0.15882, -0.27394, 0.25375, 0.76122, 0.30715...",neutral,2
1,1,"[[-0.0010919, 0.33324, 0.35743, -0.54041, 0.82...","[[-0.0010919, 0.33324, 0.35743, -0.54041, 0.82...",entailment,1
2,2,"[[0.31474, 0.41662, 0.1348, 0.15854, 0.88812, ...","[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,...",entailment,1
3,3,"[[0.68938, -0.10644, 0.17083, -0.37583, 0.7517...","[[0.53074, 0.40117, -0.40785, 0.15444, 0.47782...",entailment,1
4,4,"[[-0.80924, -0.030977, 0.5102, -0.75298, 0.490...","[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -...",neutral,2


In [0]:
# On pad ici, sinon ca fait planter la RAM quand on utilise toutes les 
# lignes du CSV
pad1 = pad_sequences(dataset_processed['sentence_1'].values, maxlen=80)
pad2 = pad_sequences(dataset_processed['sentence_2'].values, maxlen=80)

X = [pad1, pad2]
y = dataset_processed['target'].values

# (Nb inputs, nb words, nb dim)
assert X[0][0].shape == (80, 50), "Check the shape of your data"

#### Model fitting

In [14]:
inputs1 = Input(shape=(80,50))
inputs2 = Input(shape=(80,50))

lstm_layer_1 = Bidirectional(LSTM(256, return_sequences=True))
x1 = lstm_layer_1(inputs1)
x2 = lstm_layer_1(inputs2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

lstm_layer_2 = LSTM(128)
x1 = lstm_layer_2(x1)
x2 = lstm_layer_2(x2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_1 = Model(inputs=[inputs1, inputs2], outputs=predictions)
model_1.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_1.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 80, 50)       0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 80, 512)      628736      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 80, 512)      0           bidirectional_1[0][0]            
                                                                 bidir

In [0]:
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_1.hdf5"

In [0]:
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_1.fit(X, to_categorical(y), epochs=1, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 294496 samples, validate on 98166 samples
Epoch 1/1






Epoch 00001: val_acc improved from -inf to 0.45989, saving model to /content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_1.hdf5


<keras.callbacks.History at 0x7f3ff0fca860>

In [17]:
model_1.load_weights(filepath, by_name=False)









#### Predict results and generate CSV

In [18]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

data_test_processed = process_dataset(data_test, training=False)
print(data_test_processed.head())
X = [pad_sequences(data_test_processed['sentence_1'].values, maxlen=80), pad_sequences(data_test_processed['sentence_2'].values, maxlen=80)]

assert X[0][0].shape == (80, 50), "Check shape of your inputs"

   index  ...                                         sentence_2
0      0  ...  [[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,...
1      1  ...  [[0.41617, 0.0086969, -0.045779, -0.453, 0.396...
2      2  ...  [[0.36274, -0.033799, 0.73714, -0.41275, 0.178...
3      3  ...  [[0.59766, -0.11836, -0.48428, 0.48654, -0.472...
4      4  ...  [[0.23158, 0.69964, 0.43878, -0.31633, 0.18509...

[5 rows x 3 columns]


In [19]:
predicted = model_1.predict(X)
predicted

array([[0.3359165 , 0.349552  , 0.31453148],
       [0.23123178, 0.41849878, 0.35026947],
       [0.20817226, 0.39915594, 0.3926718 ],
       ...,
       [0.43202537, 0.20403525, 0.3639394 ],
       [0.36197454, 0.35931134, 0.27871412],
       [0.19766548, 0.38119608, 0.42113847]], dtype=float32)

In [0]:
map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, map_dict)

### Model 2

#### Pre-process dataset

In [0]:
# Retrieve Embedded Vectors from Glove
glove_wordmap = create_glove_wordmap()

In [0]:
# Retrieve Dataset from CSV 
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

dataset = data_from_csv(path_train, n_dev=5000, dev_mode=True)

In [0]:
map_dict={'neutral':2, 'entailment':1, 'contradiction':0}
dataset_processed = process_dataset(dataset, map_dict=map_dict)
print("Processed ! ")

Processed ! 


In [0]:
dataset_processed.head()

Unnamed: 0,index,sentence_1,sentence_2,label,target
116430,116430,"[[-0.58475, -0.37703, -0.18568, 0.012511, -0.2...","[[-0.58475, -0.37703, -0.18568, 0.012511, -0.2...",entailment,1
230523,230523,"[[-0.90402, -0.5696, -0.42533, 0.10271, 0.1714...","[[0.68938, -0.10644, 0.17083, -0.37583, 0.7517...",neutral,2
127138,127138,"[[0.40991, 0.081369, 0.29962, -0.2535, 0.30073...","[[0.61183, -0.22072, -0.10898, -0.052967, 0.50...",contradiction,0
210803,210803,"[[0.45323, 0.059811, -0.10577, -0.333, 0.72359...","[[-0.20092, -0.060271, -0.61766, -0.8444, 0.57...",entailment,1
384477,384477,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -...","[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -...",contradiction,0


In [0]:
# On pad ici, sinon ca fait planter la RAM quand on utilise toutes les 
# lignes du CSV
X = [pad_sequences(dataset_processed['sentence_1'].values, maxlen=80), pad_sequences(dataset_processed['sentence_2'].values, maxlen=80)]
y = dataset_processed['target'].values

# (Nb inputs, nb words, nb dim)
assert X[0][0].shape == (80, 50), "Check the shape of your data"

#### Model fitting

In [0]:
inputs1 = Input(shape=(80,50))
inputs2 = Input(shape=(80,50))

lstm_layer_1 = Bidirectional(LSTM(256))
x1 = lstm_layer_1(inputs1)
x2 = lstm_layer_1(inputs2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_2 = Model(inputs=[inputs1, inputs2], outputs=predictions)
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_2.summary()

Model: "model_30"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_69 (InputLayer)           (None, 80, 50)       0                                            
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 512)          628736      input_69[0][0]                   
                                                                 input_70[0][0]                   
__________________________________________________________________________________________________
dropout_24 (Dropout)            multiple             0           bidirectional_6[0][0]            
                                                                 bidirectional_6[1][0]            
                                                                 dense_61[0][0]            

In [0]:
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_2.hdf5"
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_2.fit(X, to_categorical(y), epochs=1, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

Train on 3750 samples, validate on 1250 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.37280, saving model to /content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_2.hdf5
Epoch 2/3

Epoch 00002: val_acc improved from 0.37280 to 0.38320, saving model to /content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_2.hdf5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.38320


<keras.callbacks.History at 0x7fd92748a8d0>

#### Predict results and generate CSV

In [0]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

data_test_processed = process_dataset(data_test, training=False)

X = [pad_sequences(data_test_processed['sentence_1'].values, maxlen=80), pad_sequences(data_test_processed['sentence_2'].values, maxlen=80)]

assert X[0][0].shape == (80, 50), "Check shape of your inputs"

In [0]:
predicted = model_2.predict(X)
predicted

array([[0.37925217, 0.33278218, 0.2879657 ],
       [0.30615047, 0.35224319, 0.34160632],
       [0.21302755, 0.3802407 , 0.4067317 ],
       ...,
       [0.44262403, 0.31281003, 0.24456593],
       [0.3221449 , 0.3657318 , 0.3121234 ],
       [0.34165305, 0.31705436, 0.3412926 ]], dtype=float32)

In [0]:
map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, map_dict)

### Model 3

#### Data Preprocessing

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D

def preprocess_model_3(dataset, tokenizer,
                       vocab_size = 50000, max_length = 80, 
                       training=True, 
                       map_dict={'neutral':2, 'entailment':1, 'contradiction':0}):
    """
        Preprocess data for model_3
    """
     
    n_dataset = dataset.copy()
    
    if training:
        tokenizer.fit_on_texts(dataset.sentence_1.values + dataset.sentence_2.values)

    sentence1_seq = tokenizer.texts_to_sequences(n_dataset.sentence_1.values)
    sentence2_seq = tokenizer.texts_to_sequences(n_dataset.sentence_2.values)
    sentence2_seq_padded = pad_sequences(sentence2_seq, maxlen=max_length)
    sentence1_seq_padded = pad_sequences(sentence1_seq, maxlen=max_length)

    X = [sentence1_seq_padded,sentence2_seq_padded ]
    
    if training: 
        y = dataset['label'].replace(map_dict)
        return X, y
    else: 
        return X

In [0]:
# Retrieve Dataset from CSV 
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [0]:
vocab_size = 50000
map_dict = {'neutral':2, 'entailment':1, 'contradiction':0}
tokenizer = Tokenizer(num_words=vocab_size)
max_length = 80

X, y = preprocess_model_3(dataset, tokenizer, max_length=max_length)

In [0]:
assert X[0][0].shape == (max_length,), "Check your input"

#### Model fitting

In [35]:
inputs1 = Input(shape=(max_length,))
inputs2 = Input(shape=(max_length,))

embedding_layer = Embedding(vocab_size, 128, input_length=max_length)
emb_out1 = embedding_layer(inputs1)
emb_out2 = embedding_layer(inputs2)

lstm_layer = LSTM(128)
x1 = lstm_layer(emb_out1)
x2 = lstm_layer(emb_out2)

dropout_layer = Dropout(0.3)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_3 = Model(inputs=[inputs1, inputs2], outputs=predictions)
model_3.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_3.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 80)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 80)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 80, 128)      6400000     input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 128)          131584      embedding_2[0][0]          

In [0]:
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_3.hdf5"

In [0]:
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_3.fit(X, to_categorical(y), epochs=2, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

In [0]:
model_3.load_weights(filepath, by_name=False)

#### Predict results and generate CSV

In [0]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

X = preprocess_model_3(data_test, tokenizer, training=False)

assert X[0][0].shape == (max_length,) and X[1][0].shape == (max_length,), "Check X dimensions"

In [40]:
predicted = model_3.predict(X)
predicted

array([[0.24822998, 0.35178694, 0.3999831 ],
       [0.26851237, 0.34763318, 0.38385442],
       [0.09854436, 0.24954228, 0.65191334],
       ...,
       [0.21078224, 0.10045719, 0.6887606 ],
       [0.27909794, 0.2581422 , 0.4627598 ],
       [0.08310273, 0.3540999 , 0.5627974 ]], dtype=float32)

In [0]:
map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, map_dict)

### Model 4

#### Data Preprocessing

In [0]:
def preprocess_model_4(dataset, tokenizer,
                       vocab_size = 50000, max_length = 80, 
                       training=True, 
                       map_dict={'neutral':2, 'entailment':1, 'contradiction':0}):
    """
        Preprocess data for model_4
    """
     
    n_dataset = dataset.copy()
    
    if training:
        tokenizer.fit_on_texts(dataset.sentence_1.values + dataset.sentence_2.values)

    sentence1_seq = tokenizer.texts_to_sequences(n_dataset.sentence_1.values)
    sentence2_seq = tokenizer.texts_to_sequences(n_dataset.sentence_2.values)
    sentence2_seq_padded = pad_sequences(sentence2_seq, maxlen=max_length)
    sentence1_seq_padded = pad_sequences(sentence1_seq, maxlen=max_length)

    X = [sentence1_seq_padded,sentence2_seq_padded ]
    
    if training: 
        y = dataset['label'].replace(map_dict)
        return X, y
    else: 
        return X

In [0]:
# Retrieve Dataset from CSV 
path_train="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_train.csv"
path_test_no_label="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/dataset_test_no_labels.csv"

dataset = data_from_csv(path_train, n_dev=5000, dev_mode=False)

In [0]:
vocab_size = 50000
map_dict = {'neutral':2, 'entailment':1, 'contradiction':0}
tokenizer = Tokenizer(num_words=vocab_size)
max_length = 80

X, y = preprocess_model_4(dataset, tokenizer, max_length=max_length)

In [0]:
assert X[0][0].shape == (max_length,), "Check your input"

#### Model fitting

In [49]:
inputs1 = Input(shape=(max_length,))
inputs2 = Input(shape=(max_length,))

embedding_layer = Embedding(vocab_size, 128, input_length=max_length)
emb_out1 = embedding_layer(inputs1)
emb_out2 = embedding_layer(inputs2)

lstm_layer = Bidirectional(LSTM(128))
x1 = lstm_layer(emb_out1)
x2 = lstm_layer(emb_out2)

dropout_layer = Dropout(0.3)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

dense = Dense(64, activation='relu')
x1 = dense(x1)
x2 = dense(x2)

dropout_layer = Dropout(0.2)
x1 = dropout_layer(x1)
x2 = dropout_layer(x2)

x = Concatenate(axis=-1)([x1,x2])
predictions = Dense(3, activation='softmax')(x)

model_4 = Model(inputs=[inputs1, inputs2], outputs=predictions)
model_4.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

model_4.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 80)           0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 80)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 80, 128)      6400000     input_9[0][0]                    
                                                                 input_10[0][0]                   
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 256)          263168      embedding_3[0][0]          

In [0]:
filepath="/content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_4.hdf5"

In [51]:
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
model_4.fit(X, to_categorical(y), epochs=2, batch_size=64, verbose=1, validation_split=0.25, callbacks=[checkpointer])

Train on 294496 samples, validate on 98166 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.54733, saving model to /content/drive/My Drive/MatMax-ING5/Deep Learning/projet_kaggle/data/weights-embedded_4.hdf5
Epoch 2/2

Epoch 00002: val_acc did not improve from 0.54733


<keras.callbacks.History at 0x7f8ac2aaacc0>

In [0]:
model_4.load_weights(filepath, by_name=False)

#### Predict results and generate CSV

In [0]:
data_test = data_from_csv(path_test_no_label, dev_mode=False)

X = preprocess_model_3(data_test, tokenizer, training=False)

assert X[0][0].shape == (max_length,) and X[1][0].shape == (max_length,), "Check X dimensions"

In [55]:
predicted = model_4.predict(X)
predicted

array([[0.3732662 , 0.32626432, 0.30046955],
       [0.32090116, 0.26149133, 0.41760755],
       [0.0603015 , 0.5184491 , 0.42124936],
       ...,
       [0.31232375, 0.17211495, 0.5155613 ],
       [0.2549939 , 0.37340778, 0.37159833],
       [0.05829293, 0.5696199 , 0.3720871 ]], dtype=float32)

In [0]:
map_dict = {v: k for k, v in map_dict.items()}
gen_csv(predicted, map_dict)