# Trabajo Práctico NLP - ECI 2019

#### Alumno: Leandro Carreira
#### LU: 669/18

Se detalla a continuación el proceso usado para alcanzar el resultado de **66.168%** de acierto en el dataset Natural Language Inference (SNLI) en la competencia ECI 2019 - NLP:

https://www.kaggle.com/c/eci2019nlp/overview

Se redujo el modelo a lo **mínimo y necesario** usando fastText con **embeddings de dimensión 2** y **10 iteraciones**, alcanzando resultados similares a los del paper de [Gururangan et al., 2018](https://www.aclweb.org/anthology/N18-2017) que demuestran que existe un sesgo en el dataset SNLI que permite predecir las etiquetas de las hipótesis sin necesidad de usar las premisas.



# Data preprocessing

In [2]:
# From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings
import re
def normalize(s):
    """
    Lower-case a text and rewrite it as space-separated tokens.

    IPv4-looking sequences become the ``_ip_`` placeholder, common punctuation
    marks are padded with spaces, a few separator characters (newline
    included) are turned into a single space, and ``&``, ``@`` and every
    ASCII digit are spelled out in English.
    """
    text = s.lower()
    # Collapse dotted quads such as 10.0.0.1 into one placeholder token.
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', text)
    # Pad punctuation so every mark ends up as its own token.
    text = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', text)
    # Drop separators that carry no signal.
    text = re.sub(r'([\;\:\|•«\n])', ' ', text)
    # Spell out symbols and digits. No replacement text contains a character
    # that is itself a key, so one translate pass equals the chained replaces.
    spelled_out = {
        '&': ' and ',
        '@': ' at ',
        '0': ' zero ',
        '1': ' one ',
        '2': ' two ',
        '3': ' three ',
        '4': ' four ',
        '5': ' five ',
        '6': ' six ',
        '7': ' seven ',
        '8': ' eight ',
        '9': ' nine ',
    }
    return text.translate(str.maketrans(spelled_out))


In [3]:
# From read_data.py
import json
import csv

# --- Training split -------------------------------------------------------
train_sentences_dir = './snli_1.0_train_filtered.jsonl'
train_labels_dir = './snli_1.0_train_gold_labels.csv'
fasttext_train_data_dir = './data.train.txt'
fasttext_train_data_no_labels_dir = './data.train.no.labels.txt'

# --- Validation (dev) split -----------------------------------------------
val_sentences_dir = './snli_1.0_dev_filtered.jsonl'
val_labels_dir = './snli_1.0_dev_gold_labels.csv'
fasttext_val_data_dir = './data.val.txt'

# --- Test split (no gold labels file is configured) -------------------------
test_sentences_dir = './snli_1.0_test_filtered.jsonl'
fasttext_test_data_dir = './data.test.txt'


def _open_or_report(path, what):
    """Open ``path`` for reading; on failure print the load error and return None.

    ``what`` is the noun used in the message ('sentences' or 'labels').
    """
    try:
        return open(path, 'r')
    except OSError:
        print('Error loading {} at: {}'.format(what, path))
        return None


def create_data(sentences_dir, labels_dir=None, data_dir=None):
    """Convert an SNLI JSONL file into fastText text files.

    With ``labels_dir``: appends one ``__label__<gold_label> <normalized
    hypothesis>`` line per example to ``data_dir``.

    Without ``labels_dir``: appends one normalized hypothesis per line to
    ``data_dir`` and, in a second pass over the sentences file, one ``pairID``
    per line to a sibling file ending in ``.pair_id.txt``.

    Args:
        sentences_dir: path of the JSONL file (one JSON object per line).
        labels_dir: optional path of the CSV file with a ``gold_label`` column.
        data_dir: path of the output file. Output files are opened in append
            mode, so calling this twice for the same path duplicates lines.

    Returns:
        None. If an input file cannot be opened, an error line is printed and
        the function returns early.
    """
    sentences_data = _open_or_report(sentences_dir, 'sentences')
    if sentences_data is None:
        return

    # `with` guarantees every handle is closed, even if a line fails to parse.
    with sentences_data:
        if labels_dir:
            label_data = _open_or_report(labels_dir, 'labels')
            if label_data is None:
                return
            with label_data, open(data_dir, 'a') as fasttext_data:
                # zip() stops at the shorter input; both files are expected to
                # describe the same examples in the same order.
                pairs = zip(it_sentences(sentences_data), it_labels(label_data))
                for sentence, label in pairs:
                    fasttext_data.write(
                        '__label__{} {}\n'.format(label, normalize(sentence)))
            return

        with open(data_dir, 'a') as fasttext_data:
            for sentence in it_sentences(sentences_data):
                fasttext_data.write('{}\n'.format(normalize(sentence)))

    # Second pass: store the pair ids, line-aligned with the sentences file.
    sentences_data = _open_or_report(sentences_dir, 'sentences')
    if sentences_data is None:
        return
    # Strip a trailing '.txt' only when it is really there, instead of
    # blindly chopping four characters off the path.
    base = data_dir[:-4] if data_dir.endswith('.txt') else data_dir
    with sentences_data, open(base + '.pair_id.txt', 'a') as pair_id_data:
        for pair_id in it_pairid(sentences_data):
            pair_id_data.write('{}\n'.format(pair_id))
    
def it_sentences(sentence_data):
    """Lazily yield the ``sentence2`` field of each JSON line in ``sentence_data``."""
    yield from (json.loads(raw)['sentence2'] for raw in sentence_data)

def it_pairid(sentence_data):
    """Lazily yield the ``pairID`` field of each JSON line in ``sentence_data``."""
    yield from (json.loads(raw)['pairID'] for raw in sentence_data)

def it_labels(label_data):
    """Lazily yield the ``gold_label`` column of a CSV whose first line is a header."""
    yield from (row['gold_label'] for row in csv.DictReader(label_data))

def _ensure_data_file(name, data_path, sentences_path, labels_path=None):
    """Build the fastText file at ``data_path`` unless it can already be opened.

    Args:
        name: split name used in the printed messages ('Training', ...).
        data_path: fastText-formatted file to look for or create.
        sentences_path: JSONL source passed to ``create_data``.
        labels_path: optional gold-label CSV passed to ``create_data``.
    """
    try:
        # Opened only to probe that the file is readable; the `with` closes
        # the handle immediately instead of leaking it.
        with open(data_path, 'r'):
            pass
    except IOError:
        print('{} data not found. Creating new one... '.format(name))
        create_data(sentences_path, labels_path, data_dir=data_path)
        print('Done.')
    else:
        print('{} data succesfully loaded.'.format(name))


# For training
_ensure_data_file('Training', fasttext_train_data_dir,
                  train_sentences_dir, train_labels_dir)

# For unsupervised learning (no labels)
# try:
#     data = open(fasttext_train_data_no_labels_dir, 'r')
#     print('Training data 2 succesfully loaded.')
# except IOError:
#     print('Training data 2 not found. Creating new one... ')
#     create_data(train_sentences_dir, data_dir=fasttext_train_data_no_labels_dir)
#     print('Done.')

# For test
_ensure_data_file('Test', fasttext_test_data_dir, test_sentences_dir)

# For validation
_ensure_data_file('Validation', fasttext_val_data_dir,
                  val_sentences_dir, val_labels_dir)

Training data succesfully loaded.
Test data succesfully loaded.
Validation data succesfully loaded.


In [4]:
import fasttext

In [None]:
# First model: bigrams, 50-dim embeddings, 'hs' loss, 250 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=250,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='hs',
    thread=8,
)

In [18]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save_model("model_filename.bin")

In [102]:
#model = fasttext.load_model("model_filename.bin")

In [6]:
# Report the vocabulary size ("Vocabulario de N palabras" = "Vocabulary of N words").
print('Vocabulario de {} palabras'.format(len(model.words)))

Vocabulario de 29805 palabras


In [7]:
# Show the labels known to the trained model.
print('Labels:')
print(model.labels)

Labels:
['__label__entailment', '__label__contradiction', '__label__neutral']


# Let's find good hyperparameters:

**Obs:** FastText no brinda mucha flexibilidad a la hora de entrenar modelos, por lo que por cada variante de los hiperparámetros, deberá entrenarse un nuevo modelo desde cero.

Tampoco tenemos la posibilidad de testear el modelo a mitad de entrenamiento usando el validation set, por lo que solo al final del entrenamiento podemos calificarlo. 

## Buscando *wordNgrams*

#### n-grams: 5

In [13]:
# wordNgrams search: up to 5-grams, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=5,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)

In [19]:
# Evaluate on the validation file; the displayed tuple is consumed elsewhere
# in this notebook as (number of examples, P@1, R@1).
model.test('./data.val.txt')

(9842, 0.7291200975411501, 0.7291200975411501)

In [63]:
def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)

# Unpack the (N, P@1, R@1) tuple returned by model.test into the printer.
print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.653
R@1	0.653


#### n-grams: 1

In [7]:
# wordNgrams search: unigrams only, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)

In [8]:
def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


# Score the current model on the validation file.
print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### n-grams: 2

In [11]:
# wordNgrams search: up to bigrams, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=2,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)

In [12]:
def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


# Score the current model on the validation file.
print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.630
R@1	0.630


#### n-grams: 3

In [13]:
# wordNgrams search: up to trigrams, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=3,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)

In [14]:
def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


# Score the current model on the validation file.
print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.604
R@1	0.604


#### n-grams: 4

In [15]:
# wordNgrams search: up to 4-grams, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=4,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)

In [16]:
def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


# Score the current model on the validation file.
print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.576
R@1	0.576


### Análisis:

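Con *wordNgrams=1* se obtuvieron los mejores resultados (P@1 = 0.649, contra 0.630, 0.604 y 0.576 para 2, 3 y 4 respectivamente). **Nota:** las salidas que figuran bajo *n-grams: 5* provienen de celdas ejecutadas fuera de orden (`In [19]` e `In [63]`), por lo que probablemente no corresponden al modelo entrenado en `In [13]` y no se tienen en cuenta en esta comparación.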


Probemos con diferentes **tamaños de embeddings**.

## Buscando tamaño de Embeddings

> *More is less*

#### n-grams: 1
#### embedding size: 10 (2^10=1024 representaciones no parece "poco" para este problema)

In [17]:
# Embedding-size search: unigrams, dim=10, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=10,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.648
R@1	0.648


#### n-grams: 1
#### embedding size: 20 

In [18]:
# Embedding-size search: unigrams, dim=20, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### n-grams: 1
#### embedding size: 50 

In [19]:
# Embedding-size search: unigrams, dim=50, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=50,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.648
R@1	0.648


#### n-grams: 1
#### embedding size: 100

In [20]:
# Embedding-size search: unigrams, dim=100, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=100,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.647
R@1	0.647


#### n-grams: 1
#### embedding size: 200

In [21]:
# Embedding-size search: unigrams, dim=200, 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=200,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.566
R@1	0.566


### Lo mismo, pero con más iteraciones (para ser justos con los embeddings más grandes)

#### n-grams: 1
#### embedding size: 10

In [23]:
# Longer training: unigrams, dim=10, 2000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=2000,
    wordNgrams=1,
    bucket=200000,
    dim=10,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### n-grams: 1
#### embedding size: 20 

In [24]:
# Longer training: unigrams, dim=20, 2000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=2000,
    wordNgrams=1,
    bucket=200000,
    dim=20,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.648
R@1	0.648


#### n-grams: 1
#### embedding size: 50 

In [25]:
# Longer training: unigrams, dim=50, 2000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=2000,
    wordNgrams=1,
    bucket=200000,
    dim=50,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### n-grams: 1
#### embedding size: 100

In [26]:
# Longer training: unigrams, dim=100, 2000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=2000,
    wordNgrams=1,
    bucket=200000,
    dim=100,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.574
R@1	0.574


#### n-grams: 1
#### embedding size: 200

In [None]:
# Longer training: unigrams, dim=200, 2000 epochs, single thread.
# Author's note: this machine is too small for this run.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=2000,
    wordNgrams=1,
    bucket=200000,
    dim=200,
    loss='softmax',
    thread=1,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

#### n-grams: 1
#### embedding size: 5

In [8]:
# Tiny embeddings: unigrams, dim=5, 5000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=5000,
    wordNgrams=1,
    bucket=200000,
    dim=5,
    loss='softmax',
    thread=8,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


> # *Supervivencia del más apto*
>
> # *Sobresaliencia del más simple*
>
> what-

#### Embedding de dimensión 1 (2 valores posibles)

In [9]:
# Tiny embeddings: unigrams, dim=1, 5000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=5000,
    wordNgrams=1,
    bucket=200000,
    dim=1,
    loss='softmax',
    thread=8,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.506
R@1	0.506


#### Razonando (superficialmente) resultados

Acierta con un 50% de probabilidades, pero considerando que son 3 categorías bien repartidas, estamos bastante por encima de los 33.33% de una respuesta al azar.

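Si puede representar solo dos valores, podría "categorizar" la palabra en una u otra categoría, pero como solo existen dos valores posibles (0 y 1), no será posible corresponder a las 3 categorías del problema. (En rigor, cada componente del embedding es un número real y no un bit; la limitación viene de que, con una sola dimensión y una capa de salida sin término de sesgo —como, según entiendo, es el caso de fastText—, el clasificador solo puede distinguir el signo del promedio de la oración, de modo que a lo sumo dos de las tres categorías resultan predecibles.)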

#### Embedding de dimensión 2 (2^2 valores posibles)

In [10]:
# Tiny embeddings: unigrams, dim=2, 5000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=5000,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=8,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### Razonando (superficialmente) resultados

En este caso, un embedding de dimensión 2 puede representar 4 elementos, por lo que podría haber una correspondencia directa entre cada embedding (de cada palabra) y la categoría "asociada".

Si cada palabra tiene un sesgo hacia alguna de las categorías (ej. es más usada en '*contradicciones*'), **aportará información favorable** para la categoría correspondiente al ser usada en el **promedio** a calcular para la oración.

Para palabras comunes podría usar una 4ta categoría que da la misma probabilidad a las 3, y para palabras sesgadas, debería elegir entre alguno de los otros 3 valores posibles (**(0,0)**, **(0,1)** y **(1,0)**, ó **(1,1)** para la 4ta categoría).


#### Embedding de dimensión 3 (2^3 valores posibles)

In [11]:
# Tiny embeddings: unigrams, dim=3, 5000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=5000,
    wordNgrams=1,
    bucket=200000,
    dim=3,
    loss='softmax',
    thread=8,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


#### Razonando (superficialmente) resultados

No se observan mejoras por sobre dimensión 2, a pesar de que pueda representar 8 valores en total con sus embeddings, el doble que antes.

Esto indica que es suficiente con embeddings de dos dimensiones para capturar el sesgo de cada palabra (aunque ignorando otra posible información).

#### Solo queda ver si quedó alguna palabra fuera del bucket, o si puede hacerse algo mejor con más epochs

In [14]:
# Report the vocabulary size of the current model to compare it with `bucket`.
print('Vocabulario de {} palabras'.format(len(model.words)))

Vocabulario de 29805 palabras


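El vocabulario es de menos de 30.000 palabras, por lo que el bucket usado hasta el momento de 200.000 es más que suficiente. (Nota: según la documentación de fastText, `bucket` fija la cantidad de buckets de hashing para n-gramas y no la capacidad del vocabulario; con *wordNgrams=1* este parámetro no debería ser un factor limitante.)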

In [13]:
# Epoch search for dim=2: 50000 epochs (x10 the previous run).
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=50000,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.647
R@1	0.647


Se ven indicios de overfitting, reduzco epochs:

In [16]:
# Epoch search for dim=2: 1000 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=1000,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


In [17]:
# Epoch search for dim=2: 500 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=500,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


In [18]:
# Epoch search for dim=2: 250 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=250,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


In [19]:
# Epoch search for dim=2: 100 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=100,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


In [20]:
# Epoch search for dim=2: 50 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=50,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.649
R@1	0.649


In [56]:
# Epoch search for dim=2: 10 epochs.
model = fasttext.train_supervised(
    fasttext_train_data_dir,
    lr=1.0,
    epoch=10,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Total sentences	9842
P@1	0.652
R@1	0.652


# Conclusión

Se decide utilizar el modelo más simple de embeddings de 2 dimensiones que demuestra el gran sesgo en el training set que permite predecir resultados con más del 65% de acierto, mientras que al azar serían del 33%, con un simple modelo entrenado en pocos segundos. 

# Entrenando modelo con toda la data (train + validation set)

In [67]:
# Final model: same settings as the 10-epoch, dim=2 run, but trained on the
# combined train+validation file (that file is not created in this notebook).
fasttext_train_ALL_data_dir = './data.train+val.txt'

model = fasttext.train_supervised(
    fasttext_train_ALL_data_dir,
    lr=1.0,
    epoch=10,
    wordNgrams=1,
    bucket=200000,
    dim=2,
    loss='softmax',
    thread=4,
)

# The printed warning says (in Spanish) that this score does not generalize,
# because the evaluation data was part of the training data.
print('Estos resultados NO generalizan, ya que evalúa con la misma data de entrenamiento:')


def print_results(N, p, r):
    """Print the example count ``N`` and the P@1 / R@1 scores, 3 decimals each."""
    report = (
        "Total sentences\t" + str(N),
        "P@{}\t{:.3f}".format(1, p),
        "R@{}\t{:.3f}".format(1, r),
    )
    for line in report:
        print(line)


print_results(*model.test('data.val.txt'))

Estos resultados NO generalizan, ya que evalúa con la misma data de entrenamiento:
Total sentences	9842
P@1	0.668
R@1	0.668


# Usando modelo entrenado

In [106]:
# Ask the model for its top label on a hand-written sentence (one-item list).
model.predict(['the kids are frowning'])

([['__label__contradiction']], array([[1.00001979]]))

In [107]:
# Same query with the last word truncated, to compare against the previous cell.
model.predict(['the kids are frown'])

([['__label__contradiction']], array([[0.99987841]]))

In [8]:
# Preview: print the first four test sentences with the model's top-3 labels.
from itertools import islice

# Prefix fastText puts in front of every label; stripped before display.
_LABEL_PREFIX = '__label__'

try:
    test_data = open(fasttext_test_data_dir, 'r')
except IOError:
    # The message names the test split, which is the file actually opened.
    print('Test data not found.')
else:
    print('Test data succesfully loaded.')
    print()
    # `with` closes the handle once the preview is done.
    with test_data:
        for sentence in islice(test_data, 4):
            # Strip only the line terminator: predict() must not receive a
            # newline, and a final line without one must not lose a character.
            text = sentence.rstrip('\n')
            print(text)
            label, prob = model.predict(text, k=3)
            print('Category: {}\nProb: {}'.format(label[0][len(_LABEL_PREFIX):], prob))
            print(label)
            print()


Validation data succesfully loaded.

the church has cracks in the ceiling . 
Category: contradiction
Prob: [9.99913692e-01 1.16334704e-04]
('__label__contradiction', '__label__entailment')

the church is filled with song . 
Category: neutral
Prob: [0.93546563 0.06456374]
('__label__neutral', '__label__entailment')

a choir singing at a baseball game . 
Category: entailment
Prob: [0.57135916 0.24671638 0.18195307]
('__label__entailment', '__label__contradiction', '__label__neutral')

the woman is young . 
Category: neutral
Prob: [0.37215713 0.35436693 0.27351052]
('__label__neutral', '__label__contradiction', '__label__entailment')



In [12]:
# Preview: print the first four rows of the submission (pairID,label) to stdout.
from itertools import islice

fasttext_test_data_pair_id_dir = './data.test.pair_id.txt'
# Prefix fastText puts in front of every label; stripped before display.
_LABEL_PREFIX = '__label__'

try:
    test_data = open(fasttext_test_data_dir, 'r')
    try:
        test_data_pair_id = open(fasttext_test_data_pair_id_dir, 'r')
    except IOError:
        # Do not leak the first handle when the second file is missing.
        test_data.close()
        raise
except IOError:
    print('Test data not found.')
else:
    print('Test data succesfully loaded.')
    print()
    header = 'pairID,gold_label'
    print(header)
    with test_data, test_data_pair_id:
        # Both files are expected to be line-aligned: line k of the pair-id
        # file identifies line k of the sentences file.
        rows = zip(test_data_pair_id, test_data)
        for pairid, sentence in islice(rows, 4):
            label, prob = model.predict(sentence.rstrip('\n'), k=1)
            print('{},{}'.format(pairid.rstrip('\n'), label[0][len(_LABEL_PREFIX):]))

Test data succesfully loaded.

pairID,gold_label
2677109430.jpg#1r1n,contradiction
2677109430.jpg#1r1e,neutral
2677109430.jpg#1r1c,entailment
6160193920.jpg#4r1n,neutral


# Creación de archivo para subida

In [62]:
# Write the submission file: one "pairID,label" row per test sentence.
fasttext_test_data_pair_id_dir = './data.test.pair_id.txt'
# Prefix fastText puts in front of every label; stripped before writing.
_LABEL_PREFIX = '__label__'

try:
    test_data = open(fasttext_test_data_dir, 'r')
    try:
        test_data_pair_id = open(fasttext_test_data_pair_id_dir, 'r')
    except IOError:
        # Do not leak the first handle when the second file is missing.
        test_data.close()
        raise
except IOError:
    print('Test data not found.')
else:
    print('Test data succesfully loaded.')
    print()
    header = 'pairID,gold_label\n'
    # 'w' (not 'a'): re-running the cell must replace the file, otherwise a
    # second header and duplicate rows get appended to the submission.
    with test_data, test_data_pair_id, open('test_solved.csv', 'w') as test_solved:
        test_solved.write(header)
        # Both inputs are expected to be line-aligned; zip() stops at the
        # shorter one.
        for pairid, sentence in zip(test_data_pair_id, test_data):
            label, prob = model.predict(sentence.rstrip('\n'), k=1)
            test_solved.write(
                '{},{}\n'.format(pairid.rstrip('\n'), label[0][len(_LABEL_PREFIX):]))

Test data succesfully loaded.



# Resources

Hierarchical Softmax
https://www.youtube.com/watch?v=B95LTf2rVWM

Fasttext documentation
https://github.com/facebookresearch/fastText