# Test embedalign with SentEval 

This notebook will allow you to test EmbedAlign using SentEval. In particular, this also works on **CPUs** :D

* Dependencies:
    * Python 3.5 with NumPy/SciPy
    * Pytorch 
    * Tensorflow 1.5.0  (for CPUs or GPUs depending on how you plan to run it)
        * For example in MacOS: 
        ```
        pip install https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
        ```
    * scikit-learn>=0.18.0
    * dill>=0.2.7.1


* Install `dgm4nlp` by following the instructions [here](https://github.com/uva-slpl/dgm4nlp), we highly recommend the use of `virtualenv`.

In the same `virtualenv`, do the following:

* Clone repo from FAIR github
```
    git clone https://github.com/facebookresearch/SentEval.git
    cd SentEval/
```

* Install senteval
```
    python setup.py install
```

* Download datasets (it takes some time...)
    * these are downstream tasks
    * new Senteval also has probing tasks (https://github.com/facebookresearch/SentEval/tree/master/data/probing) for evaluating linguistic properties of your embeddings. 

```
    cd data/downstream/
    ./get_transfer_data.bash
```

* Download the [pretrained EmbedAlign model](https://surfdrive.surf.nl/files/index.php/s/9M4h5zqmYETSmf3)


* The following code evaluates EmbedAlign embeddings, pretrained on en-fr Europarl, on several downstream NLP tasks.



In [4]:
from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
#import data 
# data.py is part of Senteval and it is used for loading word2vec style files
import senteval
import tensorflow as tf
import logging
from collections import defaultdict
import dill
import dgm4nlp

In [5]:
class dotdict(dict):
    """
    Dictionary whose entries can also be read, written and removed
    using attribute (dot) notation.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails;
        # a missing key yields None instead of raising.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value as a dictionary item.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the item (KeyError if it is absent).
        dict.__delitem__(self, name)

class EmbeddingExtractor:
    """
    This will compute a forward pass with the inference model of EmbedAlign and
        give you the variational mean for each L1 word in the batch.

    Note that this takes monolingual L1 sentences only (at this point we have a trained EmbedAlign model
        which dispenses with L2 sentences).

    You don't really want to touch anything in this class.
    """

    def __init__(self, graph_file, ckpt_path, config=None):
        """
        Import the computation graph and restore the trained parameters.

        :param graph_file: path to the meta graph file passed to `tf.train.import_meta_graph`
        :param ckpt_path: checkpoint path passed to the saver's `restore`
        :param config: optional configuration forwarded to `tf.Session`
        """
        g1 = tf.Graph()
        self.meta_graph = graph_file
        self.ckpt_path = ckpt_path

        self.softmax_approximation = 'botev-batch'  # default
        with g1.as_default():
            self.sess = tf.Session(config=config, graph=g1)
            # load architecture computational graph
            self.new_saver = tf.train.import_meta_graph(self.meta_graph)
            # restore checkpoint
            self.new_saver.restore(self.sess, self.ckpt_path)
            self.graph = g1
            # retrieve input placeholder
            self.x = self.graph.get_tensor_by_name("X:0")
            # retrieve training switch (True: training, False: test)
            self.training_phase = self.graph.get_tensor_by_name("training_phase:0")

    def get_z_embedding_batch(self, x_batch):
        """
        Run the tensor named "z:0" for a batch of word ids with the training switch off.

        :param x_batch: is np array of shape [batch_size, longest_sentence] containing the unique ids of words

        :returns: [batch_size, longest_sentence, z_dim]
        :raises ValueError: if the restored graph has no tensor named "z:0"
        """
        # Only the lookup is guarded: errors raised while running the session
        # must surface as they are instead of being reported as a missing tensor.
        try:
            z_mean = self.graph.get_tensor_by_name("z:0")
        except KeyError as err:
            raise ValueError('tensor Z not in graph!') from err

        feed_dict = {
            self.x: x_batch,
            self.training_phase: False,
        }
        return self.sess.run(z_mean, feed_dict=feed_dict)

This is how you interface with SentEval. The only thing you need to change is the set of paths to the trained model in the main block at the end.

In [7]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#



# Set PATHs
# path to senteval
#PATH_TO_SENTEVAL = '../'



# import SentEval
#sys.path.insert(0, PATH_TO_SENTEVAL)

# SentEval parameters.
# We use logistic regression (usepytorch: False) with kfold 10.
# Any extra information your model needs for initialization can be added here,
# for example the path to a dictionary of indices or hyperparameters.
# This object is passed to both the batcher and the prepare functions, and it
# is a dotdict so that its entries can be accessed with dot notation.
params_senteval = dotdict(
    task_path='',
    usepytorch=False,
    kfold=10,
    ckpt_path='',
    tok_path='',
    extractor=None,
    tks1=None,
)
# Below is the config for the NN classifier; it is left disabled because we use
# scikit-learn logistic regression with 10 kfold (usepytorch = False).
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}



def prepare(params, samples):
    """
    Load the trained TensorFlow model and the tokenizer used at training time.

    The results are stored on `params` so that `batcher` can use them:
    `params.extractor` holds the EmbeddingExtractor and `params.tks1` holds
    the object unpickled from `params.tok_path`.

    :param params: SentEval parameter object; `ckpt_path` and `tok_path` are read from it
    :param samples: unused here
    :returns: None
    """
    params.extractor = EmbeddingExtractor(
        graph_file='%s.meta' % (params.ckpt_path),
        ckpt_path=params.ckpt_path,
        config=None  # default session configuration
    )

    # load tokenizer from training
    # NOTE(review): dill.load can execute arbitrary code while unpickling;
    # only point tok_path at tokenizer files you trust.
    # The context manager makes sure the file handle is closed after loading.
    with open(params.tok_path, 'rb') as tok_file:
        params.tks1 = dill.load(tok_file)

def batcher(params, batch):
    """
    Turn a batch of tokenized sentences into one vector per sentence.

    `batch` is a python list of sentences, each sentence being a list of
    string tokens. Every sentence is converted to word ids with the tokenizer
    stored in `params.tks1`, embedded with `params.extractor`, and represented
    by the average of its word embeddings.

    Sentences are processed one at a time (batches of size 1), which avoids
    having to deal with masking.

    :param params: SentEval parameter object populated by `prepare`
    :param batch: list of sentences (lists of tokens)
    :returns: array with the per-sentence vectors stacked vertically
    """

    def _sentence_vector(tokens):
        # An empty sentence is replaced by a single dot token
        # (change it to NULL if that suits your model better).
        words = tokens if tokens != [] else ['.']
        # dgm4nlp maps the strings to the ids of the pre-trained vocabulary;
        # position 0 of the tokenizer list is the English one.
        word_ids = params.tks1[0].to_sequences([(' '.join(words))])
        # word embeddings in context: [1, sentence_length, z_dim]
        contextual = params.extractor.get_z_embedding_batch(x_batch=word_ids)
        # average over the words: [1, z_dim]
        pooled = np.mean(contextual, axis=1)
        # NaNs sometimes show up in the vector; replace them when present
        if np.isnan(pooled.sum()):
            pooled = np.nan_to_num(pooled)
        return pooled

    return np.vstack([_sentence_vector(sentence) for sentence in batch])


# Configure the root logger so that progress messages are printed with a timestamp
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # TODO: you have to point to valid paths! Use the pre-trained model linked from the top of this notebook.
    params_senteval.update(
        # path to the SentEval data (note that SentEval adds "downstream" to this path)
        task_path='SentEval-master/data/',
        # path to the computation graph; we use the best model on validation AER
        ckpt_path='ull-practical3-embedalign/model.best.validation.aer.ckpt',
        # path to the tokenizer holding the ids of the trained Europarl data
        # (it is a pickle that depends on dill)
        tok_path='ull-practical3-embedalign/tokenizer.pickle',
        # we use 10 fold cross validation
        kfold=10,
    )
    se = senteval.engine.SE(params_senteval, batcher, prepare)

    # NLP tasks on which the embedding model is evaluated;
    # in (https://arxiv.org/abs/1802.05883) we use the following.
    # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression),
    # but STS14 (semantic textual similarity) is a similar type of semantic task.
    transfer_tasks = [
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC',
        'MRPC', 'SICKEntailment', 'STS14',
    ]
    # A shorter alternative:
    #transfer_tasks = ['SST2', 'TREC',
    #                       'MRPC', 'SICKEntailment', 'STS14']

    # senteval prints the results and returns a dictionary with the scores
    results = se.eval(transfer_tasks)
    print(results)

2018-05-24 16:56:33,435 : ***** Transfer task : MR *****




INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 16:56:34,887 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 16:56:35,497 : Generating sentence embeddings
2018-05-24 16:57:53,109 : Generated sentence embeddings
2018-05-24 16:57:53,109 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-24 16:58:06,416 : Best param found at split 1: l2reg = 8                 with score 64.38
2018-05-24 16:58:20,293 : Best param found at split 2: l2reg = 8                 with score 64.61
2018-05-24 16:58:33,821 : Best param found at split 3: l2reg = 8                 with score 64.34
2018-05-24 16:58:47,346 : Best param found at split 4: l2reg = 8                 with score 64.62
2018-05-24 16:59:01,092 : Best param found at split 5: l2reg = 8                 with score 64.64
2018-05-24 16:59:14,652 : Best param found at split 6: l2reg = 8                 with score 64.75
2018-05-24 16:59:28,445 : Best param found at split 7: l2reg = 8                 with score 64.24
2018

INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:00:11,752 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:00:12,408 : Generating sentence embeddings
2018-05-24 17:00:38,022 : Generated sentence embeddings
2018-05-24 17:00:38,022 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-24 17:00:41,240 : Best param found at split 1: l2reg = 8                 with score 71.39
2018-05-24 17:00:44,568 : Best param found at split 2: l2reg = 8                 with score 70.18
2018-05-24 17:00:47,941 : Best param found at split 3: l2reg = 8                 with score 70.62
2018-05-24 17:00:51,253 : Best param found at split 4: l2reg = 8                 with score 70.71
2018-05-24 17:00:54,599 : Best param found at split 5: l2reg = 4                 with score 70.47
2018-05-24 17:00:57,927 : Best param found at split 6: l2reg = 8                 with score 71.03
2018-05-24 17:01:01,285 : Best param found at split 7: l2reg = 4                 with score 70.15
2018

INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:01:12,909 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:01:13,690 : Generating sentence embeddings
2018-05-24 17:01:34,335 : Generated sentence embeddings
2018-05-24 17:01:34,335 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-24 17:01:51,675 : Best param found at split 1: l2reg = 4                 with score 84.04
2018-05-24 17:02:09,398 : Best param found at split 2: l2reg = 8                 with score 83.87
2018-05-24 17:02:27,528 : Best param found at split 3: l2reg = 4                 with score 83.79
2018-05-24 17:02:45,242 : Best param found at split 4: l2reg = 8                 with score 84.15
2018-05-24 17:03:02,769 : Best param found at split 5: l2reg = 8                 with score 84.0
2018-05-24 17:03:20,640 : Best param found at split 6: l2reg = 8                 with score 83.8
2018-05-24 17:03:38,694 : Best param found at split 7: l2reg = 8                 with score 83.79
2018-0

INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:04:36,244 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:04:36,884 : Generating sentence embeddings
2018-05-24 17:06:01,350 : Generated sentence embeddings
2018-05-24 17:06:01,350 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-24 17:06:12,379 : Best param found at split 1: l2reg = 8                 with score 78.9
2018-05-24 17:06:23,735 : Best param found at split 2: l2reg = 8                 with score 79.1
2018-05-24 17:06:35,123 : Best param found at split 3: l2reg = 8                 with score 79.13
2018-05-24 17:06:46,511 : Best param found at split 4: l2reg = 8                 with score 79.29
2018-05-24 17:06:57,859 : Best param found at split 5: l2reg = 8                 with score 79.27
2018-05-24 17:07:09,157 : Best param found at split 6: l2reg = 8                 with score 79.18
2018-05-24 17:07:20,357 : Best param found at split 7: l2reg = 8                 with score 79.14
2018-0

INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:07:56,442 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:07:57,051 : Computing embedding for test
2018-05-24 17:08:09,095 : Computed test embeddings
2018-05-24 17:08:09,095 : Computing embedding for train
2018-05-24 17:12:22,778 : Computed train embeddings
2018-05-24 17:12:22,778 : Computing embedding for dev
2018-05-24 17:12:28,589 : Computed dev embeddings
2018-05-24 17:12:28,589 : Training sklearn-LogReg with standard validation..
2018-05-24 17:12:55,355 : [('reg:0.25', 64.79), ('reg:0.5', 64.91), ('reg:1', 65.25), ('reg:2', 65.48), ('reg:4', 66.28), ('reg:8', 67.2)]
2018-05-24 17:12:55,355 : Validation : best param found is reg = 8 with score             67.2
2018-05-24 17:12:55,355 : Evaluating...
2018-05-24 17:13:03,108 : 
Dev acc : 67.2 Test acc : 67.11 for             SST Binary classification

2018-05-24 17:13:03,124 : ***** Transfer task : TREC *****




INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:13:04,576 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:13:27,972 : Computed train embeddings
2018-05-24 17:13:29,863 : Computed test embeddings
2018-05-24 17:13:29,863 : Training sklearn-LogReg with 10-fold cross-validation
2018-05-24 17:14:25,242 : [('reg:0.5', 48.53), ('reg:1', 49.21), ('reg:2', 49.84), ('reg:4', 50.41), ('reg:8', 51.21), ('reg:16', 52.42), ('reg:32', 53.54)]
2018-05-24 17:14:25,242 : Cross-validation : best param found is reg = 32             with score 53.54
2018-05-24 17:14:25,242 : Evaluating...
2018-05-24 17:14:26,898 : 
Dev acc : 53.54 Test acc : 57.4             for TREC

2018-05-24 17:14:26,898 : ***** Transfer task : MRPC *****




INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:14:28,476 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:14:29,116 : Computing embedding for test
2018-05-24 17:14:53,954 : Computed test embeddings
2018-05-24 17:14:53,954 : Computing embedding for train
2018-05-24 17:15:53,974 : Computed train embeddings
2018-05-24 17:15:53,990 : Training sklearn-LogReg with 10-fold cross-validation
2018-05-24 17:16:05,831 : [('reg:0.5', 70.51), ('reg:1', 70.24), ('reg:2', 70.37), ('reg:4', 70.42), ('reg:8', 70.54), ('reg:16', 70.64), ('reg:32', 70.34)]
2018-05-24 17:16:05,831 : Cross-validation : best param found is reg = 16             with score 70.64
2018-05-24 17:16:05,831 : Evaluating...
2018-05-24 17:16:06,092 : Dev acc : 70.64 Test acc 70.96; Test F1 80.1 for MRPC.

2018-05-24 17:16:06,092 : ***** Transfer task : SICK-Entailment*****




INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:16:07,616 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:16:08,272 : Computing embedding for test
2018-05-24 17:16:47,533 : Computed test embeddings
2018-05-24 17:16:47,533 : Computing embedding for train
2018-05-24 17:17:25,017 : Computed train embeddings
2018-05-24 17:17:25,017 : Computing embedding for dev
2018-05-24 17:17:29,858 : Computed dev embeddings
2018-05-24 17:17:29,874 : Training sklearn-LogReg with standard validation..
2018-05-24 17:17:32,264 : [('reg:0.25', 71.8), ('reg:0.5', 72.2), ('reg:1', 71.8), ('reg:2', 71.2), ('reg:4', 72.2), ('reg:8', 72.6)]
2018-05-24 17:17:32,264 : Validation : best param found is reg = 8 with score             72.6
2018-05-24 17:17:32,264 : Evaluating...
2018-05-24 17:17:32,889 : 
Dev acc : 72.6 Test acc : 74.75 for                        SICK entailment

2018-05-24 17:17:32,889 : ***** Transfer task : STS14 *****




INFO:tensorflow:Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt


2018-05-24 17:17:34,420 : Restoring parameters from ull-practical3-embedalign/model.best.validation.aer.ckpt
2018-05-24 17:17:38,997 : deft-forum : pearson = 0.3478, spearman = 0.3630
2018-05-24 17:17:42,745 : deft-news : pearson = 0.6220, spearman = 0.5783
2018-05-24 17:17:47,996 : headlines : pearson = 0.5782, spearman = 0.5705
2018-05-24 17:17:54,228 : images : pearson = 0.6578, spearman = 0.6439
2018-05-24 17:17:59,977 : OnWN : pearson = 0.6588, spearman = 0.7165
2018-05-24 17:18:07,090 : tweet-news : pearson = 0.6233, spearman = 0.5527
2018-05-24 17:18:07,090 : ALL (weighted average) : Pearson = 0.5951,             Spearman = 0.5865
2018-05-24 17:18:07,090 : ALL (average) : Pearson = 0.5813,             Spearman = 0.5708



{'MR': {'acc': 64.72, 'ntest': 10662, 'devacc': 64.53, 'ndev': 10662}, 'STS14': {'tweet-news': {'pearson': (0.6233063900230104, 6.00660201616312e-82), 'spearman': SpearmanrResult(correlation=0.552714320253843, pvalue=3.22156860724007e-61), 'nsamples': 750}, 'deft-news': {'pearson': (0.6220449639059311, 1.6081711628959872e-33), 'spearman': SpearmanrResult(correlation=0.578333454614915, pvalue=3.56018796171822e-28), 'nsamples': 300}, 'OnWN': {'pearson': (0.6588460177830425, 1.5005374158495726e-94), 'spearman': SpearmanrResult(correlation=0.7165302641879749, pvalue=4.0384598465861957e-119), 'nsamples': 750}, 'deft-forum': {'pearson': (0.34780782734539145, 3.053266542353856e-14), 'spearman': SpearmanrResult(correlation=0.36295881398339536, pvalue=1.8523021232119112e-15), 'nsamples': 450}, 'all': {'pearson': {'mean': 0.5813380680712301, 'wmean': 0.5951356598291332}, 'spearman': {'mean': 0.5708203751953316, 'wmean': 0.5865477305619364}}, 'headlines': {'pearson': (0.5781995025163489, 4.019972