# Study on eSNLI

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path

import sys
sys.path.append("./../src")

# Shared cache directory: `.cache` one level above the current working directory.
cache_path = os.path.join(os.getcwd(), '..', '.cache')

DATASET_NAME = 'esnli'

# <cache_path>/dataset/<DATASET_NAME>
dataset_path = os.path.join(cache_path, 'dataset', DATASET_NAME)

# Scratch directory for this notebook; note it is relative to the working
# directory, not to `cache_path`. Created up front if it does not exist yet.
tmp_path = os.path.join('.cache', '2022-09-03')
os.makedirs(tmp_path, exist_ok=True)

In [2]:
from data.esnli.dataset import ESNLI

# Raw (untransformed) test split, read from <cache_path>/dataset.
dataset = ESNLI(
    'test',
    root=cache_path+'/dataset',
)
# Bare last expression so the notebook renders the underlying frame.
dataset.data

Unnamed: 0,premise,hypothesis,label,explanation,highlight_premise,highlight_hypothesis
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,neutral,Not all churches have cracks in the ceiling,This church choir sings to the masses as they ...,The church has *cracks* *in* *the* *ceiling.*
1,This church choir sings to the masses as they ...,The church is filled with song.,entailment,"""Filled with song"" is a rephrasing of the ""cho...",This church *choir* *sings* *to* *the* *masses...,The church is *filled* *with* *song.*
2,This church choir sings to the masses as they ...,A choir singing at a baseball game.,contradiction,A choir sing some other songs other than book ...,This church choir sings to the *masses* as the...,A choir *singing* at a *baseball* *game.*
3,"A woman with a green headscarf, blue shirt and...",The woman is young.,neutral,the woman could've been old rather than young,"A woman with a green headscarf, blue shirt and...",The woman is *young.*
4,"A woman with a green headscarf, blue shirt and...",The woman is very happy.,entailment,a grin suggests hapiness.,"A woman with a green headscarf, blue shirt and...",The woman is very *happy.*
...,...,...,...,...,...,...
9819,Two women are observing something together.,Two women are standing with their eyes closed.,contradiction,Answer: The women cannot be observing with eye...,Two women are *observing* something together.,Two women are standing with their *eyes* *clos...
9820,Two women are observing something together.,Two girls are looking at something.,entailment,Observing is a synonym for looking.,Two women are *observing* something together.,Two girls are *looking* at something.
9821,A man in a black leather jacket and a book in ...,A man is flying a kite.,contradiction,A man who speaks in a classroom can not be fly...,A man in a black leather jacket and a book in ...,A man is *flying* *a* *kite.*
9822,A man in a black leather jacket and a book in ...,A man is speaking in a classroom.,entailment,speaks is the same as speaking,A man in a black leather jacket and a book in ...,A man is *speaking* in a classroom.


In [4]:
from data.esnli.pipeline import PretransformedESNLI
from data.esnli.transforms import HighlightTransform, HeuristicTransform
from data.transforms import LemmaLowerTokenizerTransform
from torchtext.vocab import GloVe
import torchtext.transforms as T
import spacy

# One spaCy pipeline instance, shared by the tokenizer and the heuristic transform.
spacy_model = spacy.load('en_core_web_sm')
tokenizer_transform = LemmaLowerTokenizerTransform(spacy_model)

# Highlight columns are tokenized first, then passed through HighlightTransform.
hl_transform = T.Sequential(tokenizer_transform, HighlightTransform())

# GloVe vectors are cached under <cache_path>/models/.vector_cache.
heuristic_transform = HeuristicTransform(
    vectors=GloVe(cache=path.join(cache_path, 'models', '.vector_cache')),
    spacy_model=spacy_model,
)

# Transform applied to each source column (or derived field).
transformations = dict(
    premise=tokenizer_transform,
    hypothesis=tokenizer_transform,
    highlight_premise=hl_transform,
    highlight_hypothesis=hl_transform,
    heuristic=heuristic_transform,
)

# Name under which each transformed result is stored. In the displayed frame
# the 'heuristic' entry shows up as `premise_heuristic` / `hypothesis_heuristic`.
trans_columns = dict(
    premise='premise_tokens',
    hypothesis='hypothesis_tokens',
    highlight_premise='premise_rationale',
    highlight_hypothesis='hypothesis_rationale',
    heuristic='heuristic',
)

testset = PretransformedESNLI(
    transformations,
    trans_columns,
    split='test',
    root=cache_path+'/dataset',
)
testset.data

Unnamed: 0,premise,hypothesis,label,explanation,highlight_premise,highlight_hypothesis,premise_tokens,hypothesis_tokens,premise_rationale,hypothesis_rationale,premise_heuristic,hypothesis_heuristic
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,neutral,Not all churches have cracks in the ceiling,This church choir sings to the masses as they ...,The church has *cracks* *in* *the* *ceiling.*,"[this, church, choir, sing, to, the, masse, as...","[the, church, have, crack, in, the, ceiling, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, True, True, True, ...","[-1.0000000150474662e+30, 3.064525842666626, 1...","[-1.0000000150474662e+30, 7.628961086273193, -..."
1,This church choir sings to the masses as they ...,The church is filled with song.,entailment,"""Filled with song"" is a rephrasing of the ""cho...",This church *choir* *sings* *to* *the* *masses...,The church is *filled* *with* *song.*,"[this, church, choir, sing, to, the, masse, as...","[the, church, be, fill, with, song, .]","[False, False, True, True, True, True, True, F...","[False, False, False, True, True, True, False]","[-1.0000000150474662e+30, 2.79181170463562, 2....","[-1.0000000150474662e+30, 7.628961086273193, -..."
2,This church choir sings to the masses as they ...,A choir singing at a baseball game.,contradiction,A choir sing some other songs other than book ...,This church choir sings to the *masses* as the...,A choir *singing* at a *baseball* *game.*,"[this, church, choir, sing, to, the, masse, as...","[a, choir, singing, at, a, baseball, game, .]","[False, False, False, False, False, False, Tru...","[False, False, True, False, False, True, True,...","[-1.0000000150474662e+30, 2.5598971843719482, ...","[-1.0000000150474662e+30, 6.388305187225342, 6..."
3,"A woman with a green headscarf, blue shirt and...",The woman is young.,neutral,the woman could've been old rather than young,"A woman with a green headscarf, blue shirt and...",The woman is *young.*,"[a, woman, with, a, green, headscarf, ,, blue,...","[the, woman, be, young, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, False]","[-1.0000000150474662e+30, 2.597653388977051, -...","[-1.0000000150474662e+30, 5.648240089416504, -..."
4,"A woman with a green headscarf, blue shirt and...",The woman is very happy.,entailment,a grin suggests hapiness.,"A woman with a green headscarf, blue shirt and...",The woman is very *happy.*,"[a, woman, with, a, green, headscarf, ,, blue,...","[the, woman, be, very, happy, .]","[False, False, False, False, False, False, Fal...","[False, False, False, False, True, False]","[-1.0000000150474662e+30, 2.784580707550049, -...","[-1.0000000150474662e+30, 5.648240089416504, -..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9819,Two women are observing something together.,Two women are standing with their eyes closed.,contradiction,Answer: The women cannot be observing with eye...,Two women are *observing* something together.,Two women are standing with their *eyes* *clos...,"[two, woman, be, observe, something, together, .]","[two, woman, be, stand, with, their, eye, clos...","[False, False, False, True, False, False, False]","[False, False, False, False, False, False, Tru...","[-1.0000000150474662e+30, 3.7125906944274902, ...","[-1.0000000150474662e+30, 2.986828327178955, -..."
9820,Two women are observing something together.,Two girls are looking at something.,entailment,Observing is a synonym for looking.,Two women are *observing* something together.,Two girls are *looking* at something.,"[two, woman, be, observe, something, together, .]","[two, girl, be, look, at, something, .]","[False, False, False, True, False, False, False]","[False, False, False, True, False, False, False]","[-1.0000000150474662e+30, 2.8260209560394287, ...","[-1.0000000150474662e+30, 2.585012912750244, -..."
9821,A man in a black leather jacket and a book in ...,A man is flying a kite.,contradiction,A man who speaks in a classroom can not be fly...,A man in a black leather jacket and a book in ...,A man is *flying* *a* *kite.*,"[a, man, in, a, black, leather, jacket, and, a...","[a, man, be, fly, a, kite, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, True, True, False]","[-1.0000000150474662e+30, 3.1240949630737305, ...","[-1.0000000150474662e+30, 7.3323540687561035, ..."
9822,A man in a black leather jacket and a book in ...,A man is speaking in a classroom.,entailment,speaks is the same as speaking,A man in a black leather jacket and a book in ...,A man is *speaking* in a classroom.,"[a, man, in, a, black, leather, jacket, and, a...","[a, man, be, speak, in, a, classroom, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, False, False, Fals...","[-1.0000000150474662e+30, 3.5768728256225586, ...","[-1.0000000150474662e+30, 7.3323540687561035, ..."


In [8]:
# Sanity check: for both sentences, every row must have as many heuristic
# scores and rationale flags as it has tokens.
for side in ('premise', 'hypothesis'):
    n_tokens = testset.data[f'{side}_tokens'].str.len()

    n_heuristic = testset.data[f'{side}_heuristic'].str.len()
    print('tokens vs heuristic:', (n_tokens == n_heuristic).all())

    n_rationale = testset.data[f'{side}_rationale'].str.len()
    print('tokens vs rationales: ', (n_tokens == n_rationale).all())

tokens vs heuristic: True
tokens vs rationales:  True
tokens vs heuristic: True
tokens vs rationales:  True


## Statistics on tokens

In [9]:
# Training split, built with the same transforms and column mapping as `testset`.
trainset = PretransformedESNLI(
    transformations,
    trans_columns,
    split='train',
    root=cache_path+'/dataset',
)

In [10]:
# Validation split, built with the same transforms and column mapping as `testset`.
valset = PretransformedESNLI(
    transformations,
    trans_columns,
    split='val',
    root=cache_path+'/dataset',
)

In [12]:
import pandas as pd

# Stack train, val and test (in that order) into one frame with a fresh 0..N-1 index.
fulldata = pd.concat(
    [trainset.data, valset.data, testset.data],
    ignore_index=True,
)

In [16]:
# Summary statistics of the number of premise tokens per example (all splits).
(
    fulldata['premise_tokens']
    .str.len()
    .describe()
)

count    569005.000000
mean         14.184944
std           6.107680
min           2.000000
25%          10.000000
50%          13.000000
75%          17.000000
max          82.000000
Name: premise_tokens, dtype: float64

In [17]:
# Summary statistics of the number of hypothesis tokens per example (all splits).
(
    fulldata['hypothesis_tokens']
    .str.len()
    .describe()
)

count    569005.000000
mean          8.277609
std           3.232974
min           1.000000
25%           6.000000
50%           8.000000
75%          10.000000
max          65.000000
Name: hypothesis_tokens, dtype: float64

## Statistics on POS tags

In [14]:
# Rebuild the train and validation splits. These calls repeat, with identical
# arguments, the ones already made in the "tokens" section of this notebook.
trainset = PretransformedESNLI(
    transformations,
    trans_columns,
    split='train',
    root=cache_path+'/dataset',
)
valset = PretransformedESNLI(
    transformations,
    trans_columns,
    split='val',
    root=cache_path+'/dataset',
)

In [None]:
# FIXME(review): incomplete statement — the right-hand side is missing, so this
# cell raises a SyntaxError when executed (it has no execution count).
# NOTE(review): `datast` may be a typo for `dataset`; the intended value cannot
# be determined from this notebook — complete the assignment or remove the cell.
datast = 

In [13]:
# Row-wise concatenation of the token lists: premise tokens followed by
# hypothesis tokens, one combined list per test example.
(
    testset.data['premise_tokens']
    + testset.data['hypothesis_tokens']
)

0       [this, church, choir, sing, to, the, masse, as...
1       [this, church, choir, sing, to, the, masse, as...
2       [this, church, choir, sing, to, the, masse, as...
3       [a, woman, with, a, green, headscarf, ,, blue,...
4       [a, woman, with, a, green, headscarf, ,, blue,...
                              ...                        
9819    [two, woman, be, observe, something, together,...
9820    [two, woman, be, observe, something, together,...
9821    [a, man, in, a, black, leather, jacket, and, a...
9822    [a, man, in, a, black, leather, jacket, and, a...
9823    [a, man, in, a, black, leather, jacket, and, a...
Length: 9824, dtype: object

In [None]:
# Load dataset
# NOTE(review): the original cell used `tokens` and `rationale` without ever
# binding them, so it raised NameError on a fresh kernel. They are built here
# from the test split (premise followed by hypothesis) — confirm this is the
# intended source before relying on the resulting statistics.
tokens = [
    list(premise) + list(hypothesis)
    for premise, hypothesis in zip(testset.data['premise_tokens'],
                                   testset.data['hypothesis_tokens'])
]
rationale = [
    list(premise) + list(hypothesis)
    for premise, hypothesis in zip(testset.data['premise_rationale'],
                                   testset.data['hypothesis_rationale'])
]

# Add punctuation to separate sentences.
# New lists are built instead of appending in place, so re-running the cell
# does not keep adding separators and the frame's own lists are never mutated.
# The separator is flagged False (not 0) to keep the rationale mask boolean.
tokens = [sent + ['.'] for sent in tokens]
rationale = [sent + [False] for sent in rationale]

# Tokens and rationale flags must stay aligned one-to-one.
assert all(len(sent) == len(flags) for sent, flags in zip(tokens, rationale))


# Find postag
from spacy.tokens import Doc
import spacy

flatten_token = [tk for sent in tokens for tk in sent]
flatten_rationale = [r for sent in rationale for r in sent]

nlp = spacy.load('en_core_web_sm')
# Build the Doc from the existing tokens so the pipeline keeps this
# tokenization, giving one POS tag per entry of `flatten_token`.
doc = Doc(nlp.vocab, words=flatten_token)
pos = [tk.pos_ for tk in nlp(doc)]