In [31]:
import csv
import json
import os
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import spacy

In [38]:
# Zip archive that the data-preparation code opens to read the corpus splits.
CORPUS = "snli_1.0.zip"

# Integer code per gold label (presumably applied to `gold_label` in later
# cells; not used in the data-reading section).
MAPPING = {
    "contradiction": -1,
    "neutral": 0,
    "entailment": 1,
}

#### Data Reading / Converting

In [67]:
def get_data(data):
    """Lazily yield ``(sentence1, sentence2, gold_label)`` triples.

    Parameters
    ----------
    data : iterable of mapping
        Records that each provide the keys ``"sentence1"``, ``"sentence2"``
        and ``"gold_label"``.

    Yields
    ------
    tuple
        ``(premise, hypothesis, label)`` for every record whose label is not
        ``"-"``; records labelled ``"-"`` are skipped (presumably the corpus
        marker for "no gold label" -- confirm against the corpus README).
    """
    for record in data:
        premise, hypothesis, gold = (
            record["sentence1"],
            record["sentence2"],
            record["gold_label"],
        )
        if gold == "-":
            continue
        yield premise, hypothesis, gold

In [68]:
def prepare_data(prefix="test", folder="snli_1.0", corpus=None):
    """Extract one corpus split from the zip archive into ``<prefix>.txt``.

    Reads the member ``<folder>/<folder>_<prefix>.jsonl`` from the archive,
    keeps the records yielded by ``get_data`` and writes them as
    tab-separated rows, under a ``sentence1 / sentence2 / gold_label`` header,
    to ``<prefix>.txt`` in the current working directory. An existing file of
    that name is overwritten.

    Parameters
    ----------
    prefix : str
        Split name used in both the archive member name and the output file
        name (the notebook calls this with "test", "dev" and "train").
    folder : str
        Top-level directory inside the archive; also the member-name stem.
    corpus : str or path-like, optional
        Zip archive to read. Defaults to the module-level ``CORPUS``.

    Raises
    ------
    KeyError
        If the archive has no member for the requested split.
    """
    if corpus is None:
        corpus = CORPUS

    # Zip member names always use "/" regardless of platform; os.path.join
    # would build a "\\"-separated name on Windows that is not in the archive.
    member = "/".join([folder, folder + "_" + prefix + ".jsonl"])

    with ZipFile(corpus) as archive:
        # Explicit UTF-8 keeps the output consistent with the decoding below
        # and with pd.read_csv's default; newline="" lets csv control line ends.
        with archive.open(member) as f_in, \
                open(prefix + ".txt", "w", encoding="utf-8", newline="") as f_out:
            # csv.writer quotes any field containing a tab, a double quote or
            # a newline, so every record stays one parseable three-column row.
            writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
            writer.writerow(["sentence1", "sentence2", "gold_label"])
            records = (json.loads(line.decode("utf-8")) for line in f_in)
            writer.writerows(get_data(records))

In [69]:
def read_prepared(prefix, sep="\t"):
    """Load the delimited text file ``<prefix>.txt`` into a DataFrame.

    Parameters
    ----------
    prefix : str
        File name without the ``.txt`` extension (may include a directory).
    sep : str
        Field delimiter; a tab by default.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns taken from its header row.
    """
    # keep_default_na=False: the columns hold free text, so literal sentences
    # such as "n/a", "NA", "None" or "null" must stay strings instead of being
    # silently converted to float NaN by pandas' default NA markers.
    return pd.read_csv(prefix + ".txt", sep=sep, keep_default_na=False)

In [70]:
# Convert each split in turn; every call writes "<prefix>.txt" to the working
# directory (test.txt, dev.txt, train.txt).
for prefix in ("test", "dev", "train"):
    prepare_data(prefix=prefix)

In [71]:
# Load the three prepared splits back as DataFrames (read order: dev, test, train).
df_dev, df_test, df_train = [
    read_prepared(split) for split in ("dev", "test", "train")
]

In [63]:
# (rows, columns) of each split, in the order train, test, dev.
# NOTE(review): three outputs are recorded for three bare expressions, so
# presumably InteractiveShell.ast_node_interactivity is set to "all" somewhere
# not shown here -- confirm, otherwise only the last line is displayed.
# NOTE(review): this cell's execution count (63) precedes the data-prep cells
# (67-71), so the recorded shapes may predate the current "-" label filter;
# re-run after preparing the data to refresh them.
df_train.shape
df_test.shape
df_dev.shape

(550152, 3)

(10000, 3)

(10000, 3)

In [75]:
# Class balance check: share of each gold label per split (train, test, dev).
# normalize=True reports proportions rather than raw counts.
df_train.gold_label.value_counts(normalize=True)
df_test.gold_label.value_counts(normalize=True)
df_dev.gold_label.value_counts(normalize=True)

entailment       0.333868
contradiction    0.333451
neutral          0.332681
Name: gold_label, dtype: float64

entailment       0.342834
contradiction    0.329499
neutral          0.327667
Name: gold_label, dtype: float64

entailment       0.338244
contradiction    0.333062
neutral          0.328693
Name: gold_label, dtype: float64

#### Feature Building