In [None]:
from dataset_factory import TextDataset, getDataloaders
import seaborn as sns # For class distribution visualization
import matplotlib.pyplot as plt
from file_utils import *
from transformers import BertTokenizer
import torch
from torch.utils.data import random_split # For custom data-sets
import pandas as pd
from datasets import load_dataset

## Stanford Dataset loading

In [None]:
# Export the SNLI training split to a CSV with the columns
# premise, hypothesis, lang_abv and label.
snli_dataset_export = load_dataset('snli', split='train')

# Collect the columns we need; every row is tagged with the language code 'en'.
snli_dict = {
    'premise': snli_dataset_export['premise'],
    'hypothesis': snli_dataset_export['hypothesis'],
    'lang_abv': ['en'] * len(snli_dataset_export),
    'label': snli_dataset_export['label'],
}
snli_df = pd.DataFrame(snli_dict)

# SNLI marks examples that have no gold label with -1. Drop them so the
# exported file only contains rows with a usable class label.
snli_df = snli_df[snli_df['label'] != -1]

# Save to csv (to_csv does not create directories, so ./data must already exist)
snli_df.to_csv('./data/snli_train.csv', index=False)

In [None]:
# # Attempt at creating the dataset rather than loading to CSV

# # Get the Stanford Natural Language Inference Dataset
# snli_dataset = load_dataset('snli', split='train') # Just use the train split; it gives us 550k examples, which is plenty!

# # Tokenize the premises 
# snli_dataset = snli_dataset.map(lambda e: tokenizer(e['premise'], max_length=max_length, truncation=True, padding='max_length'), batched=True)
# # Map the tokenized outputs to prem_id, prem_token_type_ids, and prem_atten_mask
# snli_dataset = snli_dataset.map(lambda e: {'prem_id': e['input_ids'], 'prem_token_type_ids': e['token_type_ids'], 'prem_atten_mask': e['attention_mask']})

# # Tokenize the hypothesis 
# snli_dataset = snli_dataset.map(lambda e: tokenizer(e['hypothesis'], max_length=max_length, truncation=True, padding='max_length'), batched=True)
# # Map the tokenized outputs to hypo_id, hypo_token_type_ids, and hypo_atten_mask
# snli_dataset = snli_dataset.map(lambda e: {'hypo_id': e['input_ids'], 'hypo_token_type_ids': e['token_type_ids'], 'hypo_atten_mask': e['attention_mask']})

# # Set format to match TextDataset - returns a dictionary with the column keys
# snli_dataset.set_format(type='torch', columns=['prem_id', 'hypo_id', 'label'])

In [None]:
# # Statistical analysis on SNLI (length of sentences and number of words per sentence)
# snli_dataset_2 = load_dataset('snli', split='train')
# snli_dataset_2['premise'][0]
# dataset_with_length = snli_dataset_2.map(lambda x: {"length_premise": len(x["premise"]), "length_hypothesis": len(x["hypothesis"])})
# dataset_with_length = dataset_with_length.map(lambda x: {'prem_toks': tokenizer.tokenize(x['premise']), 'hypo_toks': tokenizer.tokenize(x['hypothesis'])})
# dataset_with_length = dataset_with_length.map(lambda x: {'prem_toks_length': len(x['prem_toks']), 'hypo_toks_length': len(x['hypo_toks'])})


In [None]:
# binning = list(range(0, 250, 5))
# plt.hist(dataset_with_length['length_premise'], bins=binning, alpha=0.5)
# plt.hist(dataset_with_length['length_hypothesis'], bins=binning, alpha=0.5)
# plt.title("Premise and Hypothesis lengths from SNLI")
# plt.show()

In [None]:
# binning = list(range(0, 90, 3))
# plt.hist(dataset_with_length['prem_toks_length'], bins=binning, alpha=0.5, label="prem tok count")
# plt.hist(dataset_with_length['hypo_toks_length'], bins=binning, alpha=0.5, label="hypo tok count")
# plt.title("Premise and Hypothesis Token Counts from SNLI")
# plt.legend()
# plt.show()

## Dataloader playground

In [None]:
# Hot-reload dataset_factory: re-run this cell after editing the module to pick up
# the changes without restarting the kernel.
import importlib
import dataset_factory
importlib.reload(dataset_factory)

In [None]:
# Load the experiment configuration from ./default.json.
# NOTE(review): read_file is not defined in this notebook; presumably it is provided by the
# `from file_utils import *` star import and returns the parsed JSON as nested dicts — confirm.
config = read_file("./default.json")

In [None]:
# Show the loaded config as the cell output to check that it was read properly
config

In [None]:
# Test dataloader: build the train/val/test dataloaders from the data file named in the config.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): the literal 512 is passed positionally as the third argument — presumably the
# batch size; confirm against getDataloaders' signature in dataset_factory.
# val_split / test_split are both 0.1 here.
train_dl, val_dl, test_dl = dataset_factory.getDataloaders(config['dataset']['data_file_path'], config['generation']['max_length'], 512, 
                               config['dataset']['num_workers'], tokenizer, val_split=0.1, test_split=0.1)
    

In [None]:
# Size of the training dataloader (presumably the number of batches, if train_dl is a torch DataLoader — TODO confirm)
len(train_dl)

In [None]:
# Smoke-test the training dataloader: for every 100th batch (starting with the very first one)
# print the shape of the premise batch, the first hypothesis and the number of labels.
count = 0
for i, (prem, hypo, label) in enumerate(train_dl):
    if i % 100 != 0:
        continue  # only inspect batches 0, 100, 200, ...
    count += 1  # running tally of inspected batches
    inputs = prem.to('cpu')
    print(inputs.shape) # n_batch_elems, max_length
    print(hypo[0]) # First hypothesis
    print(len(label)) # number of labels

## Tokenizer playground

In [None]:
# Load the pretrained BERT tokenizer (the 'bert-base-uncased' checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# BERT has 2x the vocab size of the COCO dataset
tokenizer.vocab_size 

In [None]:
# Calling the tokenizer directly does everything in one go: tokenizing, converting to ids,
# truncating to max_length and padding up to max_length.
inputs = tokenizer(
    "Hello, my dog is cute [PAD] [PAD] [SEP] he hehe [PAD][PAD][PAD] ",
    max_length=25,
    truncation=True,
    padding='max_length',
)
inputs # This will be a dictionary with 'input_ids', 'token_type_ids', 'attention_mask'

In [None]:
# Tokenizing can happen in two steps: text -> tokens, then tokens -> vocabulary ids
inputs = tokenizer.tokenize("Hello, my dog is cute [PAD] [PAD] [SEP] he hehe [PAD][PAD][PAD] ")
token_ids = tokenizer.convert_tokens_to_ids(inputs)

# Map every id back to its token to show the round trip. This uses the public
# convert_ids_to_tokens API rather than the private _convert_id_to_token helper,
# which is an implementation detail of the tokenizer class.
for token in tokenizer.convert_ids_to_tokens(token_ids):
    print(token)

## Dataset playground

In [None]:
# Hot-reload dataset_factory: re-run this cell after editing the module to pick up
# the changes without restarting the kernel.
import importlib
import dataset_factory
importlib.reload(dataset_factory)

In [None]:
# Read the labelled training CSV and keep only the English rows
# (rows whose 'lang_abv' column is not 'en' are dropped).
csv_file = './data/train.csv'
csv_data = pd.read_csv(csv_file).loc[lambda frame: frame['lang_abv'] == 'en']

In [None]:
# Number of tokens the tokenizer produces for every premise and every hypothesis
prem_lengths = list(map(lambda text: len(tokenizer.tokenize(text)), csv_data['premise']))
hypo_lengths = list(map(lambda text: len(tokenizer.tokenize(text)), csv_data['hypothesis']))

# Overlay both distributions as semi-transparent histograms on shared bin edges (0, 3, ..., 87)
binning = list(range(0, 90, 3))
for lengths, legend_label in ((prem_lengths, "prem tok count"), (hypo_lengths, "hypo tok count")):
    plt.hist(lengths, bins=binning, alpha=0.5, label=legend_label)
plt.title("Premise and Hypothesis Token Counts from My Dear Watson")
plt.legend()
plt.show()

In [None]:
# Load all the training data as all our data (we technically only have train.csv to work with since it has labels)
# TextDataset will automatically only select 'en' English rows
# NOTE(review): the literal 20 is passed positionally as the second argument — presumably the
# maximum token length per sentence; confirm against TextDataset's signature in dataset_factory.
all_data = dataset_factory.TextDataset(csv_file, 20, tokenizer)
len(all_data)

In [None]:
# Split train, val, test (80% / 10% / 10%)
num_train = int(len(all_data) * 0.8)
num_val = int(len(all_data) * 0.1)
# The test split takes whatever is left over. Truncating each share independently with
# int() can lose samples, and random_split raises when the lengths do not add up to
# len(all_data).
num_test = len(all_data) - num_train - num_val

# Sanity check: the three sizes add up to the total number of samples in all_data
print(num_train+num_val+num_test)

# Random split. Re-seeding the global RNG with its own initial seed means that
# re-running this cell reproduces the same split.
torch.manual_seed(torch.initial_seed())
train_dataset, val_dataset, test_dataset = random_split(all_data, (num_train, num_val, num_test))

In [None]:
# Count the number of each class we have in each dataset
def tally_classes(dataset, label_index=2):
    """Count how many samples of each class label a dataset contains.

    Args:
        dataset: Iterable of indexable samples. With the default
            ``label_index`` each sample is expected to be laid out as
            (premise, hypothesis, label).
        label_index: Position of the class label inside a sample.
            Defaults to 2.

    Returns:
        A dict mapping each label to its number of occurrences, with keys
        in first-seen order. An empty dataset yields an empty dict.
    """
    class_count = {}
    for sample in dataset:
        label = sample[label_index]
        # Tensor-like labels (anything exposing .item()) are unwrapped to plain
        # Python scalars. torch tensors hash by identity, so without this every
        # sample would end up under its own key instead of being tallied.
        if hasattr(label, 'item'):
            label = label.item()
        class_count[label] = class_count.get(label, 0) + 1

    return class_count

In [None]:
# Check distribution of random split: one bar chart of per-class sample counts for each
# of the train / val / test subsets. Every chart is titled with the subset it shows so
# the three plots can be told apart.
for split_name, split_dataset in (('train', train_dataset), ('val', val_dataset), ('test', test_dataset)):
    # One-row frame of {label: count}, melted to long form with 'variable' / 'value' columns
    class_counts = pd.DataFrame.from_dict([tally_classes(split_dataset)]).melt()
    ax = sns.barplot(data=class_counts, x="variable", y="value", hue="variable")
    ax.set_title('Hypothesis Type Distribution (' + split_name + ')')
    plt.show()

