In [116]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
load_dataset = False
load_model = False

# Dataset preparing

In [118]:
from dataset.fact_dataset_generator import FactDatasetGenerator
import numpy as np
import sys
import pandas as pd

In [119]:
true_dist_size = 1000
alpha = 1
dataset = FactDatasetGenerator(number_person=100,  distribution="zipf", dataset_folder='./dataset/data/', food_list_name="food_list_small.txt",true_dist_size=true_dist_size, experiment_path="experiment/small_dataset/data/")

In [120]:
if load_dataset:
    dataset.load_dataset()
    true_dist = dataset.true_dist 
    training_data = dataset.training_data
else:
    # Generate all possible facts
    temp = dataset.generate_all_possibilities()
    # Sample true dist (zipf)
    true_dist = dataset.generate_true_dist(alpha=alpha)
    # Sample training data uniformly, %80 of true dist
    training_dataset_size = int(0.8 * true_dist_size)
    training_data = dataset.sample_training_data(training_dataset_size,true_dist.tolist())
    print(dataset.vocab_size)
    print(len(temp))

202
10100


In [121]:
true_dist_df = pd.DataFrame(true_dist,columns=["facts"])

In [122]:
true_dist_df

Unnamed: 0,facts
0,"Bendite,beef tartare"
1,"El,carrot cake"
2,"Shaina,gyoza"
3,"Rossy,samosa"
4,"Shaun,club sandwich"
...,...
995,"Flore,pad thai"
996,"Violetta,club sandwich"
997,"Mirilla,donuts"
998,"Farly,ravioli"


In [123]:
true_duplicates_count = true_dist_df.groupby(list(true_dist_df.columns)).size().reset_index(name='count_true')


In [124]:
true_duplicates_count

Unnamed: 0,facts,count_true
0,"Albertine,apple pie",1
1,"Albertine,beef tartare",1
2,"Albertine,french fries",1
3,"Albertine,ice cream",1
4,"Albertine,spaghetti carbonara",1
...,...,...
507,"Winfield,tiramisu",1
508,"Xenos,baklava",1
509,"Xenos,french toast",3
510,"Zeb,cheese plate",1


In [125]:
training_dist_df = pd.DataFrame(training_data,columns=["facts"])
training_duplicates_count = training_dist_df.groupby(list(training_dist_df.columns)).size().reset_index(name='count_train')

In [126]:
training_duplicates_count

Unnamed: 0,facts,count_train
0,"Albertine,french fries",1
1,"Albertine,ice cream",1
2,"Albertine,spaghetti carbonara",1
3,"Archibald,omelette",2
4,"Archibald,pizza",1
...,...,...
418,"Winfield,seaweed salad",1
419,"Winfield,tiramisu",1
420,"Xenos,baklava",1
421,"Xenos,french toast",3


In [127]:
import torch
## get the training  datasets
train_dataset = [torch.tensor(x, dtype=torch.long) for x in dataset.tokenized_training_data]


In [128]:
from torch.utils.data import Dataset

## create a dataset class
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assuming each item in data is a sequence and we use the same sequence shifted by one as the target
        x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
        y = torch.tensor(self.data[idx][1:], dtype=torch.long)
        #print(x)
        #print(y)
        
        return x, y


## create the datasets
train_data = MyDataset(train_dataset)
# test_data = MyDataset(test_dataset)

print(train_data)

<__main__.MyDataset object at 0x0000028A04E41CD0>


# Model preparing

In [129]:
## import mingpt
sys.path.append('minGPT/')
from mingpt.model import GPT
from mingpt.utils import set_seed
set_seed(42)

model_config = GPT.get_default_config()
model_config.n_layer=12
model_config.n_head=8
model_config.n_embd=512
model_config.vocab_size = dataset.vocab_size
model_config.model_type = None
model_config.block_size = 2

# model_config.model_type = 'gpt-nano'
# model_config.vocab_size = dataset.vocab_size
# model_config.block_size = 2

model = GPT(model_config).to("cuda")

number of parameters: 37.93M


In [130]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-5 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 20000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_data)

running on device cuda


# Training the model

In [None]:
best_iter = 100000000000000
best_epoch = 0
def batch_end_callback(trainer):
    global best_iter
    global best_epoch
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 100:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
        if trainer.loss.item() < best_iter:
            best_iter = trainer.loss.item()
            best_epoch = trainer.iter_num
            torch.save(model.state_dict(), dataset.experiment_path[:-5] + "model.pt")
trainer.set_callback('on_batch_end', batch_end_callback)

if load_model:
    model.load_state_dict(torch.load(dataset.experiment_path[:-5]+ "model.pt"))
else:
    trainer.run()
    print(f"Best loss is: {best_iter} on epoch: {best_epoch}")

iter_dt 0.00ms; iter 0: train loss 5.48713


  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


iter_dt 3.10ms; iter 100: train loss 3.05125
iter_dt 3.70ms; iter 200: train loss 2.85718
iter_dt 3.25ms; iter 300: train loss 3.07995
iter_dt 2.90ms; iter 400: train loss 3.01853
iter_dt 2.65ms; iter 500: train loss 2.74514
iter_dt 2.60ms; iter 600: train loss 2.68592
iter_dt 3.05ms; iter 700: train loss 2.94540
iter_dt 2.60ms; iter 800: train loss 2.76482
iter_dt 2.70ms; iter 900: train loss 2.79193
iter_dt 2.80ms; iter 1000: train loss 2.68514
iter_dt 2.60ms; iter 1100: train loss 2.76983
iter_dt 3.05ms; iter 1200: train loss 2.70202
iter_dt 2.65ms; iter 1300: train loss 2.86340
iter_dt 2.65ms; iter 1400: train loss 2.64397
iter_dt 2.65ms; iter 1500: train loss 2.65226
iter_dt 2.75ms; iter 1600: train loss 2.56045
iter_dt 2.65ms; iter 1700: train loss 2.89930
iter_dt 2.70ms; iter 1800: train loss 2.83216
iter_dt 3.15ms; iter 1900: train loss 2.72204
iter_dt 2.70ms; iter 2000: train loss 2.90701
iter_dt 2.95ms; iter 2100: train loss 2.62299
iter_dt 2.65ms; iter 2200: train loss 2.728

In [None]:
# now let's perform some evaluation
model.eval()

# Generate unconditioned facts

In [None]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [None]:
n_sequences = 1000
from collections import defaultdict
collected_generations = []

for _ in range(n_sequences):
    x = torch.Tensor([0]).unsqueeze(0).long().to("cuda")
    y_gen = model.generate(x, 2, do_sample=True)
    name = food_item = dataset.decode([y_gen[0][1]])[0]
    food_item = dataset.decode([y_gen[0][2]])[0]
    collected_generations.append(f"{name},{food_item}")

In [None]:
collected_generations_df = pd.DataFrame(collected_generations, columns=["facts"])

In [None]:
collected_generations_counts = collected_generations_df.groupby(list(collected_generations_df.columns)).size().reset_index(name='count_generated')


In [None]:
collected_generations_counts

In [None]:
# Merge true dist and training dist dataframes, outer is used to include data that is not in training data as well
merged_df = pd.merge(true_duplicates_count, training_duplicates_count, on='facts', how='outer')

In [None]:
# Add generated_df to true and training dfs 
# outer can be used to include all facts in true dist
# inner can be used to only show the comparison of generated facts
comparison_df = pd.merge(merged_df, collected_generations_counts, on='facts', how='outer')

In [None]:
comparison_df

In [None]:
# Fill in 0 for facts that not appear
comparison_df = comparison_df.fillna(0)

In [None]:
# Normalize the counts by length
comparison_df["count_generated"] = comparison_df['count_generated']/len(collected_generations)
comparison_df["count_train"] = comparison_df['count_train']/len(training_data)
comparison_df["count_true"] = comparison_df['count_true']/len(true_dist)


In [None]:
comparison_df = comparison_df.sort_values(by=['count_generated'], ascending=False)

In [None]:
comparison_df

In [None]:
comparison_df.sum()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Only the top 100 facts graphed for visibility 
comparison_df[:100].plot.bar(figsize=(16, 4))

## Hallucination rates

In [None]:
# True hallucination rate (generations not in true dist)
true_hallucinations = pd.merge(collected_generations_counts, true_duplicates_count, on='facts', how='left')


In [None]:
true_hallucinations

In [None]:
true_hallucinations = true_hallucinations.fillna(0)
number_of_true_hallucinations =true_hallucinations["count_true"].value_counts()[0]
true_hallucinations_rate = number_of_true_hallucinations / len(collected_generations)
print(f"Rate of true hallucinations: {true_hallucinations_rate} ")

In [None]:
# Naive hallucination rate (every generation not in training data)
naive_hallucinations = pd.merge(collected_generations_counts, training_duplicates_count, on='facts', how='left')

In [None]:
naive_hallucinations

In [None]:
naive_hallucinations = naive_hallucinations.fillna(0)
number_of_naive_hallucinations = naive_hallucinations["count_train"].value_counts()[0]
naive_hallucinations_rate = number_of_naive_hallucinations / len(collected_generations)
print(f"Rate of naive hallucinations: {naive_hallucinations_rate} ")

### Monofact rate

In [None]:
MF = training_duplicates_count["count_train"].value_counts()[1] / len(training_data)
MF

In [None]:
training_duplicates_count["count_train"].value_counts()[1]

### Miscalibration

In [None]:
from lib.calibration import miscalibration

In [None]:
comparison_sorted_by_generated = comparison_df.sort_values(by='count_generated', ascending=False)

In [None]:
miscalibration_rate = miscalibration(comparison_sorted_by_generated['count_generated'], comparison_sorted_by_generated['count_true'])
miscalibration_rate

In [None]:
miscalibration(comparison_sorted_by_generated['count_generated'], comparison_sorted_by_generated['count_train'])

### Check if it holds

In [None]:
unique_names = len(set([t[1] for t in train_dataset]))
unique_foods = len(set([t[2] for t in train_dataset]))
# Possible generations
POSS_GENERATIONS = unique_names * unique_foods

# Facts to all possibilities - facts, approximated
APPROX_FACTS_TO_POSSIBLE_HALLUCINATIONS = 300 * len(training_duplicates_count) / (POSS_GENERATIONS - len(training_duplicates_count))

In [None]:
HALLUCINATION_RATE = true_hallucinations_rate

#MF = 0.43875

MISCALIBRATION = miscalibration_rate

In [None]:
MF

In [None]:
MISCALIBRATION

In [None]:
APPROX_FACTS_TO_POSSIBLE_HALLUCINATIONS

In [None]:
MF - MISCALIBRATION

In [None]:
7 / np.sqrt(len(training_data))

In [None]:
estimated_hallucination_rate = MF - MISCALIBRATION - (7 / np.sqrt(len(training_data))) - APPROX_FACTS_TO_POSSIBLE_HALLUCINATIONS
estimated_hallucination_rate

In [None]:
HALLUCINATION_RATE

In [None]:
HALLUCINATION_RATE > MF - MISCALIBRATION

In [None]:
HALLUCINATION_RATE > MF - MISCALIBRATION - (7 / np.sqrt(len(training_data))) - APPROX_FACTS_TO_POSSIBLE_HALLUCINATIONS

In [None]:
import json
def save_results():
    experiment = {}
    experiment['number of person'] = dataset.number_person
    experiment['food_list'] = dataset.food_list_name
    experiment['true_dist_size'] = dataset.true_dist_size
    experiment['training_set_size'] = len(training_data)
    experiment['zipf_alpha'] = alpha
    experiment['monofact_rate'] = MF
    experiment['miscalibration_rate'] = MISCALIBRATION
    experiment['facts_to_possible_hallucinations_ratio'] = APPROX_FACTS_TO_POSSIBLE_HALLUCINATIONS
    experiment['estimated_hallucinations_rate'] = estimated_hallucination_rate
    experiment['naive_hallucinations_rate'] = naive_hallucinations_rate
    experiment['true_hallucinations_rate'] = true_hallucinations_rate
    
    json_str = json.dumps(experiment)
    with open('experiments.json', 'a') as file:
        file.write(json_str + '\n')
        
    

In [None]:
save_results()