In [16]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset, interleave_datasets, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, pipeline
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from peft import LoraConfig, TaskType, get_peft_model
# from trl import SFTTrainer, SFTConfig

In [15]:
pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


# Reload the base model

In [11]:
# Base (not fine-tuned) checkpoint name; the tokenizer is loaded from the same checkpoint.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base weights in bfloat16 and move the model to the GPU in one chained call.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
).to('cuda')

# Load the fine-tuned model from your own checkpoint path

In [9]:
# The tokenizer is taken from the base 't5-small' checkpoint, not from the checkpoint path below.
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the fine-tuned weights from "checkpoint-14838" and move them to the GPU.
# NOTE(review): no torch_dtype is passed here, whereas the base model is loaded with
# torch_dtype=torch.bfloat16 - confirm the two models are meant to run in different precisions.
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("checkpoint-14838").to('cuda')

# Reload dataset

In [5]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its 'test' split and the 'question', 'context' and 'answer' columns.
dataset = load_from_disk("../../merged_dataset")

# Zero-Shot Prompt Evaluation

In [6]:
# Sanity check: run the fine-tuned model on one test example and print its
# output next to the human-written reference answer.
# (Alternative example to probe: index = len(dataset['test']) - 200)
index = 0

# Fetch the row once and unpack the three fields used below.
example = dataset['test'][index]
question = example['question']
context = example['context']
answer = example['answer']

# Prompt layout: table schema first, then the question, then an open "Answer:" slot.
prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

# Tokenize the prompt and move the resulting tensors to the GPU.
inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

# Generate up to 200 new tokens, then decode the first (only) returned sequence.
generated_ids = finetuned_model.generate(inputs["input_ids"], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator line made of 99 dashes, printed before each report section.
dash_line = '-' * 99
report_sections = (
    f'INPUT PROMPT:\n{prompt}',
    f'BASELINE HUMAN ANSWER:\n{answer}\n',
    f'FINE-TUNED MODEL - ZERO SHOT:\n{output}',
)
for section in report_sections:
    print(dash_line)
    print(section)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Tables:
CREATE TABLE table_name_11 (date VARCHAR, away_team VARCHAR)

Question:
On what Date did the Away team essendon play?

Answer:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
SELECT date FROM table_name_11 WHERE away_team = "essendon"

---------------------------------------------------------------------------------------------------
FINE-TUNED MODEL - ZERO SHOT:
SELECT date FROM table_name_11 WHERE away_team = "essendon"


In [12]:
# Run both models over the first N_EVAL_SAMPLES test examples and collect their
# answers next to the human-written reference. Only a small slice is used
# because generation is slow.
N_EVAL_SAMPLES = 25

# Slice the split once (instead of once per column) and pull the columns from it.
eval_batch = dataset['test'][0:N_EVAL_SAMPLES]
questions = eval_batch['question']
contexts = eval_batch['context']
human_baseline_answers = eval_batch['answer']

# Loop-invariant generation settings: built once rather than twice per example.
generation_config = GenerationConfig(max_new_tokens=300)

original_model_answers = []
finetuned_model_answers = []

for table_context, question in zip(contexts, questions):

    # Same prompt layout as the single-example check: schema, question, open answer slot.
    prompt = f"""Tables:
{table_context}

Question:
{question}

Answer:
"""

    # NOTE(review): prompts are passed untruncated. The recorded output of this cell
    # shows the tokenizer warning about a 1115-token prompt exceeding its 512-token
    # maximum - confirm this is acceptable for the comparison.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

    # Base model answer.
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_answers.append(original_model_text_output)

    # Fine-tuned model answer for the same input.
    finetuned_model_outputs = finetuned_model.generate(input_ids=input_ids, generation_config=generation_config)
    finetuned_model_text_output = tokenizer.decode(finetuned_model_outputs[0], skip_special_tokens=True)
    finetuned_model_answers.append(finetuned_model_text_output)

# One row per example: (reference, base model answer, fine-tuned model answer).
zipped_summaries = list(zip(human_baseline_answers, original_model_answers, finetuned_model_answers))

df = pd.DataFrame(
    zipped_summaries,
    columns=['human_baseline_answers', 'original_model_answers', 'finetuned_model_answers'],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1115 > 512). Running this sequence through the model will result in indexing errors


In [13]:
# Display the comparison table: human reference vs. base-model vs. fine-tuned-model answers.
df

Unnamed: 0,human_baseline_answers,original_model_answers,finetuned_model_answers
0,SELECT date FROM table_name_11 WHERE away_team...,Question,SELECT date FROM table_name_11 WHERE away_team...
1,SELECT institution FROM table_1974632_1 WHERE ...,"Question: state the institution in glenside, p...",SELECT institution FROM table_1974632_1 WHERE ...
2,SELECT home_team FROM table_name_4 WHERE away_...,True,SELECT home_team FROM table_name_4 WHERE away_...
3,SELECT date FROM table_name_49 WHERE home_team...,Question,SELECT date FROM table_name_49 WHERE home_team...
4,"SELECT ""Character"" FROM table_79388 WHERE ""Dur...",True,"SELECT ""Character"" FROM table_79388 WHERE ""Yea..."
5,SELECT clubs FROM table_name_59 WHERE position...,Question,SELECT clubs FROM table_name_59 WHERE position...
6,SELECT record FROM table_name_72 WHERE date = ...,Question,SELECT record FROM table_name_72 WHERE date = ...
7,SELECT DISTINCT flight.flight_id FROM airport_...,"CREATE TABLE flight_leg ( flight_id int, leg_n...",SELECT DISTINCT flight.flight_id FROM airport_...
8,SELECT season_joined_league FROM table_name_28...,Question,SELECT season_joined_league FROM table_name_28...
9,"SELECT loss FROM table_name_48 WHERE date = ""s...",True,SELECT losing FROM table_name_48 WHERE date = ...


In [20]:
# Score both models' generated answers against the human-written answers with ROUGE.
rouge = evaluate.load('rouge')


def _rouge_against_baseline(predictions):
    """Return aggregated, stemmed ROUGE scores for `predictions` vs. the human answers.

    Only the first len(predictions) reference answers are used, so the
    reference list is never longer than the prediction list.
    """
    return rouge.compute(
        predictions=predictions,
        references=human_baseline_answers[0:len(predictions)],
        use_aggregator=True,
        use_stemmer=True,
    )


original_model_results = _rouge_against_baseline(original_model_answers)
print('ORIGINAL MODEL:')
print(original_model_results)

finetuned_model_results = _rouge_against_baseline(finetuned_model_answers)
print('FINE-TUNED MODEL:')
print(finetuned_model_results)

ORIGINAL MODEL:
{'rouge1': 0.02996415770609319, 'rouge2': 0.005, 'rougeL': 0.030566794240933115, 'rougeLsum': 0.030956459857498675}
FINE-TUNED MODEL:
{'rouge1': 0.9281907395888822, 'rouge2': 0.8917983416395394, 'rougeL': 0.9182615264948171, 'rougeLsum': 0.9191261027066355}


In [19]:
# pip install rouge_score