In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_cosine_schedule_with_warmup, GPT2ForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import json
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [10]:
MODEL_NAME = 'gpt2-medium'

In [4]:
# Set hyperparameters
learning_rate = 1e-5
epochs = 5
dropout = 0.2
batch_size = 1

In [11]:
# Load fine-tuning model
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)

In [6]:
# Define the optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=-1, last_epoch=-1)

In [7]:
class ChatDataset(Dataset):
    def __init__(self, filename, augmentation=True):
        with open(filename, 'r') as f:
            raw_data = json.load(f)
            
        if augmentation:
            with open('data/filter_gen_dataset_mhy_math.json', 'r') as f1:
                math_data = json.load(f1)
            with open('data/filter_gen_dataset_mhy_openbookqa.json', 'r') as f2:
                code_data = json.load(f2) 
            raw_data.extend(math_data)
            raw_data.extend(code_data)
            self.data = raw_data
        else:
            self.data = raw_data
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        chat = self.data[idx]['chat']
        instruction, demonstration = chat.rsplit('Assistant: ', 1)
        instruction = instruction + 'Assistant: '
            
        return instruction, demonstration, chat

In [8]:
# Load the dataset
train_dataset = ChatDataset('data/filter_gen_dataset_mhy_train.json', augmentation=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
eval_dataset = ChatDataset('data/filter_gen_dataset_mhy_val.json', augmentation=False)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

In [9]:
# model validation
def evaluate(eval_loader, model, device, chat_max_length):
    model.eval()
    eval_loss_sum = 0
    num_eval_batches = 0
    
    with torch.no_grad():
        for step, batch in enumerate(eval_loader):
            
            # Tokenize the chat
            chat = tokenizer(batch[2], return_tensors='pt', max_length=chat_max_length, padding="max_length", truncation=True).input_ids.to(device) 

            # Count the length of instruction and demonstration
            instruction = batch[0]
            demonstration = batch[1]
            instruction_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in instruction] 
            demonstration_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in demonstration]
        
            # Add the eos_token to the end of demonstration if the length of input is less than 1024
            demonstration_tokens_num = [demonstration_tokens_num[0] if instruction_tokens_num[0] + demonstration_tokens_num[0] == chat_max_length else demonstration_tokens_num[0]+1] 

            # Get the input_ids and target_ids
            input_ids = chat
            target_ids = input_ids.clone() 

            # Forward pass through the model
            outputs = model(input_ids, labels=target_ids)
            logits = outputs.logits

            # Only consider the loss for the demonstration part
            logits_demo = logits[:, instruction_tokens_num[0]-1:instruction_tokens_num[0]+demonstration_tokens_num[0]-1, :]         # [batch size, demonstration_max_length, number of classes]
            target_ids_demo = target_ids[:, instruction_tokens_num[0]:instruction_tokens_num[0]+demonstration_tokens_num[0]]         # [batch size, demonstration_max_length]
            batch_loss = torch.nn.functional.cross_entropy(logits_demo.permute(0, 2, 1), target_ids_demo) 

            eval_loss_sum += batch_loss.item()
            num_eval_batches = step+1
            
    return eval_loss_sum / num_eval_batches

In [None]:
# model training with augmentation data
best_eval_loss = 100
save_path = "./models/sft_model/sft_model_gpt2_medium_with_augmentation_data"

model.train()
model.zero_grad()


for epoch in range(epochs):
    print(f'epoch {epoch+1}')
    epoch_loss = 0
    num_batches = 0

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Tokenize the chat
        chat = tokenizer(batch[2], return_tensors='pt', max_length=1024, padding="max_length", truncation=True).input_ids.to(device) 

        # Count the length of instruction and demonstration
        instruction = batch[0]
        demonstration = batch[1]
        instruction_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in instruction] 
        demonstration_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in demonstration]

        # Add the eos_token to the end of demonstration if the length of input is less than 1024
        demonstration_tokens_num = [demonstration_tokens_num[0] if instruction_tokens_num[0] + demonstration_tokens_num[0] == 1024 else demonstration_tokens_num[0]+1] 

        # Get the input_ids and target_ids
        input_ids = chat
        target_ids = input_ids.clone() 

        # Forward pass through the model
        outputs = model(input_ids, labels=target_ids)
        logits = outputs.logits

        # Only consider the loss for the demonstration part
        logits_demo = logits[:, instruction_tokens_num[0]-1:instruction_tokens_num[0]+demonstration_tokens_num[0]-1, :]         # [batch size, demonstration_max_length, number of classes]
        target_ids_demo = target_ids[:, instruction_tokens_num[0]:instruction_tokens_num[0]+demonstration_tokens_num[0]]         # [batch size, demonstration_max_length]
        batch_loss = torch.nn.functional.cross_entropy(logits_demo.permute(0, 2, 1), target_ids_demo)
        
        # Backward and optimize
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        
        if step % 1000 == 0:
            print(f'At step {step}, the loss = {batch_loss.item()}') 
            eval_loss = evaluate(eval_loader, model, device, 1024)  
            print(f'Validation Loss: {eval_loss}')

        epoch_loss += batch_loss.item()
        num_batches = step+1
        
    train_loss = epoch_loss / num_batches
    eval_loss = evaluate(eval_loader, model, device, 1024)    
    print(f'Epoch: {epoch+1} | Training Loss: {train_loss} | Validation Loss: {eval_loss}')
    
    
    if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print("Model Saved!")    

epoch 1
At step 0, the loss = 3.0088770389556885
Validation Loss: 2.040975540701287
At step 1000, the loss = 1.4540754556655884
Validation Loss: 1.5534521528726528
At step 2000, the loss = 1.1866689920425415
Validation Loss: 1.5092301566053081
At step 3000, the loss = 2.151099681854248
Validation Loss: 1.49071538261139
At step 4000, the loss = 2.1141738891601562
Validation Loss: 1.50893768490337
At step 5000, the loss = 1.6128333806991577
Validation Loss: 1.5034816463693503
Epoch: 0 | Training Loss: 1.4962821966993343 | Validation Loss: 1.5088125083367883
Model Saved!
epoch 2
At step 0, the loss = 2.141450881958008
Validation Loss: 1.5080248972074897
At step 1000, the loss = 1.3257321119308472
Validation Loss: 1.4496042802678168
At step 2000, the loss = 0.9988695979118347
Validation Loss: 1.445424346922493
At step 3000, the loss = 1.8954966068267822
Validation Loss: 1.437532641747627
At step 4000, the loss = 1.9612300395965576
Validation Loss: 1.4581819601295698
At step 5000, the loss 

KeyboardInterrupt: 

In [10]:
# model training with original data
best_eval_loss = 100
save_path = "./models/sft_model/sft_model_gpt2_medium_with_original_data"

model.train()
model.zero_grad()


for epoch in range(epochs):
    print(f'epoch {epoch+1}')
    epoch_loss = 0
    num_batches = 0

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Tokenize the chat
        chat = tokenizer(batch[2], return_tensors='pt', max_length=1024, padding="max_length", truncation=True).input_ids.to(device) 

        # Count the length of instruction and demonstration
        instruction = batch[0]
        demonstration = batch[1]
        instruction_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in instruction] 
        demonstration_tokens_num = [tokenizer(item, return_tensors='pt').input_ids.size(1) for item in demonstration]

        # Add the eos_token to the end of demonstration if the length of input is less than 1024
        demonstration_tokens_num = [demonstration_tokens_num[0] if instruction_tokens_num[0] + demonstration_tokens_num[0] == 1024 else demonstration_tokens_num[0]+1] 

        # Get the input_ids and target_ids
        input_ids = chat
        target_ids = input_ids.clone() 

        # Forward pass through the model
        outputs = model(input_ids, labels=target_ids)
        logits = outputs.logits

        # Only consider the loss for the demonstration part
        logits_demo = logits[:, instruction_tokens_num[0]-1:instruction_tokens_num[0]+demonstration_tokens_num[0]-1, :]         # [batch size, demonstration_max_length, number of classes]
        target_ids_demo = target_ids[:, instruction_tokens_num[0]:instruction_tokens_num[0]+demonstration_tokens_num[0]]         # [batch size, demonstration_max_length]
        batch_loss = torch.nn.functional.cross_entropy(logits_demo.permute(0, 2, 1), target_ids_demo)
        
        # Backward and optimize
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        
        if step % 1000 == 0:
            print(f'At step {step}, the loss = {batch_loss.item()}') 
            eval_loss = evaluate(eval_loader, model, device, 1024)  
            print(f'Validation Loss: {eval_loss}')

        epoch_loss += batch_loss.item()
        num_batches = step+1
        
    train_loss = epoch_loss / num_batches
    eval_loss = evaluate(eval_loader, model, device, 1024)    
    print(f'Epoch: {epoch+1} | Training Loss: {train_loss} | Validation Loss: {eval_loss}')
    
    
    if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print("Model Saved!")    

epoch 1
At step 0, the loss = 3.2306623458862305
Validation Loss: 2.041643994535318
At step 1000, the loss = 1.4558309316635132
Validation Loss: 1.5535552163732038
At step 2000, the loss = 1.1876931190490723
Validation Loss: 1.5091536274414015
At step 3000, the loss = 2.1515047550201416
Validation Loss: 1.4901192441454636
Epoch: 1 | Training Loss: 1.5541013350511268 | Validation Loss: 1.4693898568247237
Model Saved!
epoch 2
At step 0, the loss = 2.035090446472168
Validation Loss: 1.4691945902224013
At step 1000, the loss = 1.341347336769104
Validation Loss: 1.4582348213986343
At step 2000, the loss = 0.9976373314857483
Validation Loss: 1.4565111613656219
At step 3000, the loss = 1.885643720626831
Validation Loss: 1.4460413482715404
Epoch: 2 | Training Loss: 1.3126501225762188 | Validation Loss: 1.4367622082377756
Model Saved!
epoch 3
At step 0, the loss = 1.7367817163467407
Validation Loss: 1.4365980436879875
At step 1000, the loss = 1.2355231046676636
Validation Loss: 1.45321293462729

In [None]:
torch.cuda.empty_cache()

In [3]:
MODEL_PATH = "./models/sft_model/sft_model_gpt2_medium_with_augmentation_data"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH).to(device)


In [13]:
# Set the model to evaluation mode
model.eval()

# Define the input question
question = "why the earth is round? \n\nAssistant: "
input_ids = tokenizer.encode(question, return_tensors='pt').to(device)

# Generate the attention mask
attention_mask = torch.ones_like(input_ids).to(device)

# Generate the answer
output = model.generate(input_ids=input_ids, attention_mask = attention_mask, max_length=100, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, early_stopping=True)

# Decode and print the response
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Response:", response)

# for i, beam in enumerate(output):
#     print(f"{i}: {tokenizer.decode(beam, skip_special_tokens=True)}")
#     print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response: why the earth is round? 

Assistant:  I don't know, but I think it's because of the way the sun and the moon move around. It's not because the Earth is flat. 


In [6]:
# Set the model to evaluation mode
model.eval()

# Define the input question
question = "User: The goal of the 4 following questions is to prove that the methods map and mapTr are equivalent. The former is the version seen in class and is specified by the lemmas MapNil and MapCons. The later version is a tail-recursive version and is specified by the lemmas MapTrNil and MapTrCons. All lemmas on this page hold for all x: Int, y: Int, xs: List[Int], ys: List[Int], l: List [Int] and f: Int => Int. Given the following lemmas: \n\n(MapNil) Nil.map(f) === Nil (MapCons) (x :: xs).map(f) === f(x) :: xs.map(f) (MapTrNil) Nil.mapTr(f, ys) === ys (MapTrCons) (x :: xs).mapTr(f, ys) === xs.mapTr(f, ys ++ (f(x) :: Nil)) (NilAppend) Nil ++ xs === xs (ConsAppend) (x :: xs) ++ ys === x :: (xs ++ ys) Let us first prove the following lemma: (AccOut) l.mapTr(f, y :: ys) === y :: l.mapTr(f, ys) We prove it by induction on l. Induction step: l is x :: xs. Therefore, we need to prove: (x :: xs).mapTr(f, y :: ys) === y :: (x :: xs).mapTr(f, ys). We name the induction hypothesis IH. \n\nWhat exact sequence of lemmas should we apply to rewrite the left hand-side ((x :: xs).mapTr(f, y:: ys)) to the right hand-side (y :: (x :: xs).mapTr(f, ys))? \n\nAssistant:" 

input_ids = tokenizer.encode(question, return_tensors='pt').to(device)

# Generate the attention mask
attention_mask = torch.ones_like(input_ids).to(device)

# Generate the answer
output = model.generate(input_ids=input_ids, attention_mask = attention_mask, max_length=500, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, early_stopping=True)

# Decode and print the response
# response = tokenizer.decode(output[0], skip_special_tokens=True)
# print("Response:", response)

for i, beam in enumerate(output):
    print(f"{i}: {tokenizer.decode(beam, skip_special_tokens=True)}")
    print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: User: The goal of the 4 following questions is to prove that the methods map and mapTr are equivalent. The former is the version seen in class and is specified by the lemmas MapNil and MapCons. The later version is a tail-recursive version and is specified by the lemmas MapTrNil and MapTrCons. All lemmas on this page hold for all x: Int, y: Int, xs: List[Int], ys: List[Int], l: List [Int] and f: Int => Int. Given the following lemmas: 

(MapNil) Nil.map(f) === Nil (MapCons) (x :: xs).map(f) === f(x) :: xs.map(f) (MapTrNil) Nil.mapTr(f, ys) === ys (MapTrCons) (x :: xs).mapTr(f, ys) === xs.mapTr(f, ys ++ (f(x) :: Nil)) (NilAppend) Nil ++ xs === xs (ConsAppend) (x :: xs) ++ ys === x :: (xs ++ ys) Let us first prove the following lemma: (AccOut) l.mapTr(f, y :: ys) === y :: l.mapTr(f, ys) We prove it by induction on l. Induction step: l is x :: xs. Therefore, we need to prove: (x :: xs).mapTr(f, y :: ys) === y :: (x :: xs).mapTr(f, ys). We name the induction hypothesis IH. 

What exact 