In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM

MODEL_NAME = "google/flan-t5-large"

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME, 
    # torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [2]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2359296 || all params: 785509376 || trainable%: 0.30035236651331837


In [3]:
model = model.to(torch.device('mps'))

In [4]:
def create_dataset(dataset):
    new_dataset = []
    
    sessions = ['session_1', 'session_2', 'session_3']
    for data in dataset:
        for i, session in enumerate(sessions[1:]):
            for phrase_idx in range(1, len(data[session]['dialog']), 2):
                dialog = '\n'.join(data[session]['dialog'][:phrase_idx])
                context = '\n'.join([data[session]['context'] for session in sessions[:i+1]])
                answer = data[session]['dialog'][phrase_idx]
                facts = data[session]['facts']
                new_dataset.append({'dialog': dialog, 'context': context, 'answer': answer, 'facts': facts})
                
    return new_dataset

In [5]:
import json

with open('dialog_dataset.json', 'r') as f:
    dataset = list(json.load(f).values())
    
TEST_SIZE = .1

train_data = create_dataset(dataset[:int(len(dataset)*(1-TEST_SIZE))])
valid_data = create_dataset(dataset[int(len(dataset)*(1-TEST_SIZE)):])

In [6]:
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):

    def __init__(self, data):
        super().__init__()

        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        sample = self.data[index]

        dialog = '\n\nDialog:\n' + sample['dialog'] + '\nbot_1: '
        context = 'Context:\n' + sample['context']
        answer = '<pad>' + sample['answer'][7:]
        facts = '\n\n'.join([f'Facts about {person}:\n' + '\n'.join(facts) 
                             for person, facts in sample['facts'].items()])

        return context, facts, dialog, answer

In [7]:
class Collator:

    def __init__(self, tokenizer, encoder_max_length=1024, decoder_max_length=128):

        self.tokenizer = tokenizer

        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length

    def __call__(self, batch):

        inputs = []
        targets = []

        max_len = 0
        for context, facts, dialog, answer in batch:
            inputs.append(context + '\n\n' + facts + '\n\n' + dialog)
            targets.append(answer)
            
        tokenized_contexts = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.encoder_max_length
        )

        tokenized_responses = self.tokenizer(
            targets,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.decoder_max_length
        )

        tokenized_contexts["decoder_input_ids"] = tokenized_responses["input_ids"][:, :-1]
        tokenized_contexts["decoder_attention_mask"] = tokenized_responses["attention_mask"][:, :-1]

        targets = tokenized_responses["input_ids"][:, 1:]

        return tokenized_contexts, targets

In [8]:
BATCH_SIZE = 1

train_dataset = SequenceDataset(data=train_data)
valid_dataset = SequenceDataset(data=valid_data)

collator = Collator(tokenizer=tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collator, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collator, shuffle=False)

In [9]:
for x, y in train_loader:
    break

In [10]:
criterion = nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=tokenizer.pad_token_id)


optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-4)

In [11]:
from tqdm import tqdm
import numpy as np

def loop(n_epoch, is_train, loader, grad_acum_steps=1):
    
    global_step = 0

    if is_train:
        model.train()
    else:
        model.eval()

    all_predictions = list()
    all_targets = list()

    losses = list()

    progress_bar = tqdm(total=len(loader) // grad_acum_steps if is_train else len(loader), 
                        desc="Train" if is_train else "Valid")

    if is_train:
        model.train()
    else:
        model.eval()

    losses = list()

    for n_step, (batch, targets) in enumerate(loader):
        torch.mps.empty_cache()

        batch = batch.to(model.model.device)
        targets = targets.to(model.model.device)
        
        if is_train:
            logits = model(**batch).logits
        else:
            with torch.inference_mode():
                logits = model(**batch).logits

        loss = criterion(logits.view(-1, logits.size(-1)), targets.contiguous().view(-1))

        losses.append(loss.item())

        if is_train:
            loss.backward()
            if n_step > 0 and n_step % grad_acum_steps == 0:
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=0.7)
                optimizer.step()
                progress_bar.update()
                progress_bar.set_postfix(loss=np.mean(losses[-100:]))
                
#                 if global_step > 0 and global_step % save_each_steps == 0:
#                     torch.save(model.state_dict(), f"./{model_name.split('/')[-1]}_{global_step}_steps_5_cat_lora.pt")

                global_step += 1
        else:
            progress_bar.update()
            progress_bar.set_postfix(loss=np.mean(losses[-100:]))

    progress_bar.close()

    return losses

In [12]:
for n_epoch in range(1):

    train_losses = loop(n_epoch, is_train=True, loader=train_loader, grad_acum_steps=10)
    # valid_losses = loop(n_epoch, is_train=False, loader=valid_loader, grad_acum_steps=10)

    train_mean_loss = np.mean(train_losses)
    # valid_mean_loss = np.mean(valid_losses)

    epoch_message = [
        f"Epoch {n_epoch} done",
        "",
        "Train",
        f"\tLoss: {train_mean_loss:.3f}",
        # "Valid",
        # f"\tLoss: {valid_mean_loss:.3f}",
    ]

    print("\n".join(epoch_message))

    torch.save(model.state_dict(), f"./{'flan'.split('/')[-1]}_{n_epoch}_epoch_5_category_lora.pt")

Train:   0%|          | 0/60 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Train:  67%|██████▋   | 40/60 [42:49<26:08, 78.40s/it, loss=4.02]

KeyboardInterrupt: 

In [13]:
xs = []
ys = []
for x, y in valid_loader:
    xs.append(x)
    ys.append(y)

In [32]:
from peft import PeftModel, PeftConfig

model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME, 
    # torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = PeftModel.from_pretrained(model, './')

ValueError: Can't find config.json at './'

In [27]:
text_data = """Context: 1. The bot_0 mentioned that they swim after school and play Overwatch on PC. 2. The bot_1 mentioned that they are a pizza maker and hobbyist pencil drawer, and also mentioned that they drive to work. 3. The conversation ended with both bots expressing a desire to visit each other's location, with the bot_1 expressing some hesitation about their body shape and the bot_0 expressing a love for spending time at the beach with friends. Additionally, both bots mentioned their families, with bot_0 mentioning that they live with their extended family and bot_1 mentioning that they have 5 brothers. Facts about bot_0: bot_0 is currently renting and is in school bot_0 has mentioned enjoying the weather in Florida Facts about bot_1: bot_1 is thinking about buying a big home to host their brothers. bot_1 is considering buying a home due to its commitment. bot_1 mentions that they have been working on a portrait of their father. bot_1 plans to have the portrait delivered or framed at the pizza place. bot_1 is thinking about a game to play in their free time. Dialog: bot_0: Do you decide what a game you want to play? bot_1: </s>"""
tokenized_contexts = tokenizer(
    text_data,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024
)

# tokenized_contexts["decoder_input_ids"] = tokenized_responses["input_ids"][:, :-1]
# tokenized_contexts["decoder_attention_mask"] = tokenized_responses["attention_mask"][:, :-1]

outputs = model.generate(**tokenized_contexts.to('mps'), max_new_tokens=128, do_sample=True, temperature=.9)
outputs = outputs[0].detach().cpu()
tokenizer.decode(outputs[np.where(outputs >= 0)])

AttributeError: 'collections.OrderedDict' object has no attribute 'generate'

In [14]:
outputs = model.generate(**xs[0].to('mps'), max_new_tokens=128, do_sample=True, temperature=.9)
outputs = outputs[0].detach().cpu()
tokenizer.decode(outputs[np.where(outputs >= 0)])

"<pad> Hmm something big probably. I'd love to be able to host all my brothers. "

In [16]:
tokenizer.decode(ys[0][0])

"Hmm something big probably. I'd love to be able to host all my brothers. </s>"

In [18]:
print(tokenizer.decode(xs[0]['input_ids'][0]))

Context: 1. The bot_0 mentioned that they swim after school and play Overwatch on PC. 2. The bot_1 mentioned that they are a pizza maker and hobbyist pencil drawer, and also mentioned that they drive to work. 3. The conversation ended with both bots expressing a desire to visit each other's location, with the bot_1 expressing some hesitation about their body shape and the bot_0 expressing a love for spending time at the beach with friends. Additionally, both bots mentioned their families, with bot_0 mentioning that they live with their extended family and bot_1 mentioning that they have 5 brothers. Facts about bot_0: bot_0 is currently renting and is in school bot_0 has mentioned enjoying the weather in Florida Facts about bot_1: bot_1 is thinking about buying a big home to host their brothers. bot_1 is considering buying a home due to its commitment. bot_1 mentions that they have been working on a portrait of their father. bot_1 plans to have the portrait delivered or framed at the pi

In [None]:
print(tokenizer.decode(y[0]))