In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

In [None]:
from torch.utils.data import Dataset
import pandas as pd
import random

class TrainingData(Dataset):
    """Pre-tokenized text samples read from a CSV file.

    The CSV must contain a ``raw_data_id`` column, which is discarded; the
    first remaining column supplies the texts. The texts are shuffled with a
    fixed seed, cut down to at most ``max_samples`` entries and tokenized once
    in the constructor, so indexing only slices the stored encodings.

    Args:
        path: Location of the CSV file.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``
            whose result exposes ``'input_ids'`` and ``'attention_mask'``.
        max_samples: Maximum number of texts kept after shuffling
            (``None`` keeps all of them).
        max_length: Length passed to the tokenizer for truncation / padding.
        seed: Seed of the shuffle.

    Raises:
        KeyError: If the CSV has no ``raw_data_id`` column.
    """

    def __init__(self, path: str, tokenizer, max_samples: int = 23000,
                 max_length: int = 100, seed: int = 3):
        data_frame = pd.read_csv(path)
        del data_frame['raw_data_id']
        # Only the first remaining column is used as training text.
        self.X = [row[0] for row in data_frame.values.tolist()]

        # A private generator yields the same order as seeding the module-level
        # RNG would, without resetting global random state for the rest of the
        # notebook.
        random.Random(seed).shuffle(self.X)

        self.X = self.X[:max_samples]
        self.X_encoded = tokenizer(self.X, max_length=max_length, truncation=True,
                                   padding="max_length", return_tensors="pt")

        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of texts kept after shuffling and truncation."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
def train(trainingData, model, optim, epochs=9,
          checkpoint_path="GPT2-med-model_state.pt", mask_padding=False):
    """Fine-tune ``model`` on ``trainingData``, checkpointing after every epoch.

    Each batch is moved to the global ``device`` and run through the model
    with the input ids doubling as labels. After every epoch the model's
    ``state_dict`` is written to ``checkpoint_path`` (overwriting the previous
    file) and a sample answer from ``runModel`` is printed.

    Args:
        trainingData: Iterable yielding ``(input_ids, attention_mask)`` batches.
        model: Model whose forward call accepts ``attention_mask`` and
            ``labels`` and returns an object with a ``.loss`` attribute.
        optim: Optimizer over the model's parameters.
        epochs: Number of passes over ``trainingData``.
        checkpoint_path: File the ``state_dict`` is saved to after each epoch.
        mask_padding: When true, label positions whose attention mask is 0 are
            replaced by -100 so they are left out of the loss. Off by default
            to keep the existing behaviour, where padded positions are part
            of the training target.
            NOTE(review): masking padding is the usual practice, but
            ``runModel`` does not pass a custom end token to ``generate``;
            confirm generation does not depend on the model having learned to
            emit padding before enabling this.
    """
    for _ in tqdm.tqdm(range(epochs)):
        for X, a in trainingData:
            X = X.to(device)
            a = a.to(device)
            labels = X.masked_fill(a == 0, -100) if mask_padding else X
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=labels).loss
            # Print the plain number rather than the tensor repr.
            print(loss.item())
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), checkpoint_path)
        print(runModel("What is high blood pressure?"))


In [None]:
def runModel(inp):
    """Generate the model's reply to a single user question.

    The question is wrapped in this notebook's prompt markers
    (``<|user|> ... <|eos|> <|ai|>: ``), encoded with the global ``tokenizer``
    and completed by the global ``model`` on the global ``device`` using beam
    search.

    Generation always runs with the model in evaluation mode; whatever mode
    the model was in beforehand is restored afterwards. This matters because
    this function is called from inside the training loop and after
    ``model.train()``, where dropout would otherwise stay active while
    generating.

    Args:
        inp: The question as plain text.

    Returns:
        The first generated sequence decoded to text with special tokens
        skipped.
    """
    prompt = "<|user|> "+inp+" <|eos|> <|ai|>: "
    encoded = tokenizer(prompt, return_tensors="pt")
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)

    was_training = model.training
    model.eval()
    try:
        output = model.generate(X, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, attention_mask=a)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Pick the compute backend: CUDA when available, otherwise MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Show the selected backend as this cell's output.
device

In [None]:
# Load the stock GPT-2 tokenizer and extend it with the markers used by this notebook.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Register the padding / beginning / end markers as special tokens
# (decode(..., skip_special_tokens=True) in runModel is asked to skip special tokens).
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<|user|>",
                                "eos_token": "<|eod|>"})
# "<|ai|>" and "<|eos|>" appear in the prompt built by runModel; they are added
# as ordinary tokens rather than special ones.
# NOTE(review): "<|eos|>" here is distinct from the "<|eod|>" eos_token above — confirm this is intended.
tokenizer.add_tokens(["<|ai|>"])
tokenizer.add_tokens(["<|eos|>"])

In [None]:

# Start from the pretrained base GPT-2 checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# The tokenizer was extended with new tokens, so resize the embeddings to its current length.
model.resize_token_embeddings(len(tokenizer))

# Optional: load previously saved weights instead of continuing from the base checkpoint.
# NOTE(review): train() saves to "GPT2-med-model_state.pt"; the file name below differs — confirm which one is meant.
# model.load_state_dict(torch.load('GPT2-med-model_state9.pt'))
model = model.to(device)
# Evaluation mode until training starts; being the last expression, its result is the cell's output.
model.eval()

In [None]:

# Tokenize the CSV once and serve it in shuffled batches of 15.
trainingData = DataLoader(
    TrainingData("prepared_generated_data_for_nhs_uk_qa.csv", tokenizer),
    batch_size=15,
    shuffle=True,
)

In [None]:
# Switch the model to training mode before fine-tuning.
model.train()

# Adam over all model parameters with a learning rate of 1e-5.
optim = Adam(model.parameters(), lr=1e-5)

In [None]:

# Run fine-tuning; train() saves a checkpoint and prints a sample answer after every epoch.
print("training ... ")
train(trainingData, model, optim)

In [None]:
# Query the model with one sample question and print the generated text.
print("run model : ")
print(runModel('What are the early symptoms of bronchiolitis?'))