<a href="https://colab.research.google.com/github/Indukurivigneshvarma/Deep_Learning/blob/main/NLP/GPT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets transformers --quiet
from datasets import load_dataset


In [12]:
dataset = load_dataset("blended_skill_talk")

print(dataset)
print(dataset["train"][0])

from IPython.display import clear_output
clear_output(wait=False)

In [13]:
def preprocess_function(example):
    dialogues = []
    for conv in example["previous_utterance"]:
        conversation = " ".join(conv)
        dialogues.append(conversation)
    return {"text": dialogues}

processed_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print(processed_dataset)
from IPython.display import clear_output
clear_output(wait=False)

In [14]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    result = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")

from IPython.display import clear_output
clear_output(wait=False)

In [15]:
from torch.utils.data import random_split

train_size = int(0.9 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size
train_dataset, val_dataset = random_split(tokenized_dataset, [train_size, val_size])


In [16]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

from IPython.display import clear_output
clear_output(wait=False)

In [17]:
from transformers import Trainer, TrainingArguments
import os

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./gpt2-chatbot",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [18]:
trainer.train()
from IPython.display import clear_output
clear_output(wait=False)


In [19]:
model.to("cuda")

prompt = "User: Hello, how are you?\nBot:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_length=80,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

User: Hello, how are you?
Bot: I am in the process of getting a new job.
