In [None]:
!pip install transformers
!pip install transformers datasets
!pip install accelerate
!pip install -q gradio
!pip install -q git+https://github.com/huggingface/transformers.git

**Loading the dependencies**

In [46]:
import gradio as gr
import transformers as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

#**Fine tuning of pretrained GPT-2 model**

Load the dataset

In [55]:
df = pd.read_json('/content/chat_data.json')

Text extraction from dialog entries

In [56]:
dialog_texts = []
for dialog in df['dialog']:
    for entry in dialog:
        dialog_texts.append(entry['text'])

In [57]:
# Converting the extracted texts into a DataFrame
dialog_df = pd.DataFrame(dialog_texts, columns=['text'])

In [58]:
dataset = Dataset.from_pandas(dialog_df)

**Loading the GPT2 tokenizer**

In [33]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Add padding

In [34]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

**Tokenization function**
Tokenizing the whole dataset

In [35]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3754 [00:00<?, ? examples/s]

Labelling the dataset

In [36]:
def add_labels(examples):
    examples['labels'] = examples['input_ids'].copy()
    return examples

tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)

Map:   0%|          | 0/3754 [00:00<?, ? examples/s]

In [37]:
#Printing tokenized dataset
print(tokenized_datasets)

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3754
})


**GPT-2 Loading**

In [38]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [39]:
# Resize the model embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

**Defining training arguments to model**

In [40]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

***Training Initialization***

In [None]:
trainer = Trainer(model=model,args=training_args,train_dataset=tokenized_datasets)

In [42]:
# Start training
trainer.train()

Step,Training Loss
500,0.0537
1000,0.0541
1500,0.0536


TrainOutput(global_step=1877, training_loss=0.052939242737434236, metrics={'train_runtime': 670.8913, 'train_samples_per_second': 5.596, 'train_steps_per_second': 2.798, 'total_flos': 1235388727296000.0, 'train_loss': 0.052939242737434236, 'epoch': 1.0})

In [43]:
# Save the model
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')

('fine-tuned-gpt2/tokenizer_config.json',
 'fine-tuned-gpt2/special_tokens_map.json',
 'fine-tuned-gpt2/vocab.json',
 'fine-tuned-gpt2/merges.txt',
 'fine-tuned-gpt2/added_tokens.json')

#**Text Generation with Fine-tuned GPT-2**

Using the Gradio API for GUI

***Loading the model and tokenizer***

In [50]:
tokenizer=GPT2Tokenizer.from_pretrained('/content/fine-tuned-gpt2')
model=TFGPT2LMHeadModel.from_pretrained('/content/fine-tuned-gpt2', pad_token_id=tokenizer.eos_token_id)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


*Input text box*

In [53]:
def generate_text(INPUT):
  input_ids=tokenizer.encode(INPUT, return_tensors='tf')
  beam_output=model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
  output_text=tokenizer.decode(beam_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  return ".".join (output_text.split(".")[:-1]) + "."

*Text box for output text*

In [54]:
output_text=gr.Textbox()
gr.Interface(generate_text, "textbox",output_text,title="GPT-2",
             description="This is OpenAI's GPT-2 and I am using the pre-trained model").launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b98237c68948995f22.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


