<a href="https://colab.research.google.com/github/Mariam-Ghamgui/Domain-Specific-GPT-2-Text-Generator/blob/main/Fine_Tune_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Pretrained GPT-2 checkpoint shared by the model and its tokenizer.
model_name = 'gpt2'

model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding so a padding token is
# guaranteed to exist when batches are assembled.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Training corpus: point the "train" entry at your own JSON-formatted file.
data_files = dict(train="dataset/articles.txt")
datasets = load_dataset(path="json", data_files=data_files)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, truncating each entry to at most 512 tokens."""
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# Run the tokenizer over the dataset in batches; the raw "title" and "text"
# columns are dropped from the result.
tokenized_datasets = datasets.map(
    function=tokenize_function,
    remove_columns=["title", "text"],
    batched=True,
)

In [None]:
# --- Filter empty sequences ---
def _has_tokens(example):
    """Return True when the example's tokenization produced at least one token."""
    return len(example['input_ids']) > 0

tokenized_datasets = tokenized_datasets.filter(_has_tokens)

Filter:   0%|          | 0/880 [00:00<?, ? examples/s]

In [None]:
# mlm=False selects causal-LM collation rather than masked-LM collation.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Shuffled mini-batches of 4 training examples, assembled by the collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=4,
)

In [None]:
# Training schedule: AdamW with weight decay and a linear learning-rate
# schedule with no warmup steps.
epochs = 5
optimizer = AdamW(params=model.parameters(), lr=2e-5, weight_decay=0.01)

# The schedule is sized to cover every batch of every epoch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Fine-tuning loop: for every batch run a forward pass with labels, back-propagate
# the loss, clip the gradient norm to 1.0, then step the optimizer and the
# learning-rate scheduler. The mean loss of each epoch is printed at its end.
model.train()

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    epoch_loss = 0.0
    batches_seen = 0  # batches that actually contributed a loss value

    for batch in tqdm(train_dataloader):
        # Skip empty sequences before paying for any device transfer.
        if batch['input_ids'].size(1) == 0:
            continue

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        batches_seen += 1

    # Average over the batches that were really trained on: skipped batches must
    # not dilute the mean, and an epoch with no usable batch reports NaN instead
    # of raising ZeroDivisionError.
    avg_loss = epoch_loss / batches_seen if batches_seen else float('nan')
    print(f'Average loss: {avg_loss:.4f}')

Epoch 1/5


100%|██████████| 220/220 [02:21<00:00,  1.55it/s]


Average loss: 3.2834
Epoch 2/5


100%|██████████| 220/220 [02:20<00:00,  1.57it/s]


Average loss: 3.1135
Epoch 3/5


100%|██████████| 220/220 [02:20<00:00,  1.56it/s]


Average loss: 3.0313
Epoch 4/5


100%|██████████| 220/220 [02:20<00:00,  1.57it/s]


Average loss: 2.9774
Epoch 5/5


100%|██████████| 220/220 [02:20<00:00,  1.57it/s]

Average loss: 2.9502





In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fine_tuned_model')

print("Training complete! Model saved to './fine_tuned_model'.")

Training complete! Model saved to './fine_tuned_model'.


In [None]:
# --- Gradio interface ---
import gradio as gr

# Put the model in evaluation mode on the selected device before serving.
model.to(device)
model.eval()

def generate_text(prompt, max_length=400):
    """Generate a sampled continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to continue. An empty or missing prompt is replaced by the
            tokenizer's beginning-of-sequence token so generation still has a
            token to start from.
        max_length: Upper bound on the total sequence length in tokens
            (prompt plus generated tokens).

    Returns:
        The decoded sequence (prompt followed by the generated continuation)
        with special tokens removed.
    """
    # An empty prompt tokenizes to zero tokens, which generate() cannot continue.
    if not prompt:
        prompt = tokenizer.bos_token

    # Truncate over-long prompts so at least one new token fits inside max_length;
    # prompts shorter than max_length are unaffected.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max(1, max_length - 1),
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Single two-line text box for the prompt; the generated text is shown as plain text.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")

iface = gr.Interface(
    title="Fine-Tuned GPT-2 Text Generator",
    fn=generate_text,
    inputs=[prompt_box],
    outputs="text",
)

iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://06b3c35acc80731dea.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


