In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **Introduction**
This project focuses on text generation using the GPT-2 (Generative Pretrained Transformer 2) model. GPT-2 is a powerful autoregressive Transformer developed by OpenAI, trained to predict the next word in a sequence based on previous context. It is widely used for tasks such as story generation, dialogue creation, and code completion.

We fine-tune GPT-2 on a custom text corpus consisting of literary works (e.g., Moby Dick, Hamlet) to teach the model domain-specific language patterns. Fine-tuning a pretrained language model allows it to adapt to new styles or topics without training from scratch, saving both time and resources.

After training, we use the model to generate coherent and creative text by providing it with prompts. The quality of the generated text is evaluated using common automatic metrics such as ROUGE, BLEU, and METEOR, which compare the generated outputs to reference texts based on word overlap and semantic similarity.

GPT-2 is chosen for this task due to its strong generative capabilities, open accessibility, and support from the Hugging Face Transformers library, which simplifies model loading, tokenization, and training.



In [None]:
# ===============================
# Load Dataset
# ===============================
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

selected_fileids = ['melville-moby_dick.txt' , 'shakespeare-hamlet.txt']  # Example: just 2 books
texts = [gutenberg.raw(fileid) for fileid in selected_fileids]

full_corpus = "\n".join(texts)


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
# ===============================
# Preprocessing
# ===============================
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\;\:\!\?\'\"]', '', text)
    text = text.strip()
    return text

clean_corpus = clean_text(full_corpus)


In [None]:
# ===============================
# Pre_trained Model
# ===============================
from transformers import AutoTokenizer, AutoModelForCausalLM , Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

model_name="gpt2"

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")


# Save cleaned text to file
with open("corpus.txt", "w") as f:
    f.write(clean_corpus)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=True
    )

train_dataset = load_dataset("corpus.txt", tokenizer)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Token indices sequence length is longer than the specified maximum sequence length for this model (329618 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# ===============================
# Train
# ===============================
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=16,
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    report_to="none"
)



trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

trainer.train()
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
161,4.4116
322,4.1288
483,3.9948
644,3.8944
805,3.8148
966,3.7564
1127,3.7095
1288,3.6837


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json',
 './gpt2-finetuned/tokenizer.json')

In [None]:
# ===============================
# Text Generation & Evaluation Prep
# ===============================
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load fine-tuned model and tokenizer
model_path = "./gpt2-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Define multiple prompts
prompts = ["Once upon a time",
           "To be or not to be",
           "The whale was nowhere to be seen",
           "The ship was ready to sail",
           "This is the king",
           "She loved him once.",
           "He is dead."]
references = [p.lower() for p in prompts]
hypotheses = []

# Generate text for each prompt
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt")
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"], #tells the model which tokens to focus on (1 = keep, 0 = ignore).
        max_length=50,
        do_sample=True,
        temperature=0.9, # Controls randomness of token selection (used with sampling).
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2 # Penalizes repeating phrases or tokens.
    )
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    hypotheses.append(generated_text)

# Show all outputs
print("\nGenerated Texts:")
for i, (prompt, output) in enumerate(zip(prompts, hypotheses)):
    print(f"\nPrompt {i+1}: {prompt}\nOutput: {output}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated Texts:

Prompt 1: Once upon a time
Output: Once upon a time, when the white whale's tremendous jaws and teeth should be so closely joined in them with those of other sharks which he then feeds on; it was not till about eight or ten that such unnatural jointing took place. as for

Prompt 2: To be or not to be
Output: To be or not to be, in a certain measure there are two main considerations involved here; both involving the consideration of this matter simultaneously. first: we must remember that all whales generally have their mouths cut into small incisionments below by other creatures

Prompt 3: The whale was nowhere to be seen
Output: The whale was nowhere to be seen, nor for a time could he have been heard from. "there are those among the whalemen who would rather not know their seamen than hear them speak out; and there is no way they can possibly

Prompt 4: The ship was ready to sail
Output: The ship was ready to sail; and she had but a few short steps between when i c

In [None]:
# ===============================
# Evaluation
# ===============================

!pip install rouge_score
!pip install evaluate



In [None]:
# ===============================
# We used three methods to evaluate our text generation model: ROUGE, BLEU, and METEOR.

# ROUGE checks how many words or phrases match between the generated text and the reference.

# BLEU looks for exact word matches and is good for translation tasks.

# METEOR considers similar words, word order, and meaning, which makes it better for creative or open-ended text.

# We chose to focus on METEOR because our model generates free-form sentences (like stories or literary lines), and METEOR gives a better idea of how close the meaning is to the original.
# ROUGE and BLEU are included for comparison, but METEOR is more useful for this kind of task.

# ===============================


import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

rouge_result = rouge.compute(predictions=hypotheses, references=references)
print("ROUGE:", rouge_result)

# BLEU expects references as list of lists . We can skip this metric
# bleu_result = bleu.compute(predictions=hypotheses, references=[[ref] for ref in references])
# print("BLEU:", bleu_result)


meteor_result = meteor.compute(predictions=hypotheses, references=references)
print("METEOR:", meteor_result)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


ROUGE: {'rouge1': np.float64(0.20625850340136057), 'rouge2': np.float64(0.1697648624667258), 'rougeL': np.float64(0.206734693877551), 'rougeLsum': np.float64(0.20573267144695717)}
METEOR: {'meteor': np.float64(0.5046195773847437)}
