In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
!pip install transformers datasets torch pandas nltk



In [38]:
import pandas as pd
import nltk

# Download NLTK tokenizer
nltk.download('punkt')

# Define dataset path (ensure dataset.csv is in your Google Drive)
dataset_path = "/content/drive/MyDrive/DSGP/dataset/dataset.csv"

# Load the dataset
birds_df = pd.read_csv(dataset_path)

# Drop rows with missing values
birds_df.dropna(inplace=True)

# Convert dataset to a training format
train_data = "\n".join(birds_df.apply(lambda row: f"The {row['Name']} is {row['Distinctive Features']}", axis=1))

# Save the formatted dataset to a text file for training
with open("/content/drive/MyDrive/DSGP/dataset/bird_data.txt", "w") as f:
    f.write(train_data)

print("Data Preprocessing Complete. Training data saved to bird_data.txt")


Data Preprocessing Complete. Training data saved to bird_data.txt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Resize token embeddings to match tokenizer size
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [40]:
from transformers import TextDataset, DataCollatorForLanguageModeling

# Function to load dataset into a format suitable for GPT-2 training
def load_dataset(file_path):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128  # Token length for training
    )

# Load train dataset
train_dataset = load_dataset("/content/drive/MyDrive/DSGP/dataset/bird_data.txt")

# Create a data collator for better model efficiency
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Masked language modeling is off (GPT-2 is autoregressive)
)



In [41]:
from transformers import Trainer, TrainingArguments

# Set training parameters
training_args = TrainingArguments(
    output_dir="./gpt2-bird-model",
    overwrite_output_dir=True,
    evaluation_strategy="no",  # No evaluation set, just training
    save_steps=500,  # Save model every 500 steps
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    logging_dir="./logs",
    logging_steps=100,  # Log loss every 100 steps
    report_to="none"  # Disable Weights & Biases logging
)




In [42]:
# Define Trainer for model fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Start training
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=4.514544486999512, metrics={'train_runtime': 54.3237, 'train_samples_per_second': 0.221, 'train_steps_per_second': 0.055, 'total_flos': 783876096000.0, 'train_loss': 4.514544486999512, 'epoch': 3.0})

In [43]:
# Print training loss
metrics = trainer.state.log_history
print("Training Loss Per Epoch:", metrics)

Training Loss Per Epoch: [{'train_runtime': 54.3237, 'train_samples_per_second': 0.221, 'train_steps_per_second': 0.055, 'total_flos': 783876096000.0, 'train_loss': 4.514544486999512, 'epoch': 3.0, 'step': 3}]


In [44]:
from transformers import pipeline

# Load trained GPT-2 model for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define a bird prompt
bird_prompt = "Describe the bird Red-vented Bulbul"

# Generate description
result = generator(bird_prompt, max_length=150, num_return_sequences=1)
print("Generated Description:", result[0]["generated_text"])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Description: Describe the bird Red-vented Bulbulia celus, Thetis Caelis bard:

The bleriyspecies, Capricorn friek

The male-winged friek, The-winged bard:

The hawker's-wings, Ambus sprucebill

A small-winged bard:
The bard's celt, Capricorn friek:

The king's celt:

The corgid's-crowned crow. Algorn lea:

The corgi's-crowned col-crown:

And the coifs of Theo.


In [47]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "/content/drive/MyDrive/DSGP/GPT2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print("✅ Model saved successfully!")


✅ Model saved successfully!


In [54]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "/content/drive/MyDrive/DSGP/GPT2/"

# Load model (if already trained)
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

print("✅ Model reloaded successfully!")


✅ Model reloaded successfully!


In [55]:
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

print("✅ Model saved successfully to Google Drive!")


✅ Model saved successfully to Google Drive!


In [56]:
import torch

# Save in .bin format
bin_model_path = model_path + "pytorch_model.bin"
torch.save(model.state_dict(), bin_model_path)

print("✅ Model saved as pytorch_model.bin in:", bin_model_path)


✅ Model saved as pytorch_model.bin in: /content/drive/MyDrive/DSGP/GPT2/pytorch_model.bin


In [57]:
import os

if "pytorch_model.bin" in os.listdir(model_path):
    print("✅ Model file exists!")
else:
    print("❌ Model file is still missing. Try re-running the save command.")


✅ Model file exists!
