In [3]:
# ==============================================================================
# STEP 0: Setup - Install Libraries and Check GPU
# ==============================================================================
print("Installing necessary libraries...")
!pip install -q transformers datasets accelerate torch

import torch
import os
import math
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from google.colab import files # For file uploads

print("Checking GPU availability...")
if torch.cuda.is_available():
    print("✅ GPU is available!")
    device = torch.device("cuda")
    # Set fp16=True in TrainingArguments for faster training on compatible GPUs
    use_fp16 = True
else:
    print("⚠️ GPU not available, using CPU. Training will be significantly slower.")
    device = torch.device("cpu")
    use_fp16 = False
print("-" * 30)

# ==============================================================================
# STEP 1: Configuration
# ==============================================================================
print("Configuring parameters...")

# --- Model Configuration ---
MODEL_NAME = "gpt2"  # Base model: "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"
                  # Choose based on your resources. Larger models need more VRAM/time.

# --- Data Configuration ---
# Option 1: Use Dummy Data (included below)
USE_DUMMY_DATA = True # Set to False if you want to upload your own file

# Option 2: Path to your custom data file (if USE_DUMMY_DATA = False)
# Ensure this file is uploaded or accessible in your Colab environment
CUSTOM_DATA_FILE = "my_custom_data.txt" # Change this if your filename is different

# --- Training Hyperparameters ---
OUTPUT_DIR = "./gpt2-finetuned-custom"      # Where the fine-tuned model will be saved
NUM_TRAIN_EPOCHS = 1                      # Start with 1-3 epochs. Increase cautiously.
PER_DEVICE_TRAIN_BATCH_SIZE = 4           # Lower this if you encounter CUDA Out-of-Memory errors (e.g., 2 or 1)
LEARNING_RATE = 5e-5                      # Common starting point for fine-tuning (AdamW optimizer)
WARMUP_STEPS = 100                        # Number of steps for learning rate warmup
WEIGHT_DECAY = 0.01                       # Regularization parameter
LOGGING_STEPS = 100                       # How often to log training metrics (loss, etc.)
SAVE_STEPS = 500                          # How often to save a model checkpoint during training
SAVE_TOTAL_LIMIT = 2                      # Maximum number of checkpoints to keep (saves disk space)

# --- Tokenizer Configuration ---
# Use model's max length or choose a smaller size if memory is limited
# GPT-2 max length is 1024
BLOCK_SIZE = 512                          # Process data in chunks of this size. Adjust based on VRAM.

print(f"Using model: {MODEL_NAME}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Training epochs: {NUM_TRAIN_EPOCHS}")
print(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Use FP16: {use_fp16}")
print(f"Block size: {BLOCK_SIZE}")
print("-" * 30)

# ==============================================================================
# STEP 2: Prepare Data
# ==============================================================================
print("Preparing dataset...")

data_file_path = None

if USE_DUMMY_DATA:
    print("Using dummy dataset (Quantum Computing)...")
    dummy_data_content = """
Introduction to Quantum Computing:
Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to perform computations. Unlike classical bits (0 or 1), qubits can exist in multiple states simultaneously.

Key Concepts in Quantum Computing:
Superposition: A qubit can be 0, 1, or a combination of both until measured.
Entanglement: Two or more qubits can be linked in such a way that their fates are intertwined, regardless of the distance separating them. Measuring one instantly influences the other.
Quantum Gates: Analogous to classical logic gates, these manipulate qubit states (e.g., Hadamard gate, CNOT gate).

Challenges in Quantum Computing:
Decoherence: Qubits are fragile and lose their quantum state due to environmental interactions. Maintaining coherence is a major hurdle.
Error Correction: Quantum errors are complex to correct due to the no-cloning theorem.
Scalability: Building stable quantum computers with a large number of high-quality qubits is difficult.

Applications of Quantum Computing:
Drug Discovery: Simulating molecules to accelerate the development of new medicines.
Materials Science: Designing novel materials with specific properties.
Cryptography: Breaking current encryption standards (e.g., Shor's algorithm) and developing quantum-resistant cryptography.
Optimization: Solving complex optimization problems faster than classical computers.

Advanced Topics:
Quantum Annealing: A heuristic optimization algorithm using quantum fluctuations.
Topological Qubits: More robust qubits based on topological properties.
Quantum Machine Learning: Exploring the intersection of quantum computing and ML.
"""
    data_file_path = "dummy_custom_data.txt"
    # Repeat the dummy data to make the dataset slightly larger for demonstration
    with open(data_file_path, "w") as f:
        for _ in range(20): # Write the content 20 times
             f.write(dummy_data_content + "\n\n") # Add extra newline for separation
    print(f"Dummy data written to {data_file_path}")

else:
    print(f"Attempting to use custom data file: {CUSTOM_DATA_FILE}")
    if os.path.exists(CUSTOM_DATA_FILE):
        print(f"Found existing file: {CUSTOM_DATA_FILE}")
        data_file_path = CUSTOM_DATA_FILE
    else:
        print(f"File '{CUSTOM_DATA_FILE}' not found. Please upload your .txt dataset file.")
        try:
            uploaded = files.upload()
            if not uploaded:
                raise ValueError("No file uploaded.")
            # Get the first uploaded file name
            uploaded_filename = list(uploaded.keys())[0]
            # Rename it to CUSTOM_DATA_FILE if necessary, or just use the uploaded name
            if uploaded_filename != CUSTOM_DATA_FILE:
                 print(f"Uploaded file '{uploaded_filename}', renaming to '{CUSTOM_DATA_FILE}' for consistency.")
                 os.rename(uploaded_filename, CUSTOM_DATA_FILE)

            data_file_path = CUSTOM_DATA_FILE
            print(f"Successfully uploaded and using: {data_file_path}")
        except Exception as e:
            print(f"Error during file upload: {e}")
            print("Please ensure you upload a single .txt file when prompted.")
            # Stop execution if data loading fails
            raise SystemExit("Data loading failed.")

# --- Load the dataset ---
if data_file_path:
    try:
        # Load dataset using 'text' type for plain text files
        # Assumes one document/example per line, or treats the whole file as one long string
        raw_datasets = load_dataset('text', data_files={'train': data_file_path})
        print("\nDataset loaded successfully:")
        print(raw_datasets)
        # Display a sample
        print("\nSample data (first 500 chars of first entry):")
        print(raw_datasets['train'][0]['text'][:500])
    except Exception as e:
        print(f"\nError loading dataset from file {data_file_path}: {e}")
        print("Ensure the file exists, is readable, and is a plain text (.txt) file.")
        raise SystemExit("Dataset loading failed.")
else:
     raise SystemExit("No data file path specified or found. Cannot proceed.")

print("-" * 30)

# ==============================================================================
# STEP 3: Load Tokenizer and Model
# ==============================================================================
print(f"Loading tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set pad token if it's not already set (GPT-2 usually requires this)
if tokenizer.pad_token is None:
    print("Setting pad_token to eos_token")
    tokenizer.pad_token = tokenizer.eos_token

print(f"\nLoading model '{MODEL_NAME}'...")
# Load the model for Causal Language Modeling (text generation)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Move model to the appropriate device (GPU or CPU)
model.to(device)

# Resize token embeddings if new tokens were added (not typical for basic fine-tuning)
# model.resize_token_embeddings(len(tokenizer))

print("\nTokenizer and model loaded successfully.")
print(f"Model loaded on: {model.device}")
print("-" * 30)

# ==============================================================================
# STEP 4: Tokenize and Prepare Data for Training
# ==============================================================================
print("Tokenizing dataset...")

# Tokenization function
def tokenize_function(examples):
    # This simple approach tokenizes each text individually.
    # Padding/truncation will be handled by the data collator.
     return tokenizer(examples["text"],
                      # truncation=True, # Option 1: Truncate long examples
                      # max_length=BLOCK_SIZE,
                      add_special_tokens=True)

# More advanced function to concatenate texts and chunk them into blocks
# This can be more efficient for training but requires careful handling
def group_texts(examples):
    # Concatenate all texts. Add EOS token between documents.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= BLOCK_SIZE:
        total_length = (total_length // BLOCK_SIZE) * BLOCK_SIZE
    # Split by chunks of BLOCK_SIZE.
    result = {
        k: [t[i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
        for k, t in concatenated_examples.items()
    }
    # Create labels for Causal LM (predict the next token)
    result["labels"] = result["input_ids"].copy()
    return result

# --- Apply tokenization ---
# First, tokenize individual lines/documents
tokenized_datasets_intermediate = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names, # Remove original 'text' column
    desc="Running tokenizer on dataset",
)

# Then, group into blocks (optional but recommended for Causal LM)
tokenized_datasets = tokenized_datasets_intermediate.map(
    group_texts,
    batched=True,
    desc=f"Grouping texts into chunks of {BLOCK_SIZE}",
)

print("\nTokenization and grouping complete.")
print("Example of processed data:")
print(tokenized_datasets["train"][0])

# --- Data Collator ---
# Handles dynamic padding within batches and prepares labels for Causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # False for Causal LM (GPT-2), True for Masked LM (BERT)
)
print("\nData collator configured for Causal LM.")
print("-" * 30)

# ==============================================================================
# STEP 5: Configure Training Arguments
# ==============================================================================
print("Setting up Training Arguments...")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,                  # Directory to save model checkpoints and logs
    overwrite_output_dir=True,              # Overwrite the content of the output directory
    num_train_epochs=NUM_TRAIN_EPOCHS,      # Total number of training epochs
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE, # Batch size per GPU/CPU
    learning_rate=LEARNING_RATE,            # Initial learning rate
    warmup_steps=WARMUP_STEPS,              # Number of warmup steps for learning rate scheduler
    weight_decay=WEIGHT_DECAY,              # Strength of weight decay regularization
    logging_dir='./logs',                   # Directory for storing logs (e.g., TensorBoard)
    logging_steps=LOGGING_STEPS,            # Log training metrics every X steps
    save_steps=SAVE_STEPS,                  # Save a checkpoint every X steps
    save_total_limit=SAVE_TOTAL_LIMIT,      # Limit the total number of checkpoints saved
    fp16=use_fp16,                          # Enable mixed precision training if GPU is available and compatible
    report_to="none",                       # Disable external reporting integrations (like WandB/TensorBoard) for simplicity
    # evaluation_strategy="steps",          # Uncomment if you have an eval dataset
    # eval_steps=SAVE_STEPS,                # Evaluate every 'eval_steps'
    # per_device_eval_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE * 2, # Batch size for evaluation
    # load_best_model_at_end=True,          # Uncomment if using evaluation to load the best model found
    # metric_for_best_model="loss",         # Uncomment if using evaluation
)

print("Training Arguments configured.")
print("-" * 30)

# ==============================================================================
# STEP 6: Initialize Trainer and Start Fine-Tuning
# ==============================================================================
print("Initializing Trainer...")

trainer = Trainer(
    model=model,                            # The instantiated Transformers model to be trained
    args=training_args,                     # Training arguments, defined above
    train_dataset=tokenized_datasets["train"], # Training dataset
    # eval_dataset=tokenized_datasets["validation"], # Evaluation dataset (optional)
    tokenizer=tokenizer,                    # Tokenizer for saving purposes
    data_collator=data_collator,            # Data collator to handle batching and padding
)

print("Trainer initialized. Starting fine-tuning...")
print(f"Training for {NUM_TRAIN_EPOCHS} epochs...")

# --- Start Training ---
try:
    train_result = trainer.train()
    print("\n✅ Fine-tuning finished successfully!")

    # --- Log some metrics ---
    metrics = train_result.metrics
    metrics["train_samples"] = len(tokenized_datasets["train"])
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    # --- Calculate perplexity if evaluation was done ---
    # Needs an eval_dataset configured in Trainer and evaluation_strategy != "no"
    # try:
    #     eval_metrics = trainer.evaluate()
    #     perplexity = math.exp(eval_metrics["eval_loss"])
    #     print(f"\nPerplexity on evaluation set: {perplexity:.2f}")
    #     trainer.log_metrics("eval", eval_metrics)
    #     trainer.save_metrics("eval", eval_metrics)
    # except KeyError:
    #     print("\nEvaluation metrics not found (is eval_dataset provided and evaluation_strategy set?).")
    # except Exception as e:
    #      print(f"\nError during evaluation: {e}")

except torch.cuda.OutOfMemoryError:
    print("\n❌ CUDA Out of Memory Error!")
    print("Training stopped. Suggestions:")
    print(f"  - Decrease `PER_DEVICE_TRAIN_BATCH_SIZE` (current: {PER_DEVICE_TRAIN_BATCH_SIZE})")
    print(f"  - Decrease `BLOCK_SIZE` (current: {BLOCK_SIZE})")
    print(f"  - Use a smaller model from the {MODEL_NAME} family (e.g., 'gpt2' instead of 'gpt2-medium')")
    print("  - If using Colab Pro, consider upgrading to a High-RAM runtime.")
    # Clean up GPU memory
    del model
    del trainer
    torch.cuda.empty_cache()
    raise SystemExit("OOM Error during training.") # Stop execution

except Exception as e:
    print(f"\n❌ An unexpected error occurred during training: {e}")
    raise SystemExit("Training failed.") # Stop execution

print("-" * 30)

# ==============================================================================
# STEP 7: Save the Fine-Tuned Model and Tokenizer
# ==============================================================================
final_model_path = f"{OUTPUT_DIR}-final"
print(f"Saving the final fine-tuned model and tokenizer to: {final_model_path}")

try:
    # Save the trained model weights and configuration
    trainer.save_model(final_model_path)
    # Save the tokenizer configuration as well (important!)
    tokenizer.save_pretrained(final_model_path)
    print("Model and tokenizer saved successfully.")
except Exception as e:
    print(f"Error saving model/tokenizer: {e}")

print("-" * 30)

# ==============================================================================
# STEP 8: Test Adaptation - Generate Text Comparison
# ==============================================================================
print("Testing model adaptation: Generating text...")

# --- Crucial: Define a prompt relevant to YOUR domain ---
# Replace this example prompt with one related to your custom dataset!
if USE_DUMMY_DATA:
    prompt = "In quantum computing, entanglement describes the phenomenon where"
else:
    prompt = "YOUR_DOMAIN_SPECIFIC_PROMPT_HERE" # <--- CHANGE THIS !!!
    print(f"⚠️ Using placeholder prompt: '{prompt}'. Please change it to be relevant to your domain '{CUSTOM_DATA_FILE}' for meaningful comparison.")


# --- Generation Parameters ---
max_new_tokens = 70      # Max number of *new* tokens to generate after the prompt
num_sequences = 2       # Number of different sequences to generate
temperature = 0.7       # Controls randomness (lower = more deterministic, higher = more random)
top_k = 50              # Consider only the top K most likely tokens at each step
top_p = 0.9             # Consider only tokens whose cumulative probability is >= P (nucleus sampling)
do_sample = True        # Whether to use sampling; False means greedy decoding

generation_config = {
    "max_new_tokens": max_new_tokens,
    "num_return_sequences": num_sequences,
    "temperature": temperature,
    "top_k": top_k,
    "top_p": top_p,
    "do_sample": do_sample,
    "pad_token_id": tokenizer.eos_token_id # Use EOS token for padding during generation
}

print(f"\nUsing prompt: '{prompt}'")
print(f"Generation parameters: {generation_config}")

# --- Load Original Model Pipeline ---
generator_original = None
try:
    print(f"\nLoading original '{MODEL_NAME}' pipeline...")
    generator_original = pipeline(
        'text-generation',
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        device=0 if torch.cuda.is_available() else -1 # Use GPU 0 if available, else CPU
    )
    print("Original model pipeline loaded.")
except Exception as e:
    print(f"Error loading original model pipeline: {e}")

# --- Load Fine-tuned Model Pipeline ---
generator_finetuned = None
if os.path.exists(final_model_path):
    try:
        print(f"\nLoading fine-tuned model pipeline from '{final_model_path}'...")
        generator_finetuned = pipeline(
            'text-generation',
            model=final_model_path,   # Path to your saved model directory
            tokenizer=final_model_path, # Path to your saved tokenizer directory
            device=0 if torch.cuda.is_available() else -1 # Use GPU 0 if available, else CPU
        )
        print("Fine-tuned model pipeline loaded.")
    except Exception as e:
        print(f"Error loading fine-tuned model pipeline: {e}")
else:
    print(f"Fine-tuned model path '{final_model_path}' not found. Skipping fine-tuned generation.")


# --- Generate and Compare ---
print("\n--- Generating with ORIGINAL model ---")
if generator_original:
    try:
        outputs_original = generator_original(prompt, **generation_config)
        for i, output in enumerate(outputs_original):
            print(f"{i+1}: {output['generated_text']}")
    except Exception as e:
        print(f"Error during original model generation: {e}")
else:
    print("Skipping original model generation.")


print("\n--- Generating with FINE-TUNED model ---")
if generator_finetuned:
    try:
        outputs_finetuned = generator_finetuned(prompt, **generation_config)
        for i, output in enumerate(outputs_finetuned):
            print(f"{i+1}: {output['generated_text']}")
    except Exception as e:
        print(f"Error during fine-tuned model generation: {e}")
else:
    print("Skipping fine-tuned model generation.")


# --- Qualitative Analysis Guidance ---
print("\n--- Analysis ---")
print("Compare the outputs above. Consider:")
print("  - Relevance: Does the fine-tuned output seem more focused on the domain of your custom data?")
print("  - Terminology: Does the fine-tuned model use specific words or jargon from your dataset?")
print("  - Style: Is the tone/style (e.g., formal, technical, conversational) closer to your data?")
print("  - Coherence: Are the generations logical and well-structured within the domain context?")
print("\nNOTE: Significant adaptation depends heavily on the size and quality of your custom dataset,")
print("      as well as the chosen hyperparameters (epochs, learning rate, etc.).")
print("      Experimentation might be needed for optimal results.")
print("-" * 30)
print("Script finished.")
# ==============================================================================

Installing necessary libraries...
Checking GPU availability...
⚠️ GPU not available, using CPU. Training will be significantly slower.
------------------------------
Configuring parameters...
Using model: gpt2
Output directory: ./gpt2-finetuned-custom
Training epochs: 1
Batch size: 4
Learning rate: 5e-05
Use FP16: False
Block size: 512
------------------------------
Preparing dataset...
Using dummy dataset (Quantum Computing)...
Dummy data written to dummy_custom_data.txt


Generating train split: 0 examples [00:00, ? examples/s]


Dataset loaded successfully:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 520
    })
})

Sample data (first 500 chars of first entry):

------------------------------
Loading tokenizer for 'gpt2'...
Setting pad_token to eos_token

Loading model 'gpt2'...

Tokenizer and model loaded successfully.
Model loaded on: cpu
------------------------------
Tokenizing dataset...


Running tokenizer on dataset:   0%|          | 0/520 [00:00<?, ? examples/s]

Grouping texts into chunks of 512:   0%|          | 0/520 [00:00<?, ? examples/s]


Tokenization and grouping complete.
Example of processed data:
{'input_ids': [21906, 284, 29082, 38589, 25, 24915, 388, 14492, 17124, 1095, 14821, 12370, 19428, 588, 2208, 9150, 290, 920, 648, 1732, 284, 1620, 2653, 602, 13, 12101, 15993, 10340, 357, 15, 393, 352, 828, 627, 9895, 460, 2152, 287, 3294, 2585, 11640, 13, 9218, 50053, 287, 29082, 38589, 25, 12442, 9150, 25, 317, 627, 2545, 460, 307, 657, 11, 352, 11, 393, 257, 6087, 286, 1111, 1566, 8630, 13, 14539, 648, 1732, 25, 4930, 393, 517, 627, 9895, 460, 307, 6692, 287, 884, 257, 835, 326, 511, 277, 689, 389, 45905, 11, 7692, 286, 262, 5253, 27259, 606, 13, 2185, 45925, 530, 11101, 16717, 262, 584, 13, 24915, 388, 15953, 25, 50088, 516, 284, 15993, 9156, 17435, 11, 777, 18510, 627, 2545, 2585, 357, 68, 13, 70, 1539, 11161, 321, 446, 8946, 11, 327, 11929, 8946, 737, 41812, 34120, 287, 29082, 38589, 25, 10707, 78, 23545, 25, 1195, 549, 896, 389, 21049, 290, 4425, 511, 14821, 1181, 2233, 284, 6142, 12213, 13, 337, 2913, 1397, 763, 23

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss



✅ Fine-tuning finished successfully!
***** train metrics *****
  epoch                    =        1.0
  total_flos               =     2920GF
  train_loss               =     2.4492
  train_runtime            = 0:02:15.18
  train_samples            =         12
  train_samples_per_second =      0.089
  train_steps_per_second   =      0.022
------------------------------
Saving the final fine-tuned model and tokenizer to: ./gpt2-finetuned-custom-final
Model and tokenizer saved successfully.
------------------------------
Testing model adaptation: Generating text...

Using prompt: 'In quantum computing, entanglement describes the phenomenon where'
Generation parameters: {'max_new_tokens': 70, 'num_return_sequences': 2, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.9, 'do_sample': True, 'pad_token_id': 50256}

Loading original 'gpt2' pipeline...


Device set to use cpu


Original model pipeline loaded.

Loading fine-tuned model pipeline from './gpt2-finetuned-custom-final'...


Device set to use cpu


Fine-tuned model pipeline loaded.

--- Generating with ORIGINAL model ---
1: In quantum computing, entanglement describes the phenomenon where a photon, a photon with many particles, is entangled with a black hole.

The entanglement can be represented by two different types of entangled particles: a black hole and a quantum field.

The black hole is the most dense of the particles and is the only one with any energy.

The quantum field is the most
2: In quantum computing, entanglement describes the phenomenon where one state of the system is in a state that is not in any other state. In quantum computation, this is called quantum entanglement.

For quantum computing, entanglement is a state that is not in any other state.

For quantum computation, this is called quantum entanglement.

Quantum entanglement

--- Generating with FINE-TUNED model ---
1: In quantum computing, entanglement describes the phenomenon where a system of particles is entangled by an entangled state. The entangleme