In [1]:
# Install required libraries
!pip install transformers datasets torch numpy wandb huggingface_hub


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Import Packages and Set Device



In [2]:
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from datasets import Dataset
import wandb
import re

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


# Dataset Preprocessing

In [3]:
def preprocess_dataset(dataset):
    """Preprocesses the dataset by cleaning text."""
    def clean_text(text):
        # Replace multiple newlines with a single newline
        text = re.sub(r'\n{3,}', '\n\n', text.strip())
        # Normalize spaces
        text = re.sub(r' +', ' ', text)
        return text

    return dataset.map(lambda x: {"text": clean_text(x["text"])})


# Create Synthetic Dataset

In [4]:
def create_synthetic_textbook_data():
    """Generates synthetic Python textbook-style data."""
    textbook_data = [
        "Chapter 1: Python Basics\n\nPython is a dynamically typed language that supports object-oriented programming.",
        "Chapter 2: Data Structures\n\nLearn about lists, dictionaries, and sets in Python with examples.",
        "Chapter 3: Algorithms\n\nExplore sorting algorithms like merge sort, quick sort, and insertion sort.",
        "Chapter 4: Advanced Topics\n\nUnderstand design patterns such as Singleton and Factory in Python."
    ]
    return Dataset.from_dict({"text": textbook_data})


# Initialize Model and Tokenizer

In [5]:
def setup_model():
    """Initializes the tokenizer and model."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
    return tokenizer, model


# Tokenize Dataset

In [6]:
def tokenize_dataset(dataset, tokenizer):
    """Tokenizes the dataset for training."""
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_special_tokens_mask=True
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )
    tokenized_dataset.set_format(type="torch", device=device)
    return tokenized_dataset


# Train the Model

In [7]:
def train_model(model, tokenizer, dataset):
    """Trains the model using the synthetic dataset."""
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    training_args = TrainingArguments(
        output_dir="./model_output",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        warmup_steps=50,
        weight_decay=0.01,
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        report_to="wandb"
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )

    trainer.train()
    return trainer


# Evaluate Model

In [8]:
def evaluate_model(model, tokenizer, prompt):
    """Generates text based on a prompt."""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs.input_ids,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Execution Steps

In [9]:
# Initialize wandb
wandb.init(project="textbooks_case_study", name="textbooks-experiment")

# Create dataset
print("Creating dataset...")
dataset = create_synthetic_textbook_data()
dataset = preprocess_dataset(dataset)

# Setup model and tokenizer
print("Initializing model and tokenizer...")
tokenizer, model = setup_model()

# Train the model
print("Training the model...")
trainer = train_model(model, tokenizer, dataset)

# Save the model
print("Saving the model...")
model.save_pretrained("./model_output")
tokenizer.save_pretrained("./model_output")

# Test the model
test_prompt = "Explain how to create a dictionary in Python:"
print("Generating text...")
generated_text = evaluate_model(model, tokenizer, test_prompt)
print(f"Generated text:\n{generated_text}")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Creating dataset...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Initializing model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Training the model...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]



Step,Training Loss


Saving the model...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating text...
Generated text:
Explain how to create a dictionary in Python:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139


In [10]:
test_prompt = """
def calculate_average(numbers):
    '''
"""

print("Testing generation...")
generated_code = evaluate_model(model, tokenizer, test_prompt)
print(f"Generated code:\n{generated_code}")

Testing generation...
Generated code:

def calculate_average(numbers):
    '''
for i in range (i.len()): if n = 0: print " %s for {% s}" .format((r, r)[0])) else : return True def get_total(): """ Returns the number of times a given integer is calculated by multiplying its sum with an average value from that list and then returns it to `sum`. The result will be returned as soon after using this method on all integers are equal or greater than 1 respectively except when they have been rounded down so far into multiple digits can not use more numbers at once because rounding errors occur during arithmetic operations such like multiplication etc... If you want your results only one
