In [1]:
import os
import datetime
import math
import random
from utils import save_data, load_data

import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

from openai import OpenAI
from pathlib import Path

## Inspecting the Phi-1 model

In [10]:
torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1", trust_remote_code=True)

  return self.fget.__get__(instance, owner)()


In [11]:
summary(model)

Layer (type:depth-idx)                                  Param #
PhiForCausalLM                                          --
├─PhiModel: 1-1                                         --
│    └─Embedding: 2-1                                   104,857,600
│    └─Dropout: 2-2                                     --
│    └─ModuleList: 2-3                                  --
│    │    └─PhiDecoderLayer: 3-1                        50,354,176
│    │    └─PhiDecoderLayer: 3-2                        50,354,176
│    │    └─PhiDecoderLayer: 3-3                        50,354,176
│    │    └─PhiDecoderLayer: 3-4                        50,354,176
│    │    └─PhiDecoderLayer: 3-5                        50,354,176
│    │    └─PhiDecoderLayer: 3-6                        50,354,176
│    │    └─PhiDecoderLayer: 3-7                        50,354,176
│    │    └─PhiDecoderLayer: 3-8                        50,354,176
│    │    └─PhiDecoderLayer: 3-9                        50,354,176
│    │    └─PhiDecoderLayer: 

In [None]:
inputs = tokenizer('''
Hello my name is             
''', return_tensors="pt", return_attention_mask=False)
print(inputs)

outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

## Replicating the textbook dataset creation

### Loading data

According to the original authors, they use three different source datasets:
- A filtered code-language dataset, which is a subset of The Stack and StackOverflow, obtained by
using a language model-based classifier (consisting of about 6B tokens).
- A synthetic textbook dataset consisting of <1B tokens of GPT-3.5 generated Python textbooks.
- A small synthetic exercises dataset consisting of ∼180M tokens of Python exercises and solutions.

In [2]:
from datasets import load_dataset

# Load the dataset in streaming mode
ds = load_dataset("bigcode/the-stack", data_dir="data/python", streaming=True, split="train")

# Initialize a counter
counter = 0

# Iterate over the dataset
dataset = {
    "sample": [],
    "label": [],
    "logprob": []
}

for sample in ds:
    dataset["sample"].append(sample["content"])
    counter += 1
    if counter >= 10000:
        break

Downloading readme:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

In [None]:
# Randomly check some examples from the dataset
import random 

random_samples = random.choices(dataset["sample"], k=3)
for i in random_samples: 
    print(i)
    print("-------")

### Label data with GPT-3.5-turbo

In [3]:
from openai import OpenAI
import os

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
def gpt_labeling(sample: str, model_type: str="gpt-3.5-turbo-0125"): 
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": """
                You are an AI assistant and your job is to classify. 
                Your job is to determine its educational value for a student whose goal is to learn basic coding concepts. 
            
                Here are the main points if an example is of a bad quality: 
                - Many samples are not self-contained, meaning that they depend on other modules or files that are
                external to the snippet, making them hard to understand without additional context.
                - Typical examples do not involve any meaningful computation, but rather consist of trivial or boil-
                erplate code, such as defining constants, setting parameters, or configuring GUI elements.
                - Samples that do contain algorithmic logic are often buried inside complex or poorly documented
                functions, making them difficult to follow or learn from.
                - The examples are skewed towards certain topics or use cases, resulting in an unbalanced distribution
                of coding concepts and skills across the dataset.
            
                If the educational value is high, return a 1. If the educational value is low, return a 0. 
                Return ONLY a number and nothing else. Otherwise I will NOT process your output!
            """},
            {"role": "user", "content": f"Code example: {sample[:10000]}"},
            {"role": "user", "content": "Classification: "}
        ],
        temperature=0.0,
        logprobs=True,
        logit_bias={15: 1, 16: 1},
        max_tokens=1, 
    )

    return response

In [5]:
for i in tqdm(range(len(dataset["sample"]))): 
    # Label data with GPT-3.5
    response = gpt_labeling(sample=dataset["sample"][i])

    # Get the label from the response
    label = response.choices[0].message.content
    logprobs = response.choices[0].logprobs.content[0].logprob

    # Add the label and prob to the dataset
    dataset["label"].append(int(label))
    dataset["logprob"].append(float(logprobs))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [12]:
training_df = pd.DataFrame(dataset)

In [13]:
# Check label distribution
training_df["label"].value_counts()

0    564
1    436
Name: label, dtype: int64

In [15]:
save_data(training_df, "./data/training-subset-labeled-1000.parquet")

### Training a random forest classifier

In [35]:
df = load_data("./data/training-subset-labeled.parquet")

X = df["sample"].tolist()
y = df["label"].tolist()

In [36]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("microsoft/codebert-base")

X_embedded = model.encode(X)

No sentence-transformers model found with name microsoft/codebert-base. Creating a new one with MEAN pooling.


In [37]:
# import requests
# import os 

# hf_key = os.getenv("HF_API_KEY")

# API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small-v2"
# headers = {"Authorization": f"Bearer {hf_key}"}

# def query(texts):
#     response = requests.post(API_URL, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
#     return response.json()
	
# X_embedded = query(texts=X)

In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.2, random_state=42)

In [39]:
X_train[0]

array([-5.07316470e-01,  2.28404537e-01,  3.19959611e-01, -8.99160653e-02,
       -3.03865075e-01, -7.22756386e-01, -1.73169514e-03,  4.02829677e-01,
        3.07657182e-01,  4.90501344e-01, -2.62190342e-01,  8.52692842e-01,
       -2.67546445e-01, -3.12847883e-01,  8.49323392e-01, -2.10826129e-01,
        2.10718408e-01,  4.13056642e-01, -1.18505880e-02, -1.34410530e-01,
       -2.42161959e-01, -2.29697570e-01,  6.18695974e-01, -7.99583077e-01,
        3.46429884e-01,  4.40871119e-01, -6.44949675e-02,  7.53669143e-01,
       -5.93237162e-01,  9.20362294e-01, -2.09415555e-01,  2.70892143e-01,
        1.42405534e+00,  1.58426315e-01,  5.43486834e-01, -4.18687552e-01,
       -5.12024939e-01,  2.20933735e-01,  1.29838452e-01, -4.68540162e-01,
       -1.08978644e-01,  5.91604114e-01, -9.44795370e-01, -2.06896085e-02,
        4.49479342e-01,  3.86463046e-01,  5.64160347e-01, -2.75487095e-01,
        5.91123551e-02,  6.77372217e-01,  5.91932654e-01,  2.54811972e-01,
       -6.32566094e-01, -

In [40]:
y_train[:3]

[0, 0, 0]

In [41]:
from sklearn.linear_model import LogisticRegression

In [44]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)
print(y_pred)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.8


### Generating the synthetic textbook dataset

In [2]:
from openai import OpenAI
import os

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def gpt_data_generation(topic: str, model_type: str="gpt-3.5-turbo-0125"): 
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": """
                You are an expert on Python and an author of Python textbooks. 
                Your job is to create snippets of Python code with detailed English explanations. 
                The explanation should be at least five to eight sentences and be places above and below the code. 
                Ensure that all the code is of a very high quality and doesn't involve repetitive examples.
                Include comments in the generated code.
            """},
            {"role": "user", "content": f"The code and text should be about {topic}"},
            {"role": "user", "content": "Textbook snippet: "}
        ],
        temperature=0.4,
        max_tokens=512, 
    )

    return response.choices[0].message.content

In [4]:
topics = [
    "Introduction to Python: Basic syntax, variables, and data types",
    "Control Flow: Conditional statements (if, elif, else) and loops (for, while)",
    "Functions: Defining and calling functions, parameter passing",
    "Data Structures: Lists, tuples, dictionaries, sets",
    "File Handling: Reading from and writing to files",
    "Exception Handling: Handling errors and exceptions gracefully",
    "Object-Oriented Programming (OOP): Classes, objects, inheritance, polymorphism",
    "Modules and Packages: Importing and using external libraries",
    "String Manipulation: String methods, formatting, regular expressions",
    "Working with Dates and Times: Date objects, timedelta, formatting dates",
    "Input/Output: User input, output formatting",
    "List Comprehensions: Concise way to create lists",
    "Generators and Iterators: Iterable objects, yield statement",
    "Recursion: Functions calling themselves, solving problems recursively",
    "Functional Programming: Lambda functions, map, filter, reduce",
    "Debugging Techniques: Using print statements, debugging tools",
    "Testing: Writing and running tests using unittest or pytest",
    "Web Scraping: Extracting data from websites using libraries like BeautifulSoup",
    "GUI Programming: Creating graphical user interfaces with Tkinter or PyQt",
    "Data Visualization: Creating charts, graphs, and plots with libraries like Matplotlib or Seaborn",
    "NumPy: Introduction to numerical computing in Python",
    "Pandas: Data manipulation and analysis library for Python",
    "PyTorch: Deep learning framework for building and training neural networks"
]

In [None]:
# Set the base directory for saving the Parquet files
base_dir = "./data/"

# Define the number of total samples and save intervals
num_samples = 10000
save_interval = 250

# Initialize an empty list to store generated data points
datapoints = []

# Create or get the latest file name based on the current iteration
for i in tqdm(range(num_samples), leave=False):
    if i % save_interval == 0:
        random_topic = random.choice(topics)

        # Generate a textbook snippet using GPT-3.5
        textbook_page = gpt_data_generation(topic=random_topic)
        datapoints.append(textbook_page)

        # Create DataFrame from generated data points
        textbook_df = pd.DataFrame({"sample": datapoints})

        # Determine the next filename
        current_file = max(Path(base_dir).glob("synthetic-textbook*.parquet"), key=os.path.getctime)
        filename = f"synthetic-textbook-{int(os.path.splitext(current_file.name)[0].split('-')[-1]) + 1}.parquet" if current_file else "synthetic-textbook-01.parquet"
        print(f"Saving to {filename}...")

        # Save the DataFrame to Parquet file
        textbook_df.to_parquet(f"{base_dir}/{filename}", index=False)

        # Clear the list to prepare for new data points
        datapoints = []
    else:
        random_topic = random.choice(topics)
        textbook_page = gpt_data_generation(topic=random_topic)
        datapoints.append(textbook_page)

### Concatinating the datasets

In [6]:
# Get list of all parquet files in the directory
parquet_files = [f for f in os.listdir("./data") if f.endswith(".parquet")]

texts = []

# Loop through each parquet file
for file in parquet_files:
    df = pd.read_parquet(os.path.join("./data", file))
    texts.extend(df["sample"].tolist())

# Add top token to all texts
texts = [t + " <|endoftext|>" for t in texts]

print(texts[:100])

["PyTorch is a popular deep learning framework that provides a flexible platform for building and training neural networks. It offers dynamic computation graphs, which allow for easy debugging and efficient model training. One of the key features of PyTorch is its seamless integration with NumPy, making it easy to convert NumPy arrays to PyTorch tensors and vice versa. PyTorch also supports GPU acceleration, enabling faster computation for training large neural networks. Additionally, PyTorch provides a rich set of tools and libraries for tasks such as data loading, model optimization, and visualization, making it a comprehensive framework for deep learning research and applications.\n\n```python\n# Importing the necessary libraries\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\n\n# Define a simple neural network using PyTorch\nclass SimpleNN(nn.Module):\n    def __init__(self):\n        super(SimpleNN, self).__init__()\n        self.fc = nn.Linear(10, 1)  # Fully c

In [13]:
import tiktoken

def tiktoken_token_counter(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    @param text: Text you want to count the number of tokens in
    @return: Integer with the token count
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text.replace("<|endoftext|>", ""))
    return len(tokens)

token_count = 0
for t in texts:
    tokens = tiktoken_token_counter(t)
    token_count += int(tokens)

print(token_count)

3134907


In [16]:
from torch.utils.data import Dataset, DataLoader, random_split

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()

# Instantiate your tokenizer (replace 'tokenizer' with your actual tokenizer)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

# Set tokens
eos_token = "<|endoftext|>"
pad_token = "<pad>"
tokenizer.pad_token = pad_token

# Instantiate your dataset
dataset = TextDataset(texts, tokenizer)

# Define the ratio for the train and test split (e.g., 80:20)
train_ratio = 0.8

# Calculate the number of samples to include in the train and test sets
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size

# Split the dataset
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create dataloaders for the train and test sets
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [17]:
vocab = tokenizer.get_vocab()
print(f"The tokenizer knows {len(vocab)} tokens.")

The tokenizer knows 50295 tokens.


## Building the model

In [18]:
import torch
from torch import nn

In [19]:
# Assuming df is your DataFrame and "samples" is your column
def count_unique_words(df, column):
    # Split the strings into words, concatenate them and count the unique words
    unique_words = pd.Series(' '.join(df[column]).split()).nunique()
    return unique_words

# Call the function
df = pd.read_parquet("./data/synthetic-textbook-01.parquet")
unique_words = count_unique_words(df, "sample")
print(f"The column 'samples' contains {unique_words} unique words.")

The column 'samples' contains 4774 unique words.


In [20]:
# Hyperparams for phi-small
num_layers = 5 
hidden_dim = 256
mlp_dim = 1024
num_heads = 4
attention_head_dim = 16

# Hyperparams Optimizer
learning_rate = 1e-3
weight_decay = 0.1
warmup_steps = 750
vocab_size = len(vocab) 

class PhiModel(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 hidden_dim: int = 1024,
                 num_heads: int = 16,
                 mlp_dim: int = 4096,
                 num_layers: int = 16,
                 dropout: float = 0.1,
                 activation: str="gelu"):
        super(PhiModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.dropout = nn.Dropout(dropout)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=mlp_dim,
            dropout=dropout,
            activation=activation
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # MLP output layer
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x): 
        x_embed = self.embedding(x)
        x_embed = self.dropout(x_embed)
        out = self.transformer_decoder(x_embed, x_embed)  # Use x_embed as both tgt and memory
        out = self.dropout(out)
        out = self.output_layer(out)
        return out

In [21]:
phi_model = PhiModel(vocab_size=vocab_size, hidden_dim=hidden_dim, num_heads=num_heads, mlp_dim=mlp_dim, num_layers=num_layers)

summary(phi_model)

Layer (type:depth-idx)                                            Param #
PhiModel                                                          --
├─Embedding: 1-1                                                  12,875,520
├─Dropout: 1-2                                                    --
├─TransformerDecoder: 1-3                                         --
│    └─ModuleList: 2-1                                            --
│    │    └─TransformerDecoderLayer: 3-1                          1,053,440
│    │    └─TransformerDecoderLayer: 3-2                          1,053,440
│    │    └─TransformerDecoderLayer: 3-3                          1,053,440
│    │    └─TransformerDecoderLayer: 3-4                          1,053,440
│    │    └─TransformerDecoderLayer: 3-5                          1,053,440
├─Linear: 1-4                                                     12,925,815
Total params: 31,068,535
Trainable params: 31,068,535
Non-trainable params: 0

## Training the model

In [22]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

device = torch.device("cpu")
device

device(type='cpu')

In [23]:
# Instantiate your model
model = PhiModel(vocab_size=vocab_size, hidden_dim=hidden_dim, num_heads=num_heads, mlp_dim=mlp_dim, num_layers=num_layers)

# Move your model to the device
model = model.to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    # Training phase
    model.train()
    log_str = ""  # Initialize the log string for this epoch
    for step, batch in enumerate(train_dataloader):
        print(f"Step {step+1} of {len(train_dataloader)} total steps...")
        # Move your inputs to the device
        inputs, attention_mask = batch
        inputs = inputs.to(device)
        
        targets = inputs[:, 1:].contiguous()  # Shift the inputs to the right to create the targets
        inputs = inputs[:, :-1].contiguous()  # Remove the last token from the inputs. contiguous() makes sure that the tensor is stores in a contiguous block of memeory

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Append the loss to the log string
        log_str += f"Step {step+1} | Loss: {loss.item()}\\n"

    print(f'Epoch {epoch+1}/{num_epochs} | Training Loss: {loss.item()}')
    log_str += f'Epoch {epoch+1}/{num_epochs} | Training Loss: {loss.item()}\\n'

    # Save the model weights
    date_str = datetime.datetime.now().strftime("%Y-%m-%d")
    torch.save(model.state_dict(), f"training/models/phi-model-{date_str}-epoch-{epoch+1}.pt")

    # Testing phase
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            # Move your inputs to the device
            inputs, attention_mask = batch
            inputs = inputs.to(device)

            targets = inputs[:, 1:].contiguous()  # Shift the inputs to the right to create the targets
            inputs = inputs[:, :-1].contiguous()  # Remove the last token from the inputs. contiguous() makes sure that the tensor is stores in a contiguous block of memeory

            # Forward pass
            outputs = model(inputs)

            # Compute loss
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            test_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs} | Test Loss: {test_loss / len(test_dataloader)}')
    log_str += f'Epoch {epoch+1}/{num_epochs} | Test Loss: {test_loss / len(test_dataloader)}\\n'

    # Save the log for this epoch
    with open(f"training/logs/log-{date_str}-epoch-{epoch+1}.txt", "w") as f:
        f.write(log_str)

KeyboardInterrupt: 

In [59]:
random_sentence = torch.tensor(tokenizer.encode("This Python coder does"))
random_sentence

tensor([ 1212, 11361,   269, 12342,   857])

In [60]:
# inputs = torch.randint(0, 100, (10,)) # 10 sequences of length 100 each

outputs = model(random_sentence) # generate a batch of output data from the input data and store it in the memory buffer
outputs

In [61]:
# Convert logits to token ids
token_ids = outputs.argmax(dim=-1)
token_ids

tensor([2438,   11,   13,   11,   11])

In [54]:
# Decode token ids to text
tokenized_outputs = tokenizer.batch_decode(token_ids)
print(tokenized_outputs)

[' code', ',', '.', ',', ',']


In [64]:
def generate(model, start_tokens, eos_token, max_length=512):
    model.eval()  # Set the model to evaluation mode
    generated = list(start_tokens)  # Start with the provided start tokens
    for _ in range(max_length):
        inputs = torch.tensor([generated]).to(device)  # Convert generated sequence to tensor
        with torch.no_grad():  # No need to track gradients
            outputs = model(inputs)  # Generate output
        next_token = outputs[0, -1].argmax(-1).item()  # Get the token with the highest score from the last position
        generated.append(next_token)  # Append the token to the generated sequence
        if next_token == eos_token:  # Stop if end-of-sequence token is generated
            break
    return generated

In [65]:
# Define your input string
input_str = "This Python code"

# Encode the input string to tokens
input_tokens = tokenizer.encode(input_str)

# Generate a sequence of tokens starting with the input tokens
generated_tokens = generate(model, input_tokens, eos_token, max_length=512)

# Decode the generated tokens to text
generated_text = tokenizer.decode(generated_tokens)

print(generated_text)

This Python code snippet above, and `unittest`python






















































































































































































































































































































































































































































































































