In [4]:
import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer

from tqdm.auto import tqdm

import pandas as pd

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import math

import random
from utils import save_data, load_data

## Inspecting the Phi-1 model

In [10]:
torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1", trust_remote_code=True)

  return self.fget.__get__(instance, owner)()


In [11]:
summary(model)

Layer (type:depth-idx)                                  Param #
PhiForCausalLM                                          --
├─PhiModel: 1-1                                         --
│    └─Embedding: 2-1                                   104,857,600
│    └─Dropout: 2-2                                     --
│    └─ModuleList: 2-3                                  --
│    │    └─PhiDecoderLayer: 3-1                        50,354,176
│    │    └─PhiDecoderLayer: 3-2                        50,354,176
│    │    └─PhiDecoderLayer: 3-3                        50,354,176
│    │    └─PhiDecoderLayer: 3-4                        50,354,176
│    │    └─PhiDecoderLayer: 3-5                        50,354,176
│    │    └─PhiDecoderLayer: 3-6                        50,354,176
│    │    └─PhiDecoderLayer: 3-7                        50,354,176
│    │    └─PhiDecoderLayer: 3-8                        50,354,176
│    │    └─PhiDecoderLayer: 3-9                        50,354,176
│    │    └─PhiDecoderLayer: 

In [None]:
inputs = tokenizer('''
Hello my name is             
''', return_tensors="pt", return_attention_mask=False)
print(inputs)

outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

## Replicating the textbook dataset creation

### Loading data

According to the original authors, they use three different source datasets:
- A filtered code-language dataset, which is a subset of The Stack and StackOverflow, obtained by
using a language model-based classifier (consisting of about 6B tokens).
- A synthetic textbook dataset consisting of <1B tokens of GPT-3.5 generated Python textbooks.
- A small synthetic exercises dataset consisting of ∼180M tokens of Python exercises and solutions.

In [2]:
from datasets import load_dataset

# Load the dataset in streaming mode
ds = load_dataset("bigcode/the-stack", data_dir="data/python", streaming=True, split="train")

# Initialize a counter
counter = 0

# Iterate over the dataset
dataset = {
    "sample": [],
    "label": [],
    "logprob": []
}

for sample in ds:
    dataset["sample"].append(sample["content"])
    counter += 1
    if counter >= 1000:
        break

Downloading readme:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

In [None]:
# Randomly check some examples from the dataset
import random 

random_samples = random.choices(dataset["sample"], k=3)
for i in random_samples: 
    print(i)
    print("-------")

### Label data with GPT-3.5-turbo

In [3]:
from openai import OpenAI
import os

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
def gpt_labeling(sample: str, model_type: str="gpt-3.5-turbo-0125"): 
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": """
                You are an AI assistant and your job is to classify. 
                Your job is to determine its educational value for a student whose goal is to learn basic coding concepts. 
            
                Here are the main points if an example is of a bad quality: 
                - Many samples are not self-contained, meaning that they depend on other modules or files that are
                external to the snippet, making them hard to understand without additional context.
                - Typical examples do not involve any meaningful computation, but rather consist of trivial or boil-
                erplate code, such as defining constants, setting parameters, or configuring GUI elements.
                - Samples that do contain algorithmic logic are often buried inside complex or poorly documented
                functions, making them difficult to follow or learn from.
                - The examples are skewed towards certain topics or use cases, resulting in an unbalanced distribution
                of coding concepts and skills across the dataset.
            
                If the educational value is high, return a 1. If the educational value is low, return a 0. 
                Return ONLY a number and nothing else. Otherwise I will NOT process your output!
            """},
            {"role": "user", "content": f"Code example: {sample[:10000]}"},
            {"role": "user", "content": "Classification: "}
        ],
        temperature=0.0,
        logprobs=True,
        logit_bias={15: 1, 16: 1},
        max_tokens=1, 
    )

    return response

In [5]:
for i in tqdm(range(len(dataset["sample"]))): 
    # Label data with GPT-3.5
    response = gpt_labeling(sample=dataset["sample"][i])

    # Get the label from the response
    label = response.choices[0].message.content
    logprobs = response.choices[0].logprobs.content[0].logprob

    # Add the label and prob to the dataset
    dataset["label"].append(int(label))
    dataset["logprob"].append(float(logprobs))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [12]:
training_df = pd.DataFrame(dataset)

In [13]:
# Check label distribution
training_df["label"].value_counts()

0    564
1    436
Name: label, dtype: int64

In [15]:
save_data(training_df, "./data/training-subset-labeled-1000.parquet")

### Training a random forest classifier

In [35]:
df = load_data("./data/training-subset-labeled.parquet")

X = df["sample"].tolist()
y = df["label"].tolist()

In [36]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("microsoft/codebert-base")

X_embedded = model.encode(X)

No sentence-transformers model found with name microsoft/codebert-base. Creating a new one with MEAN pooling.


In [37]:
# import requests
# import os 

# hf_key = os.getenv("HF_API_KEY")

# API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small-v2"
# headers = {"Authorization": f"Bearer {hf_key}"}

# def query(texts):
#     response = requests.post(API_URL, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
#     return response.json()
	
# X_embedded = query(texts=X)

In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.2, random_state=42)

In [39]:
X_train[0]

array([-5.07316470e-01,  2.28404537e-01,  3.19959611e-01, -8.99160653e-02,
       -3.03865075e-01, -7.22756386e-01, -1.73169514e-03,  4.02829677e-01,
        3.07657182e-01,  4.90501344e-01, -2.62190342e-01,  8.52692842e-01,
       -2.67546445e-01, -3.12847883e-01,  8.49323392e-01, -2.10826129e-01,
        2.10718408e-01,  4.13056642e-01, -1.18505880e-02, -1.34410530e-01,
       -2.42161959e-01, -2.29697570e-01,  6.18695974e-01, -7.99583077e-01,
        3.46429884e-01,  4.40871119e-01, -6.44949675e-02,  7.53669143e-01,
       -5.93237162e-01,  9.20362294e-01, -2.09415555e-01,  2.70892143e-01,
        1.42405534e+00,  1.58426315e-01,  5.43486834e-01, -4.18687552e-01,
       -5.12024939e-01,  2.20933735e-01,  1.29838452e-01, -4.68540162e-01,
       -1.08978644e-01,  5.91604114e-01, -9.44795370e-01, -2.06896085e-02,
        4.49479342e-01,  3.86463046e-01,  5.64160347e-01, -2.75487095e-01,
        5.91123551e-02,  6.77372217e-01,  5.91932654e-01,  2.54811972e-01,
       -6.32566094e-01, -

In [40]:
y_train[:3]

[0, 0, 0]

In [41]:
from sklearn.linear_model import LogisticRegression

In [44]:
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)
print(y_pred)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
Accuracy: 0.8


### Generating the synthetic textbook dataset

In [16]:
from openai import OpenAI
import os

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [28]:
def gpt_data_generation(topic: str, model_type: str="gpt-3.5-turbo-0125"): 
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": """
                You are an expert on Python and an author of Python textbooks. 
                Your job is to create snippets of Python code with detailed English explanations. 
                The explanation should be at least five to eight sentences and be places above and below the code. 
                Ensure that all the code is of a very high quality and doesn't involve repetitive examples.
                Include comments in the generated code.
            """},
            {"role": "user", "content": f"The code and text should be about {topic}"},
            {"role": "user", "content": "Textbook snippet: "}
        ],
        temperature=0.4,
        max_tokens=512, 
    )

    return response.choices[0].message.content

In [29]:
topics = [
    "Introduction to Python: Basic syntax, variables, and data types",
    "Control Flow: Conditional statements (if, elif, else) and loops (for, while)",
    "Functions: Defining and calling functions, parameter passing",
    "Data Structures: Lists, tuples, dictionaries, sets",
    "File Handling: Reading from and writing to files",
    "Exception Handling: Handling errors and exceptions gracefully",
    "Object-Oriented Programming (OOP): Classes, objects, inheritance, polymorphism",
    "Modules and Packages: Importing and using external libraries",
    "String Manipulation: String methods, formatting, regular expressions",
    "Working with Dates and Times: Date objects, timedelta, formatting dates",
    "Input/Output: User input, output formatting",
    "List Comprehensions: Concise way to create lists",
    "Generators and Iterators: Iterable objects, yield statement",
    "Recursion: Functions calling themselves, solving problems recursively",
    "Functional Programming: Lambda functions, map, filter, reduce",
    "Debugging Techniques: Using print statements, debugging tools",
    "Testing: Writing and running tests using unittest or pytest",
    "Web Scraping: Extracting data from websites using libraries like BeautifulSoup",
    "GUI Programming: Creating graphical user interfaces with Tkinter or PyQt",
    "Data Visualization: Creating charts, graphs, and plots with libraries like Matplotlib or Seaborn",
    "NumPy: Introduction to numerical computing in Python",
    "Pandas: Data manipulation and analysis library for Python",
    "PyTorch: Deep learning framework for building and training neural networks"
]


In [33]:
datapoints = []
for i in tqdm(range(250)): 
    random_topic = random.choice(topics)

    # Generate a textbook snippet using GPT-3.5
    textbook_page = gpt_data_generation(topic=random_topic)
    datapoints.append(textbook_page)

  0%|          | 0/250 [00:00<?, ?it/s]

In [36]:
textbook_df = pd.DataFrame({"sample": datapoints})

In [39]:
save_data(textbook_df, "./data/synthetic-textbook-01.parquet")

### Concatinating the datasets

In [25]:
df = pd.read_parquet("./data/synthetic-textbook-01.parquet")
texts = df["sample"].tolist()

# Add top token to all texts
texts = [t + " <|endoftext|>" for t in texts]
print(texts)

["PyTorch is a popular deep learning framework that provides a flexible platform for building and training neural networks. It offers dynamic computation graphs, which allow for easy debugging and efficient model training. One of the key features of PyTorch is its seamless integration with NumPy, making it easy to convert NumPy arrays to PyTorch tensors and vice versa. PyTorch also supports GPU acceleration, enabling faster computation for training large neural networks. Additionally, PyTorch provides a rich set of tools and libraries for tasks such as data loading, model optimization, and visualization, making it a comprehensive framework for deep learning research and applications.\n\n```python\n# Importing the necessary libraries\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\n\n# Define a simple neural network using PyTorch\nclass SimpleNN(nn.Module):\n    def __init__(self):\n        super(SimpleNN, self).__init__()\n        self.fc = nn.Linear(10, 1)  # Fully c

In [26]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()

# Instantiate your tokenizer (replace 'tokenizer' with your actual tokenizer)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

# Set tokens
eos_token = "<|endoftext|>"
pad_token = "<pad>"
tokenizer.pad_token = pad_token

# Instantiate your dataset
dataset = TextDataset(texts, tokenizer)

# Create a dataloader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


## Building the model

In [27]:
import torch
from torch import nn

In [28]:
# Assuming df is your DataFrame and "samples" is your column
def count_unique_words(df, column):
    # Split the strings into words, concatenate them and count the unique words
    unique_words = pd.Series(' '.join(df[column]).split()).nunique()
    return unique_words

# Call the function
df = pd.read_parquet("./data/synthetic-textbook-01.parquet")
unique_words = count_unique_words(df, "sample")
print(f"The column 'samples' contains {unique_words} unique words.")

The column 'samples' contains 4774 unique words.


In [29]:
# Hyperparams for phi-small
num_layers = 5 
hidden_dim = 256
mlp_dim = 1024
num_heads = 4
attention_head_dim = 16

# Hyperparams Optimizer
learning_rate = 1e-3
weight_decay = 0.1
warmup_steps = 750
vocab_size = unique_words # set this

class PhiModel(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 hidden_dim: int = 1024,
                 num_heads: int = 16,
                 mlp_dim: int = 4096,
                 num_layers: int = 16,
                 dropout: float = 0.1,
                 activation: str="gelu"):
        super(PhiModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.dropout = nn.Dropout(dropout)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=mlp_dim,
            dropout=dropout,
            activation=activation
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # MLP output layer
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x): 
        x_embed = self.embedding(x)
        x_embed = self.dropout(x_embed)
        out = self.transformer_decoder(x_embed, x_embed)  # Use x_embed as both tgt and memory
        out = self.dropout(out)
        out = self.output_layer(out)
        return out

    def generate(model, x, start_token, max_length=512):
        model.eval()  # Set the model to evaluation mode
        generated = [start_token]  # Start with the start-of-sequence token
        for _ in range(max_length):
            inputs = torch.tensor([generated]).to(x.device)  # Convert generated sequence to tensor
            with torch.no_grad():  # No need to track gradients
                outputs = model(inputs)  # Generate output
            next_token = outputs.argmax(-1).item()  # Get the token with the highest score
            generated.append(next_token)  # Append the token to the generated sequence
            if next_token == eos_token:  # Stop if end-of-sequence token is generated
                break
        return generated


In [30]:
phi_model = PhiModel(vocab_size=vocab_size, hidden_dim=hidden_dim, num_heads=num_heads, mlp_dim=mlp_dim, num_layers=num_layers)

summary(phi_model)

Layer (type:depth-idx)                                            Param #
PhiModel                                                          --
├─Embedding: 1-1                                                  1,222,144
├─Dropout: 1-2                                                    --
├─TransformerDecoder: 1-3                                         --
│    └─ModuleList: 2-1                                            --
│    │    └─TransformerDecoderLayer: 3-1                          1,053,440
│    │    └─TransformerDecoderLayer: 3-2                          1,053,440
│    │    └─TransformerDecoderLayer: 3-3                          1,053,440
│    │    └─TransformerDecoderLayer: 3-4                          1,053,440
│    │    └─TransformerDecoderLayer: 3-5                          1,053,440
├─Linear: 1-4                                                     1,226,918
Total params: 7,716,262
Trainable params: 7,716,262
Non-trainable params: 0

In [33]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [37]:
# Instantiate your model
model = PhiModel(vocab_size=vocab_size, hidden_dim=hidden_dim, num_heads=num_heads, mlp_dim=mlp_dim, num_layers=num_layers)

# Move your model to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    for batch in dataloader:
        # Move your inputs to the device
        inputs, attention_mask = batch
        inputs = inputs.to(device)
        # targets = inputs[:, 1:].contiguous()  # Shift the inputs to the right to create the targets
        # inputs = inputs[:, :-1].contiguous()  # Remove the last token from the inputs. contiguous() makes sure that the tensor is stores in a contiguous block of memeory

        # Forward pass
        loss = model(inputs)

        # Compute loss
        #loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [32]:
inputs = torch.randint(0, 100, (10,)) # 10 sequences of length 100 each
memory = torch.zeros((1, hidden_dim), dtype=torch.float) # create a memory buffer with size 1 and hidden_dim
outputs = model(inputs) # generate a batch of output data from the input data and store it in the memory buffer

TypeError: forward() got an unexpected keyword argument 'memory'

In [17]:
print(outputs)

tensor([[[-6.3668e-02,  1.7865e-01, -1.1382e+00,  ..., -1.4692e+00,
           1.4502e+00,  2.6285e-01],
         [ 1.4431e-02,  6.2598e-01, -1.3159e+00,  ..., -1.2661e+00,
           1.1853e+00, -1.7711e-01],
         [-1.7979e-01,  7.8707e-01, -1.3020e+00,  ..., -1.4371e+00,
           1.1514e+00, -1.2720e-01],
         ...,
         [ 1.6945e-01,  5.4839e-01, -1.0510e+00,  ..., -1.2968e+00,
           1.0868e+00, -3.9867e-02],
         [-7.5416e-01,  5.9857e-01, -1.2969e+00,  ..., -1.7847e+00,
           1.2358e+00, -1.8738e-01],
         [-1.0361e-01,  4.3842e-01, -1.2048e+00,  ..., -1.3742e+00,
           1.4077e+00, -1.7767e-01]],

        [[ 4.2880e-01,  5.0112e-01, -4.1016e-01,  ..., -4.1959e-02,
          -8.1995e-02, -6.8570e-01],
         [ 8.1349e-02,  2.3530e-01, -2.9831e-01,  ..., -2.1917e-01,
          -2.0951e-02, -6.4014e-01],
         [ 8.8203e-02,  3.3912e-01, -3.3997e-01,  ..., -2.8221e-03,
          -1.3763e-01, -4.8870e-01],
         ...,
         [ 3.9257e-01,  4

In [19]:
# Convert logits to token ids
token_ids = outputs.argmax(dim=-1)
token_ids

tensor([[98, 22, 22,  ..., 22, 22, 22],
        [22, 22, 22,  ..., 84, 22, 22],
        [22, 22, 50,  ..., 22, 26, 84],
        ...,
        [50, 50, 50,  ...,  9, 50, 50],
        [ 9,  9,  9,  ...,  9,  9,  9],
        [94, 17, 55,  ..., 55, 85, 17]])

In [20]:
# Decode token ids to text
tokenized_outputs = tokenizer.batch_decode(token_ids)
print(tokenized_outputs)



## Training the model 