In [91]:
import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer

from tqdm.auto import tqdm

import pandas as pd

In [4]:
torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1", trust_remote_code=True)

In [5]:
summary(model)

Layer (type:depth-idx)                                  Param #
PhiForCausalLM                                          --
├─PhiModel: 1-1                                         --
│    └─Embedding: 2-1                                   104,857,600
│    └─Dropout: 2-2                                     --
│    └─ModuleList: 2-3                                  --
│    │    └─PhiDecoderLayer: 3-1                        50,354,176
│    │    └─PhiDecoderLayer: 3-2                        50,354,176
│    │    └─PhiDecoderLayer: 3-3                        50,354,176
│    │    └─PhiDecoderLayer: 3-4                        50,354,176
│    │    └─PhiDecoderLayer: 3-5                        50,354,176
│    │    └─PhiDecoderLayer: 3-6                        50,354,176
│    │    └─PhiDecoderLayer: 3-7                        50,354,176
│    │    └─PhiDecoderLayer: 3-8                        50,354,176
│    │    └─PhiDecoderLayer: 3-9                        50,354,176
│    │    └─PhiDecoderLayer: 

In [14]:
inputs = tokenizer('''
Hello my name is             
''', return_tensors="pt", return_attention_mask=False)
print(inputs)

outputs = model.generate(**inputs, max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[  198, 15496,   616,  1438,   318, 50275,   198]], device='cuda:0')}

Hello my name is             

A:

You can use the following regex:
^[a-zA-Z0-9\s]+$

This will match any string that contains only letters, numbers, spaces, and the beginning of the string.

A:

You can use this regex:
^[a-zA-Z0-9\s]+$

A:

You can use this regex:
^[a-zA-Z0-9\s]+$

A:

You can use this regex:
^[a-zA-Z0-9\s]+$

A:

You can use this regex:
^[a-zA-Z0-9\s]+$

A:

You can use this regex:
^[a-zA-Z0-9\s]+$




## Replicating the textbook dataset creation

### Loading data

According to the original authors, they use three different source datasets:
- A filtered code-language dataset, which is a subset of The Stack and StackOverflow, obtained by
using a language model-based classifier (consisting of about 6B tokens).
- A synthetic textbook dataset consisting of <1B tokens of GPT-3.5 generated Python textbooks.
- A small synthetic exercises dataset consisting of ∼180M tokens of Python exercises and solutions.

In [97]:
from datasets import load_dataset

# Load the dataset in streaming mode
ds = load_dataset("bigcode/the-stack", data_dir="data/python", streaming=True, split="train")

# Initialize a counter
counter = 0

# Iterate over the dataset
dataset = {
    "sample": [],
    "label": [],
    "logprob": []
}

for sample in ds:
    dataset["sample"].append(sample["content"])
    print(sample["content"])
    counter += 1
    if counter >= 100:
        break

#!/usr/bin/python
# -*- coding: utf-8 -*-
# #*** <License> ************************************************************#
# This module is part of the repository CNDB.
#
# This module is licensed under the terms of the BSD 3-Clause License
# <http://www.c-tanzer.at/license/bsd_3c.html>.
# #*** </License> ***********************************************************#

from   _TFL.pyk           import pyk

from   rsclib.HTML_Parse  import tag, Page_Tree
from   rsclib.autosuper   import autosuper
from   spider.common      import Interface, Inet4, Inet6, unroutable
from   spider.common      import WLAN_Config
from   spider.luci        import Version_Mixin

class Status (Page_Tree, Version_Mixin) :
    url          = 'cgi-bin/luci/freifunk/status/status'
    retries      = 2
    timeout      = 10
    html_charset = 'utf-8' # force utf-8 encoding

    wl_names = dict \
        ( ssid    = 'ssid'
        , _bsiid  = 'bssid'
        , channel = 'channel'
        , mode    = 'mode'
        )

    

In [85]:
# Randomly check some examples from the dataset
import random 

random_samples = random.choices(dataset["sample"], k=3)
for i in random_samples: 
    print(i)
    print("-------")

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""The setup script."""

from setuptools import find_packages, setup

test_requirements = [
    "black>=19.10b0",
    "flake8>=3.8.3",
    "flake8-debugger>=3.2.1",
]

dev_requirements = [
    *test_requirements,
    "wheel>=0.34.2",
]

requirements = [
    "cdp-backend[pipeline]==3.0.2",
    "cdp-scrapers[king_county]>=0.3.2",
]

extra_requirements = {
    "test": test_requirements,
    "dev": dev_requirements,
    "all": [
        *requirements,
        *dev_requirements,
    ],
}

setup(
    author="JacksonMaxfield",
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.9",
    ],
    description="Package containing the gather functions for Example.",
    install_requires=requirements,
    license="MIT license",
    long_description_content_type="text/

### Label data with GPT-3.5-turbo

In [4]:
from openai import OpenAI
import os

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")

In [101]:
def gpt_labeling(sample: str, model_type: str="gpt-3.5-turbo-0125"): 
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": """
                You are an AI assistant and your job is to classify. 
                Your job is to determine its educational value for a student whose goal is to learn basic coding concepts. 
            
                Here are the main points if an example is of a bad quality: 
                - Many samples are not self-contained, meaning that they depend on other modules or files that are
                external to the snippet, making them hard to understand without additional context.
                - Typical examples do not involve any meaningful computation, but rather consist of trivial or boil-
                erplate code, such as defining constants, setting parameters, or configuring GUI elements.
                - Samples that do contain algorithmic logic are often buried inside complex or poorly documented
                functions, making them difficult to follow or learn from.
                - The examples are skewed towards certain topics or use cases, resulting in an unbalanced distribution
                of coding concepts and skills across the dataset.
            
                If the educational value is high, return a 1. If the educational value is low, return a 0. 
                Return ONLY a number and nothing else. Otherwise I will NOT process your output!
            """},
            {"role": "user", "content": f"Code example: {sample[:10000]}"},
            {"role": "user", "content": "Classification: "}
        ],
        temperature=0.0,
        logprobs=True,
        logit_bias={15: 1, 16: 1},
        max_tokens=1, 
    )

    return response

In [102]:
for i in tqdm(range(len(dataset["sample"]))): 
    # Label data with GPT-3.5
    response = gpt_labeling(sample=dataset["sample"][i])

    # Get the label from the response
    label = response.choices[0].message.content
    logprobs = response.choices[0].logprobs.content[0].logprob

    # Add the label and prob to the dataset
    dataset["label"].append(int(label))
    dataset["logprob"].append(float(logprobs))

  0%|          | 0/100 [00:00<?, ?it/s]

In [103]:
len(dataset["sample"])

100

In [104]:
len(dataset["label"])

100

In [105]:
len(dataset["logprob"])

100

In [106]:
df = pd.DataFrame(dataset)

In [110]:
df.to_parquet("data/training-subset-labeled.csv", index=False)

In [112]:
df_high_value = df[df["label"] == 1]

### Training a random forest classifier

### Generating the synthetic textbook dataset

### Generating the synthetic excercise dataset

## Building the model

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")