In [1]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.2


In [2]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, trainers

# Preprocessing data
def preprocess_data(data: List[Tuple[str, str, str]]) -> Tuple[List[str], List[str]]:
    input_texts = []
    target_texts = []

    for desc, file_structure, code in data:
        input_texts.append(desc)
        target_texts.append(json.dumps({"file_structure": json.loads(file_structure), "code": json.loads(code)}))

    return input_texts, target_texts

# Training tokenizer
def train_tokenizer(input_texts: List[str], target_texts: List[str]) -> Tokenizer:
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

    trainer = trainers.BpeTrainer(vocab_size=30000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.train_from_iterator(input_texts + target_texts, trainer=trainer)

    return tokenizer

# Dataset class
class ProjectGeneratorDataset(Dataset):
    def __init__(self, tokenizer: Tokenizer, input_texts: List[str], target_texts: List[str], block_size: int):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.block_size = block_size

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]

        input_tokens = self.tokenizer.encode(input_text).ids
        target_tokens = self.tokenizer.encode(target_text).ids

        # Truncate or pad input and target tokens
        input_tokens = input_tokens[:self.block_size]
        input_tokens += [0] * (self.block_size - len(input_tokens))

        target_tokens = target_tokens[:self.block_size]
        target_tokens += [0] * (self.block_size - len(target_tokens))

        return torch.tensor(input_tokens, dtype=torch.long), torch.tensor(target_tokens, dtype=torch.long)

# Your dataset
data = [
    ("Create a simple Python script that prints 'Hello, World!' with a separate file for the function",
     '{"root": ["main.py", "utils.py"]}',
     '{"main.py": "from utils import print_hello\\n\\nprint_hello()", "utils.py": "def print_hello():\\n    print(\'Hello, World!\')"}'),

    # Example 1
    ("Create a Python script that reads a CSV file and prints the first column",
     '{"root": ["read_csv.py"]}',
     '{"read_csv.py": "import csv\\n\\nwith open(\'data.csv\', mode=\'r\') as csvfile:\\n    csv_reader = csv.reader(csvfile)\\n    for row in csv_reader:\\n        print(row[0])"}'),

    # Example 2
    ("Create a Python script that calculates the area of a rectangle given its length and width as command-line arguments",
     '{"root": ["area_of_rectangle.py"]}',
     '{"area_of_rectangle.py": "import sys\\n\\ndef calculate_area(length: float, width: float) -> float:\\n    return length * width\\n\\nif __name__ == \'__main__\':\\n    length = float(sys.argv[1])\\n    width = float(sys.argv[2])\\n    area = calculate_area(length, width)\\n    print(f\'Area of the rectangle: {area}\')"}'),

    ("Create a Python script that reads a text file and prints its content",
     '{"root": ["file_reader.py"]}',
     '{"file_reader.py": "with open(\'input.txt\', \'r\') as file:\\n    content = file.read()\\nprint(content)"}'),

    ("Create a Python script that calculates the factorial of a number using a recursive function",
     '{"root": ["factorial.py"]}',
     '{"factorial.py": "def factorial(n):\\n    if n == 0:\\n        return 1\\n    return n * factorial(n - 1)\\n\\nnumber = int(input(\'Enter a number: \'))\\nprint(\'Factorial:\', factorial(number))"}'),

    ("Create a Python script that finds the first 10 prime numbers using a function",
     '{"root": ["prime_numbers.py"]}',
     '{"prime_numbers.py": "def is_prime(n):\\n    if n < 2:\\n        return False\\n    for i in range(2, n):\\n        if n % i == 0:\\n            return False\\n    return True\\n\\nprimes = []\\ncurrent = 2\\nwhile len(primes) < 10:\\n    if is_prime(current):\\n        primes.append(current)\\n    current += 1\\n\\nprint(\'First 10 prime numbers:\', primes)"}'),

    ("Create a Python script that generates Fibonacci sequence up to the specified number using a function",
     '{"root": ["fibonacci.py"]}',
     '{"fibonacci.py": "def fibonacci(n):\\n    fib_sequence = [0, 1]\\n    while fib_sequence[-1] + fib_sequence[-2] < n:\\n        fib_sequence.append(fib_sequence[-1] + fib_sequence[-2])\\n    return fib_sequence\\n\\nnumber = int(input(\'Enter a number: \'))\\nprint(\'Fibonacci sequence up to \', number, \':\', fibonacci(number))"}'),

    # Example 3
    ("Create a Python script that fetches the current weather for a given city using the OpenWeatherMap API",
     '{"root": ["get_weather.py"]}',
     '{"get_weather.py": "import requests\\nimport sys\\n\\ndef get_weather(city: str, api_key: str) -> dict:\\n    url = f\'https://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}\'\\n    response = requests.get(url)\\n    return response.json()\\n\\nif __name__ == \'__main__\':\\n    city = sys.argv[1]\\n    api_key = \'your_api_key_here\'\\n    weather_data = get_weather(city, api_key)\\n    print(weather_data)"}'),

    # ...
]


input_texts, target_texts = preprocess_data(data)
# Write preprocessed data to file
with open("preprocessed.txt", "w") as f:
    for input_text, target_text in zip(input_texts, target_texts):
        f.write(f"Input: {input_text}\n")
        f.write(f"Target: {target_text}\n")
        f.write("\n")
print("Preprocessed data saved to preprocessed.txt")

tokenizer = train_tokenizer(input_texts, target_texts)

block_size = 128
n_embd = 768
n_head = 12
n_layer = 6
dropout = 0.1

# Transformer model
class ProjectGeneratorModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, dropout):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head =        nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, input_tokens):
        token_embeddings = self.token_embedding_table(input_tokens)
        position_ids = torch.arange(input_tokens.shape[1], dtype=torch.long).unsqueeze(0).to(input_tokens.device)
        position_embeddings = self.position_embedding_table(position_ids)

        x = token_embeddings + position_embeddings
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln_1 = nn.LayerNorm(n_embd)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.attn = nn.MultiheadAttention(n_embd, n_head)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.GELU(),
            nn.Linear(n_embd * 4, n_embd),
        )

    def forward(self, x):
        a = self.ln_1(x)
        x = x + self.attn(a, a, a)[0]
        x = x + self.mlp(self.ln_2(x))
        return x

# Model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ProjectGeneratorModel(tokenizer.get_vocab_size(), n_embd, n_head, n_layer, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("<pad>"))

# Training loop
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0

    for input_tokens, target_tokens in dataloader:
        input_tokens, target_tokens = input_tokens.to(device), target_tokens.to(device)

        optimizer.zero_grad()
        logits = model(input_tokens)
        loss = criterion(logits.view(-1, logits.size(-1)), target_tokens.view(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

# Dataset and DataLoader
dataset = ProjectGeneratorDataset(tokenizer, input_texts, target_texts, block_size)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Train the model
n_epochs = 200
for epoch in range(n_epochs):
    loss = train_epoch(model, dataloader, optimizer, criterion, device)
    print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {loss:.4f}")



Preprocessed data saved to preprocessed.txt
Epoch 1/200, Loss: 6.1282
Epoch 2/200, Loss: 4.7948
Epoch 3/200, Loss: 4.7225
Epoch 4/200, Loss: 4.1529
Epoch 5/200, Loss: 4.0425
Epoch 6/200, Loss: 3.9252
Epoch 7/200, Loss: 3.7226
Epoch 8/200, Loss: 3.5363
Epoch 9/200, Loss: 3.4429
Epoch 10/200, Loss: 3.4054
Epoch 11/200, Loss: 3.3171
Epoch 12/200, Loss: 3.1863
Epoch 13/200, Loss: 3.0841
Epoch 14/200, Loss: 3.0233
Epoch 15/200, Loss: 2.9565
Epoch 16/200, Loss: 2.8536
Epoch 17/200, Loss: 2.7305
Epoch 18/200, Loss: 2.6273
Epoch 19/200, Loss: 2.5406
Epoch 20/200, Loss: 2.4359
Epoch 21/200, Loss: 2.3344
Epoch 22/200, Loss: 2.2659
Epoch 23/200, Loss: 2.1917
Epoch 24/200, Loss: 2.1039
Epoch 25/200, Loss: 2.0406
Epoch 26/200, Loss: 1.9888
Epoch 27/200, Loss: 1.9210
Epoch 28/200, Loss: 1.8595
Epoch 29/200, Loss: 1.8211
Epoch 30/200, Loss: 1.7751
Epoch 31/200, Loss: 1.7242
Epoch 32/200, Loss: 1.6908
Epoch 33/200, Loss: 1.6588
Epoch 34/200, Loss: 1.6240
Epoch 35/200, Loss: 1.5997
Epoch 36/200, Loss: 

In [3]:
def generate_project(model, tokenizer, input_text, block_size, device):
    model.eval()
    input_tokens = tokenizer.encode(input_text).ids

    # Truncate or pad input tokens
    input_tokens = input_tokens[:block_size]
    input_tokens += [0] * (block_size - len(input_tokens))

    input_tensor = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(input_tensor)
    predictions = torch.argmax(logits, dim=-1)

    decoded_output = tokenizer.decode(predictions.squeeze().tolist())
    
    try:
        output_data = json.loads(decoded_output)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Decoded output: {decoded_output}")
        output_data = decoded_output

    return output_data

In [4]:
    # Example input description
    input_description = "Create a simple Python script that prints 'Hello, World!'"

    # Generate output
    generated_output = generate_project(model, tokenizer, input_description, block_size, device)
    print(json.dumps(generated_output, indent=2))


Error decoding JSON: Expecting ',' delimiter: line 1 column 48 (char 47)
Decoded output:  {"file_structure": {"root": ["main.py", "code": {"file_ ".py(n):\n\nimportn\n\ndef get_ file:\n: n, file.   )" -> in:\n   ect( =\n\ number_ '))\__('Hello, World_(
" {\"file_structure\": {\"root\": [\"main.py\", \"code\": {\"file_ \".py(n):\\n\\nimportn\\n\\ndef get_ file:\\n: n, file.   )\" -> in:\\n   ect( =\\n\\ number_ '))\\__('Hello, World_("


In [7]:

    # Example input description
    input_description = "Create a Python script that calculates the area of a rectangle given its length and width as command-line arguments"

    # Generate output
    generated_output = generate_project(model, tokenizer, input_description, block_size, device)
    print(json.dumps(generated_output, indent=2))

Error decoding JSON: Expecting ',' delimiter: line 1 column 134 (char 133)
Decoded output:  {"file_structure": {"root": ["area_of_rectangle.py"]}, "code": {"area_of_rectangle.py": "importn\n\ndef get_ file:\n: n, file.   )" -> in:\n   ect( =\n\ number_ '))\__('Hello, World_(
" {\"file_structure\": {\"root\": [\"area_of_rectangle.py\"]}, \"code\": {\"area_of_rectangle.py\": \"importn\\n\\ndef get_ file:\\n: n, file.   )\" -> in:\\n   ect( =\\n\\ number_ '))\\__('Hello, World_("
