**Install Required Libraries**

In [None]:
!pip install -q transformers datasets evaluate sacrebleu gradio


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import evaluate
import torch


***Upload Dataset***

In [None]:
from google.colab import files
uploaded = files.upload()


Saving spoc-train-eval.tsv to spoc-train-eval.tsv
Saving spoc-train-test.tsv to spoc-train-test.tsv
Saving spoc-train-train.tsv to spoc-train-train.tsv


In [None]:
!ls /content


sample_data  spoc-train-eval.tsv  spoc-train-test.tsv  spoc-train-train.tsv


***Load TSV into HuggingFace dataset***

In [None]:
dataset = load_dataset("csv",
                       data_files={
                           "train": "/content/spoc-train-train.tsv",
                           "validation": "/content/spoc-train-eval.tsv",
                           "test": "/content/spoc-train-test.tsv"
                       },
                       delimiter="\t")

# Check dataset structure
print(dataset)
print(dataset["train"][0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'],
        num_rows: 246086
    })
    validation: Dataset({
        features: ['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'],
        num_rows: 27288
    })
    test: Dataset({
        features: ['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'],
        num_rows: 20480
    })
})
{'text': None, 'code': 'int main() {', 'workerid': 1, 'probid': '3A', 'subid': 41470897, 'line': 0, 'indent': 0}


In [None]:
print(dataset["train"].column_names)


['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent']


***Step 3: Tokenization and formatting***

In [None]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no default pad token

def preprocess_function(examples):
    # 'text' = pseudo-code, 'code' = target Python code
    inputs = [f"### PSEUDOCODE:\n{p}\n### CODE:" for p in examples["text"]]
    targets = [f" {c}" for c in examples["code"]]
    full_texts = [i + t for i, t in zip(inputs, targets)]
    return tokenizer(full_texts, truncation=True, padding="max_length", max_length=256)

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names  # remove all original columns
)

print(tokenized_datasets)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/246086 [00:00<?, ? examples/s]

Map:   0%|          | 0/27288 [00:00<?, ? examples/s]

Map:   0%|          | 0/20480 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 246086
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 27288
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 20480
    })
})


In [None]:
!pip install -q --upgrade transformers


***Step 4: Fine-tune GPT-2***

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

model_name = "distilgpt2"  # smaller and faster
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="/content/gpt2-pseudocode",
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    fp16=True,  # mixed precision
    logging_dir="/content/logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(5000)),   # small subset
    eval_dataset=tokenized_datasets["validation"].select(range(1000)),
    data_collator=data_collator,
)

trainer.train()


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,0.6338,1.039639


TrainOutput(global_step=626, training_loss=0.8091517728738511, metrics={'train_runtime': 237.1084, 'train_samples_per_second': 42.175, 'train_steps_per_second': 2.64, 'total_flos': 653241876480000.0, 'train_loss': 0.8091517728738511, 'epoch': 2.0})

In [None]:
import os

model_dir = "/content/gpt2-pseudocode"  # <-- replace if your folder is different
os.listdir(model_dir)


['checkpoint-626']

In [None]:
checkpoint_dir = "/content/gpt2-pseudocode/checkpoint-626"
os.listdir(checkpoint_dir)


['merges.txt',
 'rng_state.pth',
 'trainer_state.json',
 'training_args.bin',
 'config.json',
 'generation_config.json',
 'scaler.pt',
 'vocab.json',
 'special_tokens_map.json',
 'model.safetensors',
 'optimizer.pt',
 'scheduler.pt',
 'tokenizer_config.json']

In [None]:
from google.colab import files

for file_name in os.listdir(checkpoint_dir):
    files.download(os.path.join(checkpoint_dir, file_name))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

***Step 5: Evaluation***

In [None]:
# Load evaluation metrics
bleu = evaluate.load("bleu")
sacrebleu = evaluate.load("sacrebleu")

try:
    from codebleu import calc_codebleu
    codebleu_available = True
except:
    codebleu_available = False
    print("CodeBLEU not available, install with: pip install codebleu")

def evaluate_model(model, tokenizer, test_dataset, num_samples=100):
    """Comprehensive model evaluation"""
    model.eval()
    predictions = []
    references = []

    # Generate predictions
    for i in range(min(num_samples, len(test_dataset))):
        # Extract pseudo-code
        pseudo_code = dataset["test"][i]["text"]

        # Generate code
        generated_code = generate_code(pseudo_code, model, tokenizer)

        # Get reference
        reference_code = dataset["test"][i]["code"]

        predictions.append(generated_code)
        references.append([reference_code])  # BLEU expects list of references

    # Calculate BLEU score
    bleu_result = bleu.compute(predictions=predictions, references=references)
    sacrebleu_result = sacrebleu.compute(predictions=predictions, references=references)

    print(f"BLEU Score: {bleu_result['bleu']:.4f}")
    print(f"SacreBLEU Score: {sacrebleu_result['score']:.4f}")

    # Calculate CodeBLEU if available
    if codebleu_available and len(predictions) > 0:
        try:
            codebleu_result = calc_codebleu(
                references=[[ref[0]] for ref in references],  # Format for codebleu
                predictions=predictions,
                lang="python"
            )
            print(f"CodeBLEU Score: {codebleu_result['codebleu']:.4f}")
        except:
            print("CodeBLEU calculation failed")

    return predictions, references, bleu_result, sacrebleu_result

# Enhanced generation function
def generate_code(pseudo_code, model, tokenizer, max_new_tokens=128, temperature=0.7):
    """Generate Python code from pseudo-code"""
    prompt = f"Translate pseudo-code to Python:\nPseudo-code: {pseudo_code}\nPython code:"

    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            early_stopping=True
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated code part
    if "Python code:" in generated_text:
        generated_code = generated_text.split("Python code:")[-1].strip()
    else:
        generated_code = generated_text.replace(prompt, "").strip()

    return generated_code

# Run evaluation
print("Evaluating model...")
predictions, references, bleu_results, sacrebleu_results = evaluate_model(
    model, tokenizer, dataset["test"]
)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


CodeBLEU not available, install with: pip install codebleu
Evaluating model...
BLEU Score: 0.0263
SacreBLEU Score: 2.6278


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# --- Load tokenizer and fine-tuned model ---
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # important for decoder-only models

model = GPT2LMHeadModel.from_pretrained("/content/gpt2-pseudocode")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# --- Function to generate code from pseudo-code ---
def generate_code(pseudo_code, max_new_tokens=128):
    # Prepare input prompt
    prompt = f"### PSEUDOCODE:\n{pseudo_code}\n### CODE:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate model output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,           # set to True for varied outputs
            top_p=0.95,               # nucleus sampling
            temperature=0.8
        )

    # Decode generated tokens
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the generated code after the prompt
    return generated_code.split("### CODE:")[-1].strip()

# --- Test examples ---
pseudo_examples = [
    "Create a function to add two numbers",
    "Write a Python function to check if a number is prime",
    "Sort a list of integers in ascending order using bubble sort"
]

for i, pc in enumerate(pseudo_examples, 1):
    print(f"\nExample {i} - Pseudo-code:\n{pc}")
    code = generate_code(pc)
    print(f"Generated Python Code:\n{code}")


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /content/gpt2-pseudocode.

In [None]:
!mkdir -p /content/gpt2-pseudocode


In [None]:
trainer.save_model("/content/gpt2-pseudocode")  # saves model weights + config
tokenizer.save_pretrained("/content/gpt2-pseudocode")  # saves tokenizer


('/content/gpt2-pseudocode/tokenizer_config.json',
 '/content/gpt2-pseudocode/special_tokens_map.json',
 '/content/gpt2-pseudocode/vocab.json',
 '/content/gpt2-pseudocode/merges.txt',
 '/content/gpt2-pseudocode/added_tokens.json')

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("/content/gpt2-pseudocode")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = GPT2LMHeadModel.from_pretrained("/content/gpt2-pseudocode")
model.to("cuda" if torch.cuda.is_available() else "cpu")


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# --- Load tokenizer and model ---
model_path = "/content/gpt2-pseudocode"  # path where you saved the trained model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # important for decoder-only models

model = GPT2LMHeadModel.from_pretrained(model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# --- Function to generate code from pseudo-code ---
def generate_code(pseudo_code, max_new_tokens=128):
    prompt = f"### PSEUDOCODE:\n{pseudo_code}\n### CODE:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,          # for more diverse outputs
            temperature=0.7,         # adjust creativity
            top_p=0.9
        )

    # Decode and return
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # remove the prompt from output
    return decoded.replace(prompt, "").strip()

# --- Example test ---
pseudo_code_example = """
Define a function add_numbers that takes x and y
Return the sum of x and y
"""

generated_code = generate_code(pseudo_code_example)
print("Generated Python code:\n", generated_code)


Generated Python code:
 a = x, y = y; a = y; } else { a = y; } else { a = y; } else { a = y; } else { a = y; } } } else { a = y; } } } else { a = y; } } else { a = y; } } } else { a = y; } } } else { a = y; } } else { a = y; } } } } } else { a = y; } } else { a = y; } } else { a = y; } } } else { a = y;


In [None]:
def preprocess_function(examples):
    inputs = [f"### PSEUDOCODE:\n{p}\n### CODE:" for p in examples["text"]]  # 'text' column has pseudo-code
    targets = [f" {c}" for c in examples["code"]]
    full_texts = [i + t for i, t in zip(inputs, targets)]
    return tokenizer(full_texts, truncation=True, padding="max_length", max_length=256)

# Example: tokenized_datasets is a DatasetDict with train, validation, test
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/246086 [00:00<?, ? examples/s]

Map:   0%|          | 0/27288 [00:00<?, ? examples/s]

Map:   0%|          | 0/20480 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Use smaller batch size if GPU memory is limited, but accumulate gradients to simulate larger batch
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="/content/gpt2-pseudocode",
    eval_strategy="no",                 # skip evaluation during training to save time
    learning_rate=5e-5,
    per_device_train_batch_size=4,      # increase if GPU allows
    per_device_eval_batch_size=4,       # increase if GPU allows
    gradient_accumulation_steps=4,      # simulates larger batch
    num_train_epochs=1,                 # only 2 epochs
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="/content/logs",
    fp16=True,                          # use mixed precision for faster training
    dataloader_num_workers=2,           # use multiple CPU threads for loading data
    report_to="none"                    # disable extra logging for speed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)


In [None]:
# Start training
trainer.train()


Step,Training Loss
500,0.4846
1000,0.5409
1500,0.536
2000,0.5307
2500,0.534
3000,0.5221
3500,0.5188
4000,0.5116
4500,0.5054
5000,0.495


Step,Training Loss
500,0.4846
1000,0.5409
1500,0.536
2000,0.5307
2500,0.534
3000,0.5221
3500,0.5188
4000,0.5116
4500,0.5054
5000,0.495


TrainOutput(global_step=15381, training_loss=0.5034726279724807, metrics={'train_runtime': 9745.6191, 'train_samples_per_second': 25.251, 'train_steps_per_second': 1.578, 'total_flos': 3.2150155493376e+16, 'train_loss': 0.5034726279724807, 'epoch': 1.0})