In [13]:
import warnings, torch, pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType

warnings.filterwarnings("ignore", message="fan_in_fan_out is set to False.*")

# ------------------------------------------------------------------
# 1.  Load TSV
# ------------------------------------------------------------------
def load_tsv_data(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path, sep='\t', header=None)
    df = df.iloc[1:, :2]          # drop header row + keep 2 cols
    df.columns = ['pseudo_code', 'code']
    return df.dropna()

file_path = "/kaggle/input/code-dataset/spoc-train.tsv"   # <— change if needed
df = load_tsv_data(file_path)

def create_prompt(row):
    return f"""Translate the following pseudo-code to working code:
Pseudo-code: {row['pseudo_code']}
Code: {row['code']}
"""

df['text'] = df.apply(create_prompt, axis=1)
df = df.sample(n=50_000, random_state=42)          # <-- NEW


# ------------------------------------------------------------------
# 2.  80 / 10 / 10 split
# ------------------------------------------------------------------
train_df, temp_df = train_test_split(df, test_size=0.20, random_state=42, shuffle=True)
val_df, test_df   = train_test_split(temp_df, test_size=0.50, random_state=42, shuffle=True)

print(f"Train: {len(train_df)}  |  Val: {len(val_df)}  |  Test: {len(test_df)}")

# ------------------------------------------------------------------
# 3.  Tokeniser & model
# ------------------------------------------------------------------
model_name = "openai-community/gpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ------------------------------------------------------------------
# 4.  Tokenise datasets
# ------------------------------------------------------------------
block_size = 256

def tokenise(examples):
    tok = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=block_size,
        return_special_tokens_mask=False,
    )
    tok["labels"] = tok["input_ids"].copy()
    return tok

raw_ds = DatasetDict({
    "train":     Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test":      Dataset.from_pandas(test_df,  preserve_index=False),
})

tokenized_ds = raw_ds.map(
    tokenise,
    batched=True,
    remove_columns=raw_ds["train"].column_names,
    desc="Tokenising",
)

# ------------------------------------------------------------------
# 5.  Trainer
# ------------------------------------------------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

args = TrainingArguments(
    output_dir="./gpt2-pseudo-code-translator",
    eval_strategy="steps",
    eval_steps=200,       
    save_strategy="steps",
    save_steps=500,
    learning_rate=1e-4,
     per_device_train_batch_size=16,   # ↑ from 2
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,    # ← remove acc
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    dataloader_pin_memory=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()
trainer.save_model("/kaggle/working/gpt2-pseudo-code-translator-final")

  df = pd.read_csv(file_path, sep='\t', header=None)


Train: 40000  |  Val: 5000  |  Test: 5000
trainable params: 2,359,296 || all params: 126,799,104 || trainable%: 1.8607


Tokenising:   0%|          | 0/40000 [00:00<?, ? examples/s]

Tokenising:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenising:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss,Validation Loss
200,0.5152,0.885828
400,0.4329,0.796761
600,0.3983,0.758671
800,0.3832,0.734461
1000,0.3692,0.721284
1200,0.376,0.712118
1400,0.3802,0.70547
1600,0.3707,0.698094
1800,0.3698,0.693501
2000,0.3745,0.690403




In [27]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# ------------------------------------------------------------------
# 1.  Load base GPT-2 + tokenizer
# ------------------------------------------------------------------
base_model = "openai-community/gpt2"
tokenizer  = AutoTokenizer.from_pretrained(base_model)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# ------------------------------------------------------------------
# 2.  Load the LoRA checkpoint you trained
# ------------------------------------------------------------------
lora_path = "/kaggle/working/gpt2-pseudo-code-translator-final"  # <- your save dir
model = PeftModel.from_pretrained(base, lora_path)
model.eval()

# ------------------------------------------------------------------
# 3.  Generation helper (unchanged except model->model)
# ------------------------------------------------------------------
def generate_code(pseudo_code, model=model, tokenizer=tokenizer, max_length=256):
    prompt = f"Translate the following pseudo-code to working code:\n\nPseudo-code: {pseudo_code}\n\nCode:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Code:")[-1].strip() if "Code:" in generated_text else generated_text

# ------------------------------------------------------------------
# 4.  Quick test
# ------------------------------------------------------------------
if __name__ == "__main__":
    test_pseudo = "print hello world"
    print("Generated code:", generate_code(test_pseudo))

Generated code: cout << "Hello!" << endl;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;
;


In [30]:
# ---------------------------------
# 1.  create a ZIP of the whole folder
# ---------------------------------
import shutil, os
zip_path = "/kaggle/working/gpt2-pseudo-code-translator-final.zip"
shutil.make_archive(zip_path.replace(".zip",""), 'zip', "/kaggle/working/gpt2-pseudo-code-translator-final")



'/kaggle/working/gpt2-pseudo-code-translator-final.zip'