In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset,DatasetDict
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/python-code-instruction-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/thedevastator/python-code-instruction-dataset?dataset_version_number=2...


100%|██████████| 3.88M/3.88M [00:00<00:00, 156MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/thedevastator/python-code-instruction-dataset/versions/2





In [4]:
dataset = load_dataset("csv", data_files="/root/.cache/kagglehub/datasets/thedevastator/python-code-instruction-dataset/versions/2/train.csv")["train"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["instruction", "output"]])

def transform(the_row):
    return {"input": the_row["instruction"] + "\n\n" + the_row["output"]}

dataset = dataset.map(transform)

# Drop the original columns
dataset = dataset.remove_columns(["instruction", "output"])

# 4. Split into train/val/test
splits = dataset.train_test_split(test_size=0.2, seed=42)  # 80/20
test_valid = splits["test"].train_test_split(test_size=0.5, seed=42)  # split 20% into 10/10
final_dataset = DatasetDict({
    "train": splits["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]
})

print(final_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/18612 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input'],
        num_rows: 14889
    })
    validation: Dataset({
        features: ['input'],
        num_rows: 1861
    })
    test: Dataset({
        features: ['input'],
        num_rows: 1862
    })
})


In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    inputs =  tokenizer(examples['input'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

tokenized_datasets = final_dataset.map(tokenize_function, batched=True)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/14889 [00:00<?, ? examples/s]

Map:   0%|          | 0/1861 [00:00<?, ? examples/s]

Map:   0%|          | 0/1862 [00:00<?, ? examples/s]

In [6]:
training_args = TrainingArguments(
    output_dir='results',
    eval_strategy='epoch',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs'
)

In [7]:
import wandb
wandb.init(mode="offline")

  | |_| | '_ \/ _` / _` |  _/ -_)


In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.0334,0.970028


TrainOutput(global_step=3723, training_loss=1.1788945429881605, metrics={'train_runtime': 1098.4169, 'train_samples_per_second': 13.555, 'train_steps_per_second': 3.389, 'total_flos': 972594266112000.0, 'train_loss': 1.1788945429881605, 'epoch': 1.0})

In [9]:
model.save_pretrained("model")
tokenizer.save_pretrained("model")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [11]:
ls -lA

total 3144332
-rw-r--r-- 1 root root 3219765289 Sep  5 17:19 all_files.zip
drwxr-xr-x 4 root root       4096 Sep  3 13:35 [0m[01;34m.config[0m/
drwxr-xr-x 2 root root       4096 Sep  5 16:51 [01;34mlogs[0m/
drwxr-xr-x 2 root root       4096 Sep  5 17:13 [01;34mmodel[0m/
drwxr-xr-x 4 root root       4096 Sep  5 17:09 [01;34mresults[0m/
drwxr-xr-x 1 root root       4096 Sep  3 13:36 [01;34msample_data[0m/
drwxr-xr-x 3 root root       4096 Sep  5 16:51 [01;34mwandb[0m/


In [16]:
ls -lA ./model

total 490828
-rw-r--r-- 1 root root       874 Sep  5 17:13 config.json
-rw-r--r-- 1 root root       119 Sep  5 17:13 generation_config.json
-rw-r--r-- 1 root root    456318 Sep  5 17:13 merges.txt
-rw-r--r-- 1 root root 497774208 Sep  5 17:13 model.safetensors
-rw-r--r-- 1 root root       131 Sep  5 17:13 special_tokens_map.json
-rw-r--r-- 1 root root       507 Sep  5 17:13 tokenizer_config.json
-rw-r--r-- 1 root root   3557957 Sep  5 17:13 tokenizer.json
-rw-r--r-- 1 root root    798156 Sep  5 17:13 vocab.json


In [17]:
!zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/tokenizer.json (deflated 82%)
  adding: model/model.safetensors (deflated 7%)
  adding: model/generation_config.json (deflated 24%)
  adding: model/merges.txt (deflated 53%)
  adding: model/config.json (deflated 51%)
  adding: model/special_tokens_map.json (deflated 60%)
  adding: model/vocab.json (deflated 59%)
  adding: model/tokenizer_config.json (deflated 54%)


In [18]:
ls -lA

total 452720
drwxr-xr-x 4 root root      4096 Sep  3 13:35 [0m[01;34m.config[0m/
drwxr-xr-x 2 root root      4096 Sep  5 17:32 [01;34m.ipynb_checkpoints[0m/
drwxr-xr-x 2 root root      4096 Sep  5 16:51 [01;34mlogs[0m/
drwxr-xr-x 2 root root      4096 Sep  5 17:13 [01;34mmodel[0m/
-rw-r--r-- 1 root root 463555912 Sep  5 17:32 model.zip
drwxr-xr-x 4 root root      4096 Sep  5 17:09 [01;34mresults[0m/
drwxr-xr-x 1 root root      4096 Sep  3 13:36 [01;34msample_data[0m/
drwxr-xr-x 3 root root      4096 Sep  5 16:51 [01;34mwandb[0m/
