# Training

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing visible in this notebook reads from or writes to the mount —
# confirm it is still needed (e.g. for saving checkpoints) before keeping this cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import locale
locale.getpreferredencoding = lambda: "UTF-8" # this is needed to get rid of weird colab locale error
# if you are still running into issues, please restart the runtime to initialize a new environment

!wget https://zenodo.org/records/7908468/files/python.zip
!unzip python.zip
!gzip -d python/final/jsonl/train/python_train_0.jsonl.gz

--2025-02-11 03:20:28--  https://zenodo.org/records/7908468/files/python.zip
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/octet-stream]
Saving to: ‘python.zip’


2025-02-11 03:21:42 (12.3 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: python/
   creating: python/final/
   creating: python/final/jsonl/
   creating: python/final/jsonl/train/
  inflating: python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_6.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_2.jsonl.gz  
  inflating: python/final/jsonl/train

In [3]:
import numpy as np
import json

def grab_raw_dataset(file="python/final/jsonl/train/python_train_0.jsonl"):
    """Load one JSON-lines shard of the training corpus into memory.

    Args:
        file: Path to a ``.jsonl`` file with one JSON object per line.
            Defaults to the first training shard extracted by the download cell.

    Returns:
        list: One parsed JSON object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding is unreliable here.
    with open(file, "r", encoding="utf-8") as f:
        # Iterate the handle lazily and skip blank lines (e.g. a trailing newline),
        # which would otherwise make json.loads raise.
        raw_dataset = [json.loads(line) for line in f if line.strip()]
    print("grabbed data from file {}".format(file))
    print("Number of raw functions: {}".format(len(raw_dataset)))
    return raw_dataset

# Load the training shard once at cell level; later cells reuse `raw_dataset`.
raw_dataset = grab_raw_dataset()

grabbed data from file python/final/jsonl/train/python_train_0.jsonl
Number of raw functions: 30000


In [4]:
import torch

from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, T5ForConditionalGeneration

def load_base_model(model_name="t5-small"):
    """Load a pretrained T5 checkpoint together with its tokenizer.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
            Defaults to ``"t5-small"``, the checkpoint this notebook fine-tunes.

    Returns:
        tuple: ``(model, tokenizer)`` — note the order: model first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return model, tokenizer

# `model` is the instance that gets fine-tuned below; `tokenizer` is shared by
# dataset preparation and evaluation.
model, tokenizer = load_base_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a dict holding 'input_ids' and 'labels'.

    Both entries are indexed along their first axis; item ``i`` is the pair
    ``(input_ids[i], labels[i])``.
    """

    def __init__(self, encodings):
        # Keep a reference to the mapping; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        # The number of samples is the size of the leading axis of 'input_ids'.
        ids_shape = self.encodings['input_ids'].shape
        return ids_shape[0]

    def __getitem__(self, i):
        # Return the model input and its target for row ``i`` as a 2-tuple.
        row_input = self.encodings['input_ids'][i]
        row_target = self.encodings['labels'][i]
        return row_input, row_target


import random

def mask_spans(tokens, mask_rate=0.15, max_masks=5, max_span_length=4, rng=None):
    """
    Replaces random spans of tokens with special tokens <extra_id_X>.
    Follows the HW1 requirements:
    - Masks random spans, not just individual tokens
    - Uses <extra_id_X> where X is the mask order
    - Ground truth contains <extra_id_X> followed by original masked span

    Args:
        tokens: Sequence of string tokens to corrupt. It is not modified.
        mask_rate: Fraction of the token count used to derive how many spans to
            mask (at least one span for non-empty input).
        max_masks: Upper bound on the number of masked spans.
        max_span_length: Each span covers between 1 and this many tokens.
        rng: Optional ``random.Random``-like object (needs ``sample`` and
            ``randint``); defaults to the global ``random`` module.

    Returns:
        tuple: ``(input_tokens, masked_tokens)``. ``input_tokens`` is the original
        sequence with every masked span collapsed into one sentinel.
        ``masked_tokens`` lists each sentinel followed by the tokens it hides and
        ends with one extra closing sentinel. Substituting the spans back into
        ``input_tokens`` reproduces ``tokens`` exactly.
    """
    rng = random if rng is None else rng
    num_tokens = len(tokens)
    if num_tokens == 0:
        # Nothing to mask: the target is just the closing sentinel.
        return [], ["<extra_id_0>"]

    num_to_mask = max(1, int(num_tokens * mask_rate))  # At least 1 mask
    # Never request more start points than there are tokens to sample from.
    num_spans = min(num_to_mask, max_masks, num_tokens)

    # Randomly select span start points (positions in the ORIGINAL sequence)
    span_starts = sorted(rng.sample(range(num_tokens), num_spans))

    input_tokens = []
    masked_tokens = []
    cursor = 0         # first original position not yet copied or masked
    sentinel_idx = 0   # counts only the spans actually emitted

    # Build the corrupted sequence left to right from the original tokens.
    # Splicing into a shrinking copy would invalidate later start positions.
    for start in span_starts:
        if start < cursor:
            # This start lies inside the previous span; skip it so no token is
            # masked twice and the sentinels stay consecutive.
            continue

        # Determine a random span length, clipped to the end of the sequence
        span_length = min(rng.randint(1, max_span_length), num_tokens - start)
        sentinel = f"<extra_id_{sentinel_idx}>"

        # Copy the untouched tokens, then replace the span with the sentinel
        input_tokens.extend(tokens[cursor:start])
        input_tokens.append(sentinel)

        # Ground truth: <extra_id_X> + original span
        masked_tokens.append(sentinel)
        masked_tokens.extend(tokens[start:start + span_length])

        cursor = start + span_length
        sentinel_idx += 1

    # Copy whatever follows the last masked span
    input_tokens.extend(tokens[cursor:])

    # Append the final <extra_id_N> (end of sequence)
    masked_tokens.append(f"<extra_id_{sentinel_idx}>")

    return input_tokens, masked_tokens


from torch.nn.utils.rnn import pad_sequence

def prepare_dataset(raw_dataset, tokenizer, max_length=512):
    """
    Prepares the dataset using Masked Span Prediction (MSP).
    Ensures masked spans follow <extra_id_X> format, and sequences are padded/truncated.

    Args:
        raw_dataset: Iterable of dicts; items without a "code" key, or whose code
            tokenizes to nothing, are skipped.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``pad_token_id``. If it has a non-None ``eos_token_id``, that id is
            appended to every input and label sequence.
        max_length: Maximum number of input ids per sample, counting the EOS id
            when one is appended (use a value of at least 2).

    Returns:
        Dataset: Wraps ``{"input_ids", "labels"}`` tensors padded to a common
        length. Inputs are padded with ``tokenizer.pad_token_id`` and labels
        with -100.

    Raises:
        ValueError: If no sample survives filtering.
    """
    # T5 sequences are expected to end with EOS. Without it in the labels the
    # model gets no training signal for where the target ends.
    eos_id = getattr(tokenizer, "eos_token_id", None)
    reserved = 0 if eos_id is None else 1
    # Leave room for EOS so the final input never exceeds max_length.
    token_budget = max(1, max_length - reserved)

    inputs, outputs = [], []

    for item in raw_dataset:
        if "code" not in item:
            continue  # Skip if "code" key is missing

        tokenized_code = tokenizer.tokenize(item["code"])

        if len(tokenized_code) == 0:
            continue  # Skip empty sequences

        # Truncate sequences longer than the budget. Masking only ever shortens
        # the input, so the bound still holds afterwards.
        tokenized_code = tokenized_code[:token_budget]

        masked_input, ground_truth = mask_spans(tokenized_code)

        input_ids = tokenizer.convert_tokens_to_ids(masked_input)
        labels = tokenizer.convert_tokens_to_ids(ground_truth)

        if len(input_ids) == 0 or len(labels) == 0:
            continue  # Avoid empty inputs

        if eos_id is not None:
            input_ids = list(input_ids) + [eos_id]
            labels = list(labels) + [eos_id]

        inputs.append(torch.tensor(input_ids, dtype=torch.long))
        outputs.append(torch.tensor(labels, dtype=torch.long))

    print(f"✅ Processed {len(inputs)} valid samples")

    if not inputs:
        # pad_sequence fails with an opaque error on an empty list; fail clearly.
        raise ValueError("prepare_dataset: no usable samples (no item had non-empty 'code')")

    # **PAD SEQUENCES TO SAME LENGTH**
    input_padded = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(outputs, batch_first=True, padding_value=-100)  # -100 is ignored in loss calculation

    encodings = {
        "input_ids": input_padded,
        "labels": labels_padded
    }

    return Dataset(encodings)

In [21]:
# obtain the training dataset
# Masking is random, so re-running this cell produces different masked spans
# unless the `random` module is seeded first.
training_dataset = prepare_dataset(raw_dataset=raw_dataset, tokenizer=tokenizer)

✅ Processed 30000 valid samples


In [22]:
# Sanity check: show the first (input_ids, labels) pair produced by the dataset.
print(training_dataset[0])

(tensor([   20,    89,  2412,   599,  9719,   834, 12594,     6, 32099,     7,
            9,   162,   834,  8292,  2423,   567,   782,     6,     3,    29,
          834, 29848,   107,  6693,     7,  2423,   567,   782,     6,     3,
          157,    29,    29,   834,   138,   839,  2423,    31,  3184,   834,
          929,    15,    31,     6,  7375,    32,     7,    15,  2423,   371,
         5405,    15,    61,    10,    96,   121,   121, 15059,     7,     3,
            9,     3,   157,    18,    29,  2741,   222, 11195,   853,  7903,
           21,   522,  5786,     5,     3,    10,  6583,    51,  2412,   834,
        12594,    10,  8174,    24,  2579,     3,     9,   769,    18, 25982,
           63,    21,   284,   801,   568,     6,    28,   165,   564,     5,
           41, 15270, 32098,    12,   217,  2412,   834, 12594,   677,  2195,
         1809,    61, 21627,    10,     3,     2,  9719,   834, 12594,  3155,
           87,     3,     2,     3,     2,  6075,   536,  3155,

In [23]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

def train(model, train_dataset, lr, batch_size, num_epochs):
    """Fine-tune ``model`` in place on ``train_dataset`` and return it in eval mode.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returns an object with ``.loss``.
        train_dataset: Map-style dataset yielding ``(input_ids, labels)`` pairs.
        lr: Learning rate for AdamW.
        batch_size: Number of samples per optimisation step.
        num_epochs: Number of full passes over ``train_dataset``.

    Returns:
        The same ``model`` object, updated and switched to eval mode.
    """
    # Pick the device once and use it for both the model and the batches, so the
    # loop also runs on CPU-only runtimes instead of crashing on `.cuda()`.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        print("cuda is available")
    model.to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are set to the deprecated class's defaults so the
    # optimisation hyperparameters are unchanged.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

    # Used to mask padded positions. Stays None when the model exposes no
    # `config.pad_token_id`, in which case no mask is passed.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for input_ids, labels in tqdm(loader, desc=f"epoch {epoch + 1}/{num_epochs}"):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Without a mask, the encoder attends to padding tokens.
            attention_mask = None
            if pad_token_id is not None:
                attention_mask = input_ids.ne(pad_token_id).long()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Report progress per epoch (guarding against an empty loader).
        mean_loss = total_loss / max(num_batches, 1)
        print(f"epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")

    model.eval()
    return model

In [None]:
# TODO: Apply hyperparameters
# NOTE: you may want to use a smaller number of epochs to reduce the time cost of training.
# NOTE: you may also want to add additional checks to observe the performance of the model.
# The validation dataset can be found at https://zenodo.org/records/7908468 and has the same
# format as the training dataset; it is already downloaded to this notebook if you have run
# the previous initialization steps.

# Samples per optimisation step.
batch_size = 8
# Step size handed to the optimizer inside `train`.
learning_rate = 3e-5
# Full passes over `training_dataset`.
num_epochs = 3

# `train` returns the same object it was given, so `trained_model` and `model` refer to
# one fine-tuned model; the untouched baseline is reloaded separately for inference.
trained_model = train(model, training_dataset, lr=learning_rate, batch_size=batch_size, num_epochs=num_epochs)



cuda is available


  0%|          | 0/3750 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


# Inference

In [None]:
!wget https://raw.githubusercontent.com/uiuc-cs598lmz-s25/hw1/main/hw_1_inference_dataset.jsonl

In [None]:
# reload the base model again
# A fresh copy is needed because `train` updates the model it is given in place.
# The tokenizer returned here is discarded; the one loaded earlier is reused.
base_model, _ = load_base_model()

In [None]:
def grab_inference_dataset(file="hw_1_inference_dataset.jsonl"):
    """Load the infilling tasks used for evaluation from a JSON-lines file.

    Args:
        file: Path to a ``.jsonl`` file with one JSON object per line.
            Defaults to the file fetched by the download cell.

    Returns:
        list: One parsed JSON object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding is unreliable here.
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        inference_dataset = [json.loads(line) for line in f if line.strip()]
    print("grabbed data from file {}".format(file))
    print("Number of tasks: {}".format(len(inference_dataset)))
    return inference_dataset

# Load the evaluation tasks once; `evaluate` reads 'prefix', 'suffix' and 'gt' from each.
inference_dataset = grab_inference_dataset()

In [None]:
!pip install Levenshtein

In [None]:
def code_infill(code_model, tokenizer, prefix, suffix, max_input_tokens=512, max_new_tokens=64):
    """Generate the code that belongs between ``prefix`` and ``suffix``.

    The prompt is ``prefix <extra_id_0> suffix``, the same sentinel format the
    training data uses. The answer is whatever the model generates after
    ``<extra_id_0>`` and before the next special token.

    Args:
        code_model: Model providing ``parameters()`` and ``generate(...)``.
        tokenizer: Tokenizer matching ``code_model``.
        prefix: Source text before the hole.
        suffix: Source text after the hole.
        max_input_tokens: Upper bound on the prompt length in tokens. When the
            context is longer, the tokens closest to the hole are kept.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The predicted infill text (possibly empty).
    """
    sentinel_id = tokenizer.convert_tokens_to_ids("<extra_id_0>")
    eos_id = tokenizer.eos_token_id

    # Tokenize both sides separately so truncation can never drop the sentinel.
    prefix_ids = list(tokenizer(prefix, add_special_tokens=False)["input_ids"])
    suffix_ids = list(tokenizer(suffix, add_special_tokens=False)["input_ids"])

    # Reserve room for the sentinel and EOS, then split the rest between the two
    # sides, giving either side the budget the other does not need.
    budget = max(max_input_tokens - 2, 0)
    keep_suffix = min(len(suffix_ids), budget // 2)
    keep_prefix = min(len(prefix_ids), budget - keep_suffix)
    keep_suffix = min(len(suffix_ids), budget - keep_prefix)
    # Keep the END of the prefix and the START of the suffix.
    prefix_ids = prefix_ids[len(prefix_ids) - keep_prefix:]
    suffix_ids = suffix_ids[:keep_suffix]

    prompt_ids = prefix_ids + [sentinel_id] + suffix_ids
    if eos_id is not None:
        prompt_ids.append(eos_id)

    # Run on whichever device the model currently lives on.
    device = next(code_model.parameters()).device
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        generated = code_model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=max_new_tokens,
        )
    output_ids = generated[0].tolist()

    if sentinel_id in output_ids:
        # Normal case: the answer follows the first sentinel.
        output_ids = output_ids[output_ids.index(sentinel_id) + 1:]
    else:
        # No sentinel was generated: drop the leading decoder-start padding and
        # treat the rest as the answer.
        while output_ids and output_ids[0] == tokenizer.pad_token_id:
            output_ids.pop(0)

    # Stop at the next special token (another sentinel, EOS, padding). The
    # unknown-token id is not a stop token: source code often contains it.
    stop_ids = set(tokenizer.all_special_ids) - {tokenizer.unk_token_id}
    span_ids = []
    for token_id in output_ids:
        if token_id in stop_ids:
            break
        span_ids.append(token_id)

    model_output = tokenizer.decode(span_ids, skip_special_tokens=True)
    return model_output


from Levenshtein import distance as levenshtein_distance


def evaluate(code_model, tokenizer, inference_dataset):
    """Score a model on the infilling tasks by mean Levenshtein distance.

    For every task the model's infill for ('prefix', 'suffix') is compared with
    the reference text stored under 'gt'.

    Returns:
        The arithmetic mean of the per-task edit distances.
    """
    from statistics import mean

    distances = []
    for task in inference_dataset:
        # Read all three fields before running the model.
        prefix, suffix, gt = task['prefix'], task['suffix'], task['gt']
        prediction = code_infill(code_model, tokenizer, prefix, suffix)
        distances.append(levenshtein_distance(prediction, gt))

    return mean(distances)

# Report the results
# Lower is better: each score is the mean edit distance between the generated
# infill and the reference text.
base_model_ed = evaluate(base_model, tokenizer, inference_dataset)
trained_model_ed = evaluate(trained_model, tokenizer, inference_dataset)
# The assignments alone display nothing, so print the scores explicitly.
print("base model mean edit distance: {:.2f}".format(base_model_ed))
print("trained model mean edit distance: {:.2f}".format(trained_model_ed))