# Code Autocompletion with GPT-2

GPT-2 is an autoregressive model trained on a Causal Language Modeling task. This menas that the GPT-2 model was trained on a next token prediction task, such that the model, provided a sequence of $n$ tokens, had to predict the $n+1$*th* token. This is a Causal Language Modeling task since the prediction of the $n+1$*th* token can be framed as the below probabilistic task:

$$t_{n+1} = \argmax_{x} \Pr(x∣t_1,t_2,…,t_n)$$

By giving this model a sequence of code ($n$ tokens of code, to be specific), we can expect to receive what, probabilistically, the next bit of code should be (the $n+1$*th* token). Once the model predicts the $n+1$*th* token, we can use this new sequence of tokens $[t_0, ..., t_{n+1}]$ to predict the $n+2$*th* token, and this process can be repeated recursively to generate as many tokens as we would like. This is known as autoregressive generation.

In [2]:
import os
import torch
import evaluate
import regex as re
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, pipeline
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset, IterableDataset
# these are all the libraries you'd need

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))  # Should return the name of the GPU

Using device: cpu


In [90]:
import re
def clean_data(inp: str) -> str:
    """OPTIONAL: Perform data cleaning, if necessary."""
    s = re.sub(r'^#.*\n?', '', inp, flags=re.MULTILINE)
    return s

def get_data() -> Dataset:
    # https://huggingface.co/datasets/codeparrot/codeparrot-clean
    # Load the dataset
    ds = load_dataset("codeparrot/codeparrot-clean", streaming=True, trust_remote_code=True, split="train")

    # Clean the data
    ds = ds.map(lambda x: {"content": clean_data(x["content"])})

    return ds

dataset = get_data()

In [5]:
type(dataset) # This is important...

datasets.iterable_dataset.IterableDataset

In [None]:
clean out comments
Retrain tokenizer (specify vocabulary size)
tokenize data (bucket code, chunk)
perplexity (lower is better, 0-10)


In [91]:
model     = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, weight_decay= 0.001)

model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [92]:
def get_train_valid_data(dataset: Dataset) -> (Dataset, Dataset):
    """TODO: Split the dataset into training and validation sets."""
    # This is not too straightforward because the dataset is a streaming dataset
    #n = 300000
    n = 150
    split = int(n*0.75)
    dataset.shuffle()
    ds_train = dataset.take(n)
    ds_valid = ds_train.skip(split)
    ds_train = ds_train.take(split)
    return ds_train, ds_valid

train_data, valid_data = get_train_valid_data(dataset)

In [31]:
print(type(train_data))
print(type(valid_data))

<class 'datasets.iterable_dataset.IterableDataset'>
<class 'datasets.iterable_dataset.IterableDataset'>


In [93]:
class SafeIterableDataset(torch.utils.data.IterableDataset):
    """Wrapper to account for download errors so training doesn't stop due to error pulling data from HF."""
    def __init__(self, dataset):
        self.dataset = dataset

    def __iter__(self):
        iterator = iter(self.dataset)
        while True:
            try:
                item = next(iterator)
                yield item
            except StopIteration:
                break
            except Exception as e:
                print(f"Caught exception during data loading: {e}. Skipping item.")
                continue

train_data = SafeIterableDataset(train_data)
valid_data = SafeIterableDataset(valid_data)

train_loader = DataLoader(train_data,  batch_size=16)
test_loader  = DataLoader(valid_data,  batch_size=16)

In [94]:
def tokenize(inp: list[str]):
    """
    TODO: Tokenize the input.
    Consider:
    - Padding?
    - Truncation?
    - Anything else?
    """
    # truncate to first 256 tokens
    # pad to make every example the same size (ex: 256 tokens)
    inp = tokenizer(inp)['input_ids']
    results = []

    for ex in inp:
        ex.extend([0] * (max(0, 256 - len(ex))))
        ex = ex[:256]
        results.append(torch.tensor(ex))
    return torch.stack(results)


    #return(tokenizer(inp)["input_ids"])


In [87]:
x = tokenize(["import pandas as pd", "import numpy as np", "def sum(a, b):\n\treturn a + b"])
#print(x.shape)
for i in x:
    print(i.shape)
#for i in x:
    #print(i)

torch.Size([256])
torch.Size([256])
torch.Size([256])


In [95]:
def train():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.train()

    for batch in train_loader:
        #single_example_split = split_into_groups_of_256_tokens(tokenized_example)
        #for _265_token_example in example_split:
        # TODO: Implement training loop
        # Note that device that data is on should be the same as the model
        #for k in batch:
            #print(k, batch[k])
        input_ids = tokenize(batch["content"])
        labels = input_ids.clone()
        # labels are automatically shifted for next token prediction
        # assuming model is of type AutoModelForCausalLM
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        #raise NotImplementedError

train()

Token indices sequence length is longer than the specified maximum sequence length for this model (1310 > 1024). Running this sequence through the model will result in indexing errors


In [105]:
def val():
    losses=[]
    model.eval()

    with torch.no_grad():
        for batch in test_loader:
            # Implement validation loop
            # Note that device that data is on should be the same as the model
            input_ids = tokenize(batch["content"])
            labels = input_ids.clone()
            outputs = model(input_ids, labels=labels)
            losses.append(outputs.loss)
        loss = torch.mean(torch.tensor(losses))
        try:
            perplexity = torch.exp(loss)
        except OverflowError:
            perplexity = float("inf")
            raise NotImplementedError
        return loss.item(), perplexity.item()

loss, perplexity = val()
print("loss:", loss)
print("perplexity:", perplexity)

loss: 6.62711763381958
perplexity: 755.302001953125


         -920462829349169424, -8993424027276596643,  2272791862150649423,
         9129429547262664033,  7864434311480931289,  6523961272627054749,
         7445365950426380491,  5228835369578569741,  2893657285305545592,
        -6282021567314024766, -8372066044332046382, -4717631936433851971,
        -1037034474115704284]), 'line_mean': tensor([41.3719, 37.9353, 37.7336, 35.3333, 35.4429, 33.9280, 31.9006, 44.1182,
        36.1017, 45.5620, 34.0183, 35.6689, 34.2530, 23.1712, 30.2727, 34.1087],
       dtype=torch.float64), 'line_max': tensor([ 80,  79, 137,  98,  80, 109,  74, 143, 101, 107, 114, 116, 100,  70,
         97,  77]), 'alpha_frac': tensor([0.6290, 0.6451, 0.5446, 0.6272, 0.5735, 0.5296, 0.5372, 0.5984, 0.5539,
        0.6324, 0.5174, 0.5068, 0.7198, 0.4933, 0.6449, 0.6449],
       dtype=torch.float64), 'autogenerated': tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])}


RuntimeError: torch.cat(): expected a non-empty list of Tensors

In [None]:
os.environ["HF_HUB_ETAG_TIMEOUT"]     = "500"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "500"

In [None]:
# TODO: Consider setting up model checkpointing (set up a directory to save checkpoints)
...

In [None]:
# Clear residual gradients (might cause issues with taking grad. of frozen layers)
model.zero_grad(set_to_none=True)

n_epochs = ...

for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")

    # TODO: Implement training and validation
    ...
    raise NotImplementedError

print("Training complete")

Common antidotes to CUDA Out of Memory errors include:
1. Freezing layers of your model (training less parameters).
2. Using gradient checkpointing to save GPU memory.
3. Reducing the max sequence length of your data (default=1024 with GPT-2 tokenizer, which is colossal).
4. Reducing batch size (look into gradient accumulation).

And, of course:

5. Using a smaller model.

In [None]:
# TODO: Save the model
...
raise NotImplementedError