<a href="https://colab.research.google.com/github/NicoPolazzi/autocomplete/blob/feat%2Fmodel/notebooks/train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
!git clone https://github.com/NicoPolazzi/autocomplete.git
%cd autocomplete

In [2]:
#TODO: delete this code cell
!rm -r /content/autocomplete
!git clone -b feat/model --single-branch https://github.com/NicoPolazzi/autocomplete.git
%cd autocomplete

Cloning into 'autocomplete'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 148 (delta 80), reused 101 (delta 42), pack-reused 0 (from 0)[K
Receiving objects: 100% (148/148), 32.84 KiB | 525.00 KiB/s, done.
Resolving deltas: 100% (80/80), done.
/content/autocomplete


In [3]:
import sys
sys.path.append('.')

import torch
from torch.utils.data import random_split
from torch.utils.data import DataLoader

from src.dataset import CodeDataset
from src.model import CodeAutocompleteModel
from src.optimization import train_and_evaluate

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [18]:
import time
import torch.nn as nn
import torch
from torch.optim import Adam
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def train_and_evaluate(
    model,
    train_set,
    validation_set,
    epochs=2,
    lr=1e-3,
    device="cuda" if torch.cuda.is_available() else "cpu",
):
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    model.to(device)
    total_start = time.time()

    for epoch in range(epochs):
        total_loss = 0.0
        eval_loss = 0.0

        model.train()
        for batch in train_set:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            target_ids = batch["target_ids"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs.view(-1, outputs.size(-1)), target_ids.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}")

        model.eval()
        with torch.no_grad():
            for batch in validation_set:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                target_ids = batch["target_ids"].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs.view(-1, outputs.size(-1)), target_ids.view(-1))
                eval_loss += loss.item()

        print(f"Epoch {epoch+1}, Eval Loss: {eval_loss:.4f}")

    total_time = time.time() - total_start
    print(f"Total training time: {total_time:.2f} seconds")


In [20]:
dataset = CodeDataset(max_length=64, max_samples=20000)

In [13]:
batch_size = 128
epochs = 20
lr = 1e-3  # top value

generator = torch.Generator(device='cuda').manual_seed(42)

val_size = int(len(dataset) * 0.2)
train_size = len(dataset) - val_size
train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=generator
    )

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, generator= generator)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, generator= generator)

In [21]:
from src.model import CodeAutocompleteRNN

hidden_dimension = 256
embed_dimension = 128
num_layers = 2

model = CodeAutocompleteRNN(
            dataset.tokenizer.vocab_size, embed_dimension, hidden_dimension, num_layers
        )
train_and_evaluate(model, train_loader, val_loader, epochs, lr)

Epoch 1, Train Loss: 629.5298
Epoch 1, Eval Loss: 126.1314
Epoch 2, Train Loss: 484.7964
Epoch 2, Eval Loss: 121.9092
Epoch 3, Train Loss: 469.3506
Epoch 3, Eval Loss: 118.2670
Epoch 4, Train Loss: 454.2737
Epoch 4, Eval Loss: 112.7238
Epoch 5, Train Loss: 425.7894
Epoch 5, Eval Loss: 106.1157
Epoch 6, Train Loss: 409.4990
Epoch 6, Eval Loss: 104.4106
Epoch 7, Train Loss: 409.6201
Epoch 7, Eval Loss: 105.0077
Epoch 8, Train Loss: 411.8685
Epoch 8, Eval Loss: 105.9431
Epoch 9, Train Loss: 413.6548
Epoch 9, Eval Loss: 106.9124
Epoch 10, Train Loss: 417.8784
Epoch 10, Eval Loss: 106.3757
Epoch 11, Train Loss: 422.3387
Epoch 11, Eval Loss: 106.4571
Epoch 12, Train Loss: 425.5301
Epoch 12, Eval Loss: 109.2924
Epoch 13, Train Loss: 431.2689
Epoch 13, Eval Loss: 112.2062
Epoch 14, Train Loss: 437.5584
Epoch 14, Eval Loss: 111.0365
Epoch 15, Train Loss: 435.9713
Epoch 15, Eval Loss: 112.5897
Epoch 16, Train Loss: 443.7543
Epoch 16, Eval Loss: 112.0915
Epoch 17, Train Loss: 448.9489
Epoch 17, E