# Simple initial notebook for training a very small model on a simple selective-copying task configuration

In [153]:
import torch
from transformers import GPT2Config, GPT2Model
from model_load import MyModel
from custom_gpt import MyGPT2Attention, SkipBlock, IdentityGPT2Block
from trainer import eval, train
from visualization import model_viz_data

In [154]:
# Select the compute device: the first CUDA GPU when available, otherwise the CPU.
# Index 0 is used (rather than 1) so this cell agrees with the device chosen in the
# configuration cell below and does not request a GPU ordinal that may not exist.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [155]:
# Training hyperparameters passed to the trainer
training_config = dict(
    batch_size=1000,
    learning_rate=3e-4,
    num_steps=10_000,
)

# Dataset settings for the selective copying task
dataset_config = dict(
    span_length=4,
    num_spans=3,
    copying_ratio=0.5,
    n_tokens=10,  # alphabet size
    lag=False,
    # Randomly distribute memorization tokens throughout the sequence
    # instead of frontloading them
    variable=True,
    variable_length=False,  # Randomize number of tokens to memorize
    one_hot=False,
    reverse=False,
    static=False,
)

# Tiny GPT-2 configuration (one layer, two heads, 16-dim embeddings).
# BOS and EOS share the id `n_tokens`, and the vocabulary has `n_tokens + 1` entries.
custom_config = GPT2Config(
    vocab_size=dataset_config["n_tokens"] + 1,
    n_embd=16,
    n_layer=1,
    n_head=2,
    bos_token_id=dataset_config["n_tokens"],
    eos_token_id=dataset_config["n_tokens"],
)

# (Re)select the device used by the cells below: first CUDA GPU if present, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [156]:
# Instantiate the base GPT-2 model on the selected device and wrap it in MyModel.
gpt_base = GPT2Model(custom_config).to(device)
my_model = MyModel(gpt_base, device, custom_config)

# Swap layers out for getting per-layer activations more easily
# This fixes bug where we cannot attach captum activation listeners to blocks directly
# NOTE: `ModuleList` is reached through the already-imported `torch` package because
# no bare `nn` name is imported in this notebook (using `nn.` raised a NameError).
my_model.base_model.h = torch.nn.ModuleList(
    [
        IdentityGPT2Block(custom_config, layer_idx=i)
        for i in range(custom_config.num_hidden_layers)
    ]
).to(device)

# Swap the operations in the attention layer to output attention layer values by head
for block in my_model.base_model.h:
    block.attn = MyGPT2Attention(my_model.base_model.config, output_per_head=True).to(device)


# Kept as the cell's final expression so the notebook displays the value train() returns.
train(my_model, dataset_config, training_config, device)

Step [1/10000], Loss: 0.0026, Accuracy: 4.08%
5.15
Step [26/10000], Loss: 0.0022, Accuracy: 12.38%
Step [51/10000], Loss: 0.0022, Accuracy: 13.07%
Step [76/10000], Loss: 0.0021, Accuracy: 12.76%
Step [101/10000], Loss: 0.0021, Accuracy: 13.91%
14.241666666666667
Step [126/10000], Loss: 0.0021, Accuracy: 18.40%
Step [151/10000], Loss: 0.0021, Accuracy: 21.89%
Step [176/10000], Loss: 0.0020, Accuracy: 24.43%
Step [201/10000], Loss: 0.0019, Accuracy: 25.76%
25.966666666666665
Step [226/10000], Loss: 0.0019, Accuracy: 27.53%
Step [251/10000], Loss: 0.0019, Accuracy: 28.33%
Step [276/10000], Loss: 0.0018, Accuracy: 29.71%
Step [301/10000], Loss: 0.0017, Accuracy: 34.23%
35.21666666666667
Step [326/10000], Loss: 0.0016, Accuracy: 37.57%
Step [351/10000], Loss: 0.0015, Accuracy: 41.08%
Step [376/10000], Loss: 0.0014, Accuracy: 43.50%
Step [401/10000], Loss: 0.0013, Accuracy: 48.45%
48.516666666666666
Step [426/10000], Loss: 0.0012, Accuracy: 54.16%
Step [451/10000], Loss: 0.0010, Accuracy: 61

[5.15,
 14.241666666666667,
 25.966666666666665,
 35.21666666666667,
 48.516666666666666,
 69.45,
 74.36666666666666,
 77.09166666666667,
 77.59166666666667,
 78.93333333333334,
 79.95833333333333,
 80.5,
 81.79166666666667,
 83.74166666666666,
 85.46666666666667,
 86.7,
 86.95,
 87.675,
 91.94166666666666,
 94.125,
 94.0,
 94.525,
 93.84166666666667,
 94.00833333333334,
 94.15833333333333,
 94.69166666666666,
 94.76666666666667,
 94.75,
 94.75,
 94.84166666666667,
 95.45,
 96.8,
 97.40833333333333,
 97.425,
 97.54166666666667,
 97.525,
 97.6,
 97.75833333333334,
 97.93333333333334,
 97.80833333333334,
 99.55,
 99.925,
 99.99166666666666,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 100.0,
 

In [157]:
# Evaluate the trained model. `eval` here is the function imported from
# `trainer`, which shadows the Python builtin of the same name.
eval(
    my_model,
    dataset_config,
    training_config,
    device,
)

100.0


100.0

In [158]:
# Both flags are forwarded unchanged to model_viz_data below.
head = split = True

# No x / y are supplied (both None); the call returns attention data along with x and y.
attention, x, y = model_viz_data(
    my_model,
    dataset_config,
    training_config,
    device,
    head=head,
    split=split,
    x=None,
    y=None,
)

tensor([[7, 8, 7, 5, 8, 2, 8, 2, 5, 5, 4, 6, 3, 3, 5, 5, 6, 2, 3, 2, 5, 2, 5, 5,
         1, 3, 5, 8, 4, 2, 2, 6, 4, 7, 5, 3]], device='cuda:0')


<IPython.core.display.Javascript object>