In [7]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, BatchEncoding, GPT2LMHeadModel, GPT2Config, GPT2ForSequenceClassification
from tqdm import tqdm
import torch as torch
#from modular_transformers.models.gpt2.configuration_gpt2 import GPT2Config
from modular_transformers.models import components
from datasets import load_dataset, load_from_disk
import os
from torch.utils.data import Dataset
import wandb
from modular_transformers.straightening.straightening_utils import compute_model_activations, compute_model_curvature

# Shared notebook state: tokenizer, data root, compute device, W&B session and RNG seeds.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
path = "/om2/user/jackking/modular_transformers/scripts/input_statistics/data"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SECURITY: never commit an API key to a notebook. wandb.login() without a key picks the
# credential up from the WANDB_API_KEY environment variable or the user's ~/.netrc.
# The key that used to be hardcoded here is exposed in the file history and should be revoked.
wandb.login()

#set seed
seed = 21
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjack-g-king[0m ([33mmodular_transformers[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jackking/.netrc


In [8]:
class LMDataset(Dataset):
    """Map-style dataset that serves model-ready dicts of tensors.

    Args:
        inputs: token-id sequences (tensor or anything ``torch.tensor`` accepts).
        attn_mask: optional attention mask aligned with ``inputs``.
        labels: optional targets, one entry per row of ``inputs``.

    Each item always contains ``'input_ids'``; ``'attention_mask'`` and ``'labels'``
    are included only when the corresponding argument was supplied.
    """

    def __init__(self, inputs, attn_mask=None, labels=None):
        #cast to tensors if not already tensors
        if not torch.is_tensor(inputs):
            inputs = torch.tensor(inputs)
        # Guard on None first: torch.tensor(None) raises, which made the documented
        # default (labels=None) unusable.
        if labels is not None and not torch.is_tensor(labels):
            labels = torch.tensor(labels)
        if attn_mask is not None and not torch.is_tensor(attn_mask):
            attn_mask = torch.tensor(attn_mask)

        self.inputs = inputs
        self.attn_mask = attn_mask
        self.labels = labels

    def __len__(self):
        """Number of examples (rows of ``inputs``)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict, omitting fields that were not provided."""
        item = {'input_ids': self.inputs[idx]}
        # Add optional fields independently so the inputs-only case no longer indexes None.
        if self.attn_mask is not None:
            item['attention_mask'] = self.attn_mask[idx]
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

def make_autoregressive_dataset(data):
    """Tokenize raw text into a padded dataset for next-token prediction.

    Args:
        data: batch of strings passed to ``tokenizer.batch_encode_plus``.

    Returns:
        ``(dataset, context_len)`` where ``dataset`` is an ``LMDataset`` holding input ids,
        attention mask and labels, and ``context_len`` is the padded sequence length.

    Side effect: sets ``pad_token`` on the module-level ``tokenizer`` to its EOS token.
    """
    tokenizer.pad_token = tokenizer.eos_token
    dataset = tokenizer.batch_encode_plus(data, add_special_tokens=True, padding='longest', return_tensors="pt")
    inputs = dataset["input_ids"]
    attn_mask = dataset["attention_mask"]
    labels = inputs.clone()
    # Padding shares its id with EOS, so mask by attention mask rather than by token id:
    # -100 is the index the Hugging Face LM loss ignores, which keeps padded positions
    # out of the loss while real EOS tokens still count.
    labels[attn_mask == 0] = -100
    context_len = inputs.size(1)
    return LMDataset(inputs, attn_mask, labels), context_len

def make_classification_dataset(data1, data2):
    """Tokenize two collections of text into one binary-classification dataset.

    Items from ``data1`` get class 0 and items from ``data2`` get class 1, in that order.

    Returns:
        ``(dataset, context_len)``: an ``LMDataset`` with input ids, attention mask and
        class labels, plus the padded sequence length.

    Side effect: sets ``pad_token`` on the module-level ``tokenizer`` to its EOS token.
    """
    tokenizer.pad_token = tokenizer.eos_token
    class_ids = [0] * len(data1) + [1] * len(data2)
    encoded = tokenizer.batch_encode_plus(
        data1 + data2,
        add_special_tokens=True,
        padding='longest',
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    classification_set = LMDataset(input_ids, encoded["attention_mask"], torch.tensor(class_ids))
    return classification_set, input_ids.size(1)

### Dataloading

In [3]:
def get_vocab_size(data):
    """Count the distinct tokens that occur anywhere in ``data``.

    Args:
        data: iterable of sequences of hashable tokens.

    Returns:
        Number of unique tokens across all sequences (0 for empty input).
    """
    distinct_tokens = {token for sentence in data for token in sentence}
    return len(distinct_tokens)

## Experiment 1

### M1

In [4]:
# Experiment 1 / M1 data: every sequence is used as its own label.
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f"{path}/{datatype}/train_data_A.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_A.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

vocab_size = get_vocab_size(train_data+valid_data)

batch_size = 128

# Context length is taken from the first example.
ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_data)
valset = LMDataset(valid_data, labels=valid_data)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

### M2

##### Bigrams

In [None]:
# Experiment 1 / M2 bigram data: every sequence is used as its own label.
batch_size = 128
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_B1.pkl", "rb") as fh:
    train_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B1.pkl", "rb") as fh:
    val_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_B2.pkl", "rb") as fh:
    train_data_B2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B2.pkl", "rb") as fh:
    val_data_B2 = pickle.load(fh)

# Only the B1 split is trained on here; B2 is loaded but left out.
train_data = train_data_B1 #+ train_data_B2
val_data = val_data_B1 #+ val_data_B2

vocab_size = get_vocab_size(train_data+val_data)
ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_data)
valset = LMDataset(val_data, labels=val_data)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M2_B1"

##### Trigrams

In [None]:
# M2 trigram data (from the "shorter_data" folder): T1 and T2 are concatenated and
# every sequence is used as its own label.
batch_size = 16
datatype = "shorter_data"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_T1.pkl", "rb") as fh:
    train_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_T1.pkl", "rb") as fh:
    val_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_T2.pkl", "rb") as fh:
    train_data_T2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_T2.pkl", "rb") as fh:
    val_data_T2 = pickle.load(fh)

train_data = train_data_T1 + train_data_T2
val_data = val_data_T1 + val_data_T2

vocab_size = get_vocab_size(train_data+val_data)
ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_data)
valset = LMDataset(val_data, labels=val_data)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M2_T"

##### Fourgrams

In [None]:
# M2 four-gram data (from the "shorter_data" folder): F1 and F2 are concatenated and
# every sequence is used as its own label.
batch_size = 16
datatype = "shorter_data"

# `with` closes each pickle file right after reading instead of leaking the handle.
# NOTE(review): the *_T1/*_T2 variable names are kept for compatibility with the original
# cell, but they hold the four-gram (F1/F2) files here and overwrite any trigram data
# previously stored under the same names.
with open(f"{path}/{datatype}/train_data_F1.pkl", "rb") as fh:
    train_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_F1.pkl", "rb") as fh:
    val_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_F2.pkl", "rb") as fh:
    train_data_T2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_F2.pkl", "rb") as fh:
    val_data_T2 = pickle.load(fh)

train_data = train_data_T1 + train_data_T2
val_data = val_data_T1 + val_data_T2

vocab_size = get_vocab_size(train_data+val_data)
ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_data)
valset = LMDataset(val_data, labels=val_data)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M2_F"

### M3

##### Bigrams

In [None]:
# Experiment 1 / M3 bigram data: binary classification, B1 -> class 0 and B2 -> class 1.
batch_size = 8
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_B1.pkl", "rb") as fh:
    train_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B1.pkl", "rb") as fh:
    val_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_B2.pkl", "rb") as fh:
    train_data_B2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B2.pkl", "rb") as fh:
    val_data_B2 = pickle.load(fh)

# Labels follow the concatenation order: all of source 1 first, then all of source 2.
len1 = len(train_data_B1)
len2 = len(train_data_B2)
train_data = train_data_B1 + train_data_B2
train_labels = [0]*len1 + [1]*len2
val_data = val_data_B1 + val_data_B2
len1 = len(val_data_B1)
len2 = len(val_data_B2)
val_labels = [0]*len1 + [1]*len2

ctx_len = len(train_data[0])

vocab_size = get_vocab_size(train_data+val_data)

trainset = LMDataset(train_data, labels=torch.tensor(train_labels))
valset = LMDataset(val_data, labels=torch.tensor(val_labels))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M3_B"

##### Trigrams

In [26]:
# Experiment 1 / M3 trigram data: binary classification, T1 -> class 0 and T2 -> class 1.
batch_size = 64
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_T1.pkl", "rb") as fh:
    train_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_T1.pkl", "rb") as fh:
    val_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_T2.pkl", "rb") as fh:
    train_data_T2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_T2.pkl", "rb") as fh:
    val_data_T2 = pickle.load(fh)

# Labels follow the concatenation order: all of source 1 first, then all of source 2.
len1 = len(train_data_T1)
len2 = len(train_data_T2)
train_data = train_data_T1 + train_data_T2
train_labels = [0]*len1 + [1]*len2
val_data = val_data_T1 + val_data_T2
len1 = len(val_data_T1)
len2 = len(val_data_T2)
val_labels = [0]*len1 + [1]*len2

ctx_len = len(train_data[0])

vocab_size = get_vocab_size(train_data+val_data)

trainset = LMDataset(train_data, labels=torch.tensor(train_labels))
valset = LMDataset(val_data, labels=torch.tensor(val_labels))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M3_T"

##### Fourgrams

In [None]:
# Experiment 1 / M3 four-gram data: binary classification, F1 -> class 0 and F2 -> class 1.
batch_size = 8
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
# NOTE(review): the *_T1/*_T2 variable names are kept for compatibility with the original
# cell, but they hold the four-gram (F1/F2) files here.
with open(f"{path}/{datatype}/train_data_F1.pkl", "rb") as fh:
    train_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_F1.pkl", "rb") as fh:
    val_data_T1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_F2.pkl", "rb") as fh:
    train_data_T2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_F2.pkl", "rb") as fh:
    val_data_T2 = pickle.load(fh)

# Labels follow the concatenation order: all of source 1 first, then all of source 2.
len1 = len(train_data_T1)
len2 = len(train_data_T2)
train_data = train_data_T1 + train_data_T2
train_labels = [0]*len1 + [1]*len2
val_data = val_data_T1 + val_data_T2
len1 = len(val_data_T1)
len2 = len(val_data_T2)
val_labels = [0]*len1 + [1]*len2

ctx_len = len(train_data[0])

vocab_size = get_vocab_size(train_data+val_data)

trainset = LMDataset(train_data, labels=torch.tensor(train_labels))
valset = LMDataset(val_data, labels=torch.tensor(val_labels))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M3_F"

## Random trials

### R1

In [None]:
# Random trial R1 data: sequences from the "random" folder, each used as its own label.
datatype = "random"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

vocab_size = get_vocab_size(train_data+valid_data)

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_data)
valset = LMDataset(valid_data, labels=valid_data)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "R1"

In [None]:
# Build a GPT-2 language-model-head network from a fresh config and train it on the
# loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals set by a data-loading cell: vocab_size, ctx_len, datatype,
# model_type, batch_size, trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this reassigns the global in place, so every re-run of the cell adds 5 again.
# The value is also not passed to GPT2Config (that kwarg is commented out below), so the
# model is built with GPT2Config's default vocab_size.
vocab_size = vocab_size + 5 #for special tokens
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len).
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 30, "lr": 0.0002, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

### R2

In [None]:
# Random trial R2 data: sequences from the "random" folder paired with random binary labels.
datatype = "random"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

# One label in {0, 1} per sequence, drawn from NumPy's global RNG.
train_labels = np.random.randint(0, 2, len(train_data))
val_labels = np.random.randint(0, 2, len(valid_data))

vocab_size = get_vocab_size(train_data+valid_data)

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_labels)
valset = LMDataset(valid_data, labels=val_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "R2"

In [None]:
# Build a GPT-2 sequence-classification network (num_labels classes) from a fresh config
# and train it on the loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals set by a data-loading cell: vocab_size, ctx_len, datatype,
# model_type, batch_size, trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.3
attn_pdrop = 0.3
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this reassigns the global in place, so every re-run of the cell adds 5 again.
# The value is also not passed to GPT2Config (that kwarg is commented out below), so the
# model is built with GPT2Config's default vocab_size.
vocab_size = vocab_size + 5 #for special tokens
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len).
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
model = GPT2ForSequenceClassification._from_config(model_config)
# model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 50, "lr": 0.000001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

### X1

#### Natural

In [7]:
# X1 (natural) data: inputs come from experiment_1's "A" split, labels from the
# "random" folder's sequences.
datatype = "random"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/experiment_1/train_data_A.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
train_data = train_data[:15000]
with open(f"{path}/experiment_1/valid_data_A.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)
valid_data = valid_data[:5000]

# NOTE(review): the labels are not truncated like the inputs above — this presumably relies
# on the random files holding exactly 15000 / 5000 sequences; confirm.
with open(f"{path}/{datatype}/train_data.pkl", 'rb') as fh:
    train_labels = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data.pkl", 'rb') as fh:
    val_labels = pickle.load(fh)

vocab_size = get_vocab_size(train_data+valid_data)

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_labels)
valset = LMDataset(valid_data, labels=val_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "X1"

  labels = torch.tensor(labels)


In [8]:
# Build a GPT-2 language-model-head network from a fresh config and train it on the
# loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals set by a data-loading cell: vocab_size, ctx_len, datatype,
# model_type, batch_size, trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this reassigns the global in place, so every re-run of the cell adds 5 again.
# The value is also not passed to GPT2Config (that kwarg is commented out below), so the
# model is built with GPT2Config's default vocab_size.
vocab_size = vocab_size + 5 #for special tokens
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len).
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 50, "lr": 0.0002, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

  0%|          | 0/50 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 118/118 [00:16<00:00,  7.15it/s]
100%|██████████| 40/40 [00:01<00:00, 23.41it/s]
100%|██████████| 118/118 [00:16<00:00,  7.36it/s]
100%|██████████| 40/40 [00:01<00:00, 23.35it/s]
100%|██████████| 118/118 [00:16<00:00,  7.35it/s]
100%|██████████| 40/40 [00:01<00:00, 23.34it/s]
100%|██████████| 118/118 [00:16<00:00,  7.34it/s]
100%|██████████| 40/40 [00:01<00:00, 23.26it/s]
100%|██████████| 118/118 [00:16<00:00,  7.34it/s]
100%|██████████| 40/40 [00:01<00:00, 23.29it/s]
100%|██████████| 118/118 [00:16<00:00,  7.33it/s]
100%|██████████| 40/40 [00:01<00:00, 23.30it/s]
100%|██████████| 118/118 [00:16<00:00,  7.32it/s]
100%|██████████| 40/40 [00:01<00:00, 23.28it/s]
100%|██████████| 118/118 [00:16<00:00,  7.33it/s]
100%|██████████| 40/40 [0

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇▇█

0,1
epoch,49.0
learning_rate,0.0002
loss,6.54649
step,5899.0
val_loss,7.62655


#### Bigram

In [50]:
# X1 (bigram) data: inputs are experiment_1's B1+B2 sequences, labels come from the
# "random" folder's sequences.
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_B1.pkl", "rb") as fh:
    train_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B1.pkl", "rb") as fh:
    val_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_B2.pkl", "rb") as fh:
    train_data_B2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B2.pkl", "rb") as fh:
    val_data_B2 = pickle.load(fh)

train_data = train_data_B1 + train_data_B2
val_data = val_data_B1 + val_data_B2

# From here on `datatype` names the folder the labels are read from (and is later used in
# the model name).
datatype = "random"

with open(f"{path}/{datatype}/train_data.pkl", 'rb') as fh:
    train_labels = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data.pkl", 'rb') as fh:
    val_labels = pickle.load(fh)

# NOTE(review): B1 comes first in the concatenation, so these slices may contain only B1
# examples if B1 alone has 15000 / 5000 or more entries — confirm that is intended.
train_data = train_data[:15000]
valid_data = val_data[:5000]

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_labels)
valset = LMDataset(valid_data, labels=val_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "X1_B"

In [51]:
# Build a GPT-2 language-model-head network from a fresh config and train it on the
# loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals set by a data-loading cell: ctx_len, datatype, model_type,
# batch_size, trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len); vocab_size is
# not passed (the kwarg is commented out), so GPT2Config's default is used.
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 40, "lr": 0.001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)



0,1
epoch,▁▂▄▅▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_loss,█▄▄▃▁▂

0,1
epoch,5.0
learning_rate,0.001
loss,7.08353
step,757.0
val_loss,7.09314


100%|██████████| 118/118 [00:15<00:00,  7.43it/s]
100%|██████████| 40/40 [00:01<00:00, 23.53it/s]
100%|██████████| 118/118 [00:15<00:00,  7.42it/s]
100%|██████████| 40/40 [00:01<00:00, 23.49it/s]
100%|██████████| 118/118 [00:15<00:00,  7.38it/s]
100%|██████████| 40/40 [00:01<00:00, 23.48it/s]
100%|██████████| 118/118 [00:16<00:00,  7.29it/s]
100%|██████████| 40/40 [00:01<00:00, 23.44it/s]
100%|██████████| 118/118 [00:15<00:00,  7.39it/s]
100%|██████████| 40/40 [00:01<00:00, 23.44it/s]
100%|██████████| 118/118 [00:16<00:00,  7.28it/s]
100%|██████████| 40/40 [00:01<00:00, 23.45it/s]
100%|██████████| 118/118 [00:15<00:00,  7.38it/s]
100%|██████████| 40/40 [00:01<00:00, 23.18it/s]
100%|██████████| 118/118 [00:16<00:00,  7.28it/s]
100%|██████████| 40/40 [00:01<00:00, 23.44it/s]
100%|██████████| 118/118 [00:16<00:00,  7.35it/s]
100%|██████████| 40/40 [00:01<00:00, 23.41it/s]
100%|██████████| 118/118 [00:16<00:00,  7.27it/s]
100%|██████████| 40/40 [00:01<00:00, 23.39it/s]
100%|██████████| 118

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,████████▇▇█▇▇▇▇▇▇▇▇▇▆▇▆▆▅▅▅▄▅▄▃▄▃▃▂▂▂▂▁▁
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▆▅▅▇▇▇█

0,1
epoch,39.0
learning_rate,0.001
loss,6.90235
step,4719.0
val_loss,7.34712


### X2

#### Natural

In [None]:
# X2 (natural) data: inputs come from experiment_1's "A" split, paired with random
# binary labels.
datatype = "random"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/experiment_1/train_data_A.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{path}/experiment_1/valid_data_A.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

# One label in {0, 1} per sequence, drawn from NumPy's global RNG.
train_labels = np.random.randint(0, 2, len(train_data))
val_labels = np.random.randint(0, 2, len(valid_data))

vocab_size = get_vocab_size(train_data+valid_data)

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_labels)
valset = LMDataset(valid_data, labels=val_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "X2"

In [None]:
# Build a GPT-2 sequence-classification network (num_labels classes) from a fresh config
# and train it on the loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals set by a data-loading cell: vocab_size, ctx_len, datatype,
# model_type, batch_size, trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.3
attn_pdrop = 0.3
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this reassigns the global in place, so every re-run of the cell adds 5 again.
# The value is also not passed to GPT2Config (that kwarg is commented out below), so the
# model is built with GPT2Config's default vocab_size.
vocab_size = vocab_size + 5 #for special tokens
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len).
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
model = GPT2ForSequenceClassification._from_config(model_config)
# model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 50, "lr": 0.000001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

#### Bigram

In [None]:
# X2 (bigram) data: inputs are experiment_1's B1+B2 sequences, paired with random
# binary labels.
datatype = "experiment_1"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{path}/{datatype}/train_data_B1.pkl", "rb") as fh:
    train_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B1.pkl", "rb") as fh:
    val_data_B1 = pickle.load(fh)
with open(f"{path}/{datatype}/train_data_B2.pkl", "rb") as fh:
    train_data_B2 = pickle.load(fh)
with open(f"{path}/{datatype}/valid_data_B2.pkl", "rb") as fh:
    val_data_B2 = pickle.load(fh)

train_data = train_data_B1 + train_data_B2
val_data = val_data_B1 + val_data_B2

# From here on `datatype` is only used to name the model / checkpoint directory.
datatype = "random"

# One label in {0, 1} per sequence, drawn from NumPy's global RNG.
train_labels = np.random.randint(0, 2, len(train_data))
val_labels = np.random.randint(0, 2, len(val_data))

# Computed here, like the other data cells do, because the following model cell reads vocab_size.
vocab_size = get_vocab_size(train_data+val_data)

batch_size = 128

ctx_len = len(train_data[0])

trainset = LMDataset(train_data, labels=train_labels)
# Fixed: the validation set was built from `valid_data`, a leftover from another cell,
# while its labels were sized for `val_data` loaded above.
valset = LMDataset(val_data, labels=val_labels)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "X2_B"

In [None]:
# Build a GPT-2 sequence-classification network (num_labels classes) from a fresh config
# and train it on the loaders currently held in `trainloader` / `valloader`.
# Reads notebook globals: vocab_size, ctx_len, datatype, model_type, batch_size,
# trainloader, valloader.
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.3
attn_pdrop = 0.3
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this reassigns the global in place, so every re-run of the cell adds 5 again.
# The value is also not passed to GPT2Config (that kwarg is commented out below), so the
# model is built with GPT2Config's default vocab_size.
vocab_size = vocab_size + 5 #for special tokens
num_labels = 2

# n_positions is the sequence length measured by the data cell (ctx_len).
model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
model = GPT2ForSequenceClassification._from_config(model_config)
# model = GPT2LMHeadModel._from_config(model_config)

# Use the EOS id as the padding id.
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# At this point lr_scheduler is the scheduler *name* (None = constant learning rate); it is
# replaced by a scheduler object below when a name is set.
lr_scheduler = None

# model_name doubles as the checkpoint sub-directory passed to train().
model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 150, "lr": 0.000001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        # T_max is num_epochs * 30 here, independent of the loader length.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        # T_max is the total number of training batches across all epochs.
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)

# NOTE(review): `train` is defined in the "training loop" section further down the notebook,
# so that cell has to be executed before this one.
train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

## Experiment 2

#### M1

In [34]:
# Category suffix selecting which experiment_2S pickle pair the following cells load
# (train_data_<datacategory>.pkl / valid_data_<datacategory>.pkl); it is also embedded in `datatype`.
datacategory = "G"

In [32]:
# Experiment 2 / M1 data: the pickles are dicts; their "inputs" entry is used both as
# model input and as label.
datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

batch_size = 128

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

vocab_size = get_vocab_size(train_data["inputs"]+valid_data["inputs"])

ctx_len = len(train_data["inputs"][0])

trainset = LMDataset(train_data["inputs"], labels=train_data["inputs"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["inputs"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

#### M2

In [35]:
# Experiment 2 / M2 data: the pickles are dicts; "inputs" feeds the model and "labels"
# supplies the targets.
datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

# `with` closes each pickle file right after reading instead of leaking the handle.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as fh:
    train_data = pickle.load(fh)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as fh:
    valid_data = pickle.load(fh)

ctx_len = len(train_data["inputs"][0])

vocab_size = get_vocab_size(train_data["inputs"] + valid_data["inputs"])

batch_size = 128

# NOTE(review): presumably the number of distinct values in the "labels" entries — confirm.
num_labels = 200

trainset = LMDataset(train_data["inputs"], labels=train_data["labels"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["labels"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M2"

### training loop

In [4]:
def evaluate(model, valloader):
    """Compute the mean per-batch loss of ``model`` over ``valloader``.

    Args:
        model: callable as ``model(input_ids, labels=..., attention_mask=...)`` returning
            an object with a ``loss`` attribute.
        valloader: iterable of batches with ``"input_ids"`` and ``"labels"`` and,
            optionally, ``"attention_mask"``.

    Returns:
        The unweighted mean of the batch losses as a Python float, or ``nan`` when the
        loader yields no batches.

    Side effect: leaves the model in eval mode.
    """
    model.eval()
    losses = []
    # Gradients are never needed during evaluation, so disable them for the whole loop.
    with torch.no_grad():
        for batch in tqdm(valloader, total=len(valloader)):
            inputs = batch["input_ids"].to(device)
            if "attention_mask" in batch:
                attention_mask = batch["attention_mask"].to(device)
            else:
                attention_mask = None
            labels = batch["labels"].to(device)
            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            losses.append(outputs.loss)
    # torch.stack raises on an empty list; report "no data" explicitly instead.
    if not losses:
        return float("nan")
    return torch.mean(torch.stack(losses)).item()

In [5]:
def train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader):
    """Train `model`, logging to Weights & Biases and saving checkpoints.

    Starts a wandb run in the "input statistics" project. Each epoch makes one
    pass over `trainloader`, evaluates on `valloader` with `evaluate`, and
    saves the model with `save_pretrained` about ten times over the run. A
    final checkpoint is written after the wandb run is finished.

    Args:
        model: called as `model(inputs, labels=..., attention_mask=...)`; the
            output must expose `.loss` and the model must provide
            `save_pretrained`.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, after the optimizer.
            Only used when `train_config["lr_scheduler"]` is not None.
        train_config: dict; "num_epochs" and "lr_scheduler" are read here and
            the whole dict is passed to wandb as the run config.
        model_name: sub-directory, under the module-level `path`, in which
            checkpoints are stored (followed by the wandb run name).
        trainloader: iterable of dict batches with "input_ids" and "labels";
            "attention_mask" is optional.
        valloader: validation batches in the same format.
    """
    wandb.init(project="input statistics", config=train_config)
    run_name = wandb.run.name

    # Checkpoint about ten times per run. Clamp to 1 so that runs with fewer
    # than 10 epochs do not hit `epoch % 0` (ZeroDivisionError).
    save_epochs = max(1, train_config["num_epochs"] // 10)
    use_scheduler = train_config["lr_scheduler"] is not None
    steps_per_epoch = len(trainloader)

    for epoch in tqdm(range(train_config["num_epochs"])):
        model.train()
        torch.cuda.empty_cache()
        for step, batch in tqdm(enumerate(trainloader), total=steps_per_epoch):
            optimizer.zero_grad()
            inputs = batch["input_ids"].to(device)
            if "attention_mask" in batch:
                attention_mask = batch["attention_mask"].to(device)
            else:
                attention_mask = None
            labels = batch["labels"].to(device)

            outputs = model(inputs, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss
            loss.backward()
            # The optimizer must step before the scheduler; the reverse order
            # skips the first value of the learning-rate schedule.
            optimizer.step()
            if use_scheduler:
                lr_scheduler.step()

            # One log call per training step keeps the three metrics in the
            # same wandb history row.
            wandb.log({
                "step": step + steps_per_epoch * epoch,
                "loss": loss.item(),
                "learning_rate": optimizer.param_groups[0]['lr'],
            })

        val_loss = evaluate(model, valloader)
        wandb.log({"epoch": epoch, "val_loss": val_loss})

        # Periodic checkpoint
        if epoch % save_epochs == 0:
            model_dir = os.path.join(path, model_name, run_name, f"epoch_{epoch}")
            os.makedirs(model_dir, exist_ok=True)
            model.save_pretrained(model_dir)

    wandb.finish()

    # Final checkpoint
    model_dir = os.path.join(path, model_name, run_name, "final_chkpoint")
    os.makedirs(model_dir, exist_ok=True)
    model.save_pretrained(model_dir)


In [6]:
# Data loaders for experiment 2S, category "C" (M1 setup: inputs double as labels)
datacategory = "C"

datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

batch_size = 128

# Context managers close the pickle files right after reading (the bare
# `pickle.load(open(...))` form leaves the handles open).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as train_file:
    train_data = pickle.load(train_file)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as valid_file:
    valid_data = pickle.load(valid_file)

vocab_size = get_vocab_size(train_data["inputs"]+valid_data["inputs"])

# Context length is read off the first training sequence
ctx_len = len(train_data["inputs"][0])

trainset = LMDataset(train_data["inputs"], labels=train_data["inputs"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["inputs"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

In [7]:
# Model hyperparameters for this run
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this mutates `vocab_size` from the data cell, so re-running this
# cell adds 5 again. The value is not passed to GPT2Config below (the
# `vocab_size` argument is commented out).
vocab_size = vocab_size + 5 #for special tokens
# NOTE(review): presumably only relevant to the commented-out
# sequence-classification model - confirm.
num_labels = 200

model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Scheduler name, or None for a constant learning rate. The same variable is
# rebound to the scheduler object below.
lr_scheduler = None

model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 150, "lr": 0.0001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
    else:
        # Fail here instead of handing the raw name to train(), which would
        # call .step() on it.
        raise ValueError(f"Unknown lr_scheduler: {train_config['lr_scheduler']!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

100%|██████████| 118/118 [00:20<00:00,  5.78it/s]
100%|██████████| 40/40 [00:01<00:00, 23.13it/s]
100%|██████████| 118/118 [00:16<00:00,  7.09it/s]
100%|██████████| 40/40 [00:01<00:00, 23.02it/s]
100%|██████████| 118/118 [00:16<00:00,  7.04it/s]
100%|██████████| 40/40 [00:01<00:00, 22.86it/s]
100%|██████████| 118/118 [00:17<00:00,  6.89it/s]
100%|██████████| 40/40 [00:01<00:00, 21.45it/s]
100%|██████████| 118/118 [00:18<00:00,  6.55it/s]
100%|██████████| 40/40 [00:01<00:00, 21.21it/s]
100%|██████████| 118/118 [00:18<00:00,  6.42it/s]
100%|██████████| 40/40 [00:01<00:00, 20.48it/s]
100%|██████████| 118/118 [00:18<00:00,  6.50it/s]
100%|██████████| 40/40 [00:01<00:00, 20.13it/s]
100%|██████████| 118/118 [00:18<00:00,  6.38it/s]
100%|██████████| 40/40 [00:01<00:00, 20.17it/s]
100%|██████████| 118/118 [00:18<00:00,  6.33it/s]
100%|██████████| 40/40 [00:01<00:00, 20.25it/s]
100%|██████████| 118/118 [00:18<00:00,  6.38it/s]
100%|██████████| 40/40 [00:01<00:00, 20.57it/s]
100%|██████████| 118

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,149.0
learning_rate,0.0001
loss,0.58061
step,17699.0
val_loss,0.61709


In [8]:
# Data loaders for experiment 2S, category "D" (M1 setup: inputs double as labels)
datacategory = "D"

datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

batch_size = 128

# Context managers close the pickle files right after reading (the bare
# `pickle.load(open(...))` form leaves the handles open).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as train_file:
    train_data = pickle.load(train_file)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as valid_file:
    valid_data = pickle.load(valid_file)

vocab_size = get_vocab_size(train_data["inputs"]+valid_data["inputs"])

# Context length is read off the first training sequence
ctx_len = len(train_data["inputs"][0])

trainset = LMDataset(train_data["inputs"], labels=train_data["inputs"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["inputs"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

In [9]:
# Model hyperparameters for this run
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this mutates `vocab_size` from the data cell, so re-running this
# cell adds 5 again. The value is not passed to GPT2Config below (the
# `vocab_size` argument is commented out).
vocab_size = vocab_size + 5 #for special tokens
# NOTE(review): presumably only relevant to the commented-out
# sequence-classification model - confirm.
num_labels = 200

model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Scheduler name, or None for a constant learning rate. The same variable is
# rebound to the scheduler object below.
lr_scheduler = None

model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 150, "lr": 0.0001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
    else:
        # Fail here instead of handing the raw name to train(), which would
        # call .step() on it.
        raise ValueError(f"Unknown lr_scheduler: {train_config['lr_scheduler']!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

100%|██████████| 125/125 [00:17<00:00,  6.97it/s]
100%|██████████| 32/32 [00:01<00:00, 22.10it/s]
100%|██████████| 125/125 [00:18<00:00,  6.63it/s]
100%|██████████| 32/32 [00:01<00:00, 20.97it/s]
100%|██████████| 125/125 [00:19<00:00,  6.47it/s]
100%|██████████| 32/32 [00:01<00:00, 20.85it/s]
100%|██████████| 125/125 [00:19<00:00,  6.43it/s]
100%|██████████| 32/32 [00:01<00:00, 20.16it/s]
100%|██████████| 125/125 [00:19<00:00,  6.51it/s]
100%|██████████| 32/32 [00:01<00:00, 20.93it/s]
100%|██████████| 125/125 [00:19<00:00,  6.48it/s]
100%|██████████| 32/32 [00:01<00:00, 20.17it/s]
100%|██████████| 125/125 [00:19<00:00,  6.34it/s]
100%|██████████| 32/32 [00:01<00:00, 20.88it/s]
100%|██████████| 125/125 [00:19<00:00,  6.29it/s]
100%|██████████| 32/32 [00:01<00:00, 20.13it/s]
100%|██████████| 125/125 [00:19<00:00,  6.32it/s]
100%|██████████| 32/32 [00:01<00:00, 19.82it/s]
100%|██████████| 125/125 [00:19<00:00,  6.34it/s]
100%|██████████| 32/32 [00:01<00:00, 20.33it/s]
100%|██████████| 125

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,149.0
learning_rate,0.0001
loss,0.55696
step,18749.0
val_loss,0.57309


In [10]:
# Data loaders for experiment 2S, category "F" (M1 setup: inputs double as labels)
datacategory = "F"

datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

batch_size = 128

# Context managers close the pickle files right after reading (the bare
# `pickle.load(open(...))` form leaves the handles open).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as train_file:
    train_data = pickle.load(train_file)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as valid_file:
    valid_data = pickle.load(valid_file)

vocab_size = get_vocab_size(train_data["inputs"]+valid_data["inputs"])

# Context length is read off the first training sequence
ctx_len = len(train_data["inputs"][0])

trainset = LMDataset(train_data["inputs"], labels=train_data["inputs"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["inputs"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

In [11]:
# Model hyperparameters for this run
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this mutates `vocab_size` from the data cell, so re-running this
# cell adds 5 again. The value is not passed to GPT2Config below (the
# `vocab_size` argument is commented out).
vocab_size = vocab_size + 5 #for special tokens
# NOTE(review): presumably only relevant to the commented-out
# sequence-classification model - confirm.
num_labels = 200

model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Scheduler name, or None for a constant learning rate. The same variable is
# rebound to the scheduler object below.
lr_scheduler = None

model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 150, "lr": 0.0001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
    else:
        # Fail here instead of handing the raw name to train(), which would
        # call .step() on it.
        raise ValueError(f"Unknown lr_scheduler: {train_config['lr_scheduler']!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

100%|██████████| 118/118 [00:16<00:00,  6.96it/s]
100%|██████████| 40/40 [00:01<00:00, 21.64it/s]
100%|██████████| 118/118 [00:17<00:00,  6.63it/s]
100%|██████████| 40/40 [00:01<00:00, 20.42it/s]
100%|██████████| 118/118 [00:18<00:00,  6.47it/s]
100%|██████████| 40/40 [00:01<00:00, 20.22it/s]
100%|██████████| 118/118 [00:18<00:00,  6.49it/s]
100%|██████████| 40/40 [00:01<00:00, 20.30it/s]
100%|██████████| 118/118 [00:18<00:00,  6.53it/s]
100%|██████████| 40/40 [00:01<00:00, 20.22it/s]
100%|██████████| 118/118 [00:18<00:00,  6.45it/s]
100%|██████████| 40/40 [00:01<00:00, 20.82it/s]
100%|██████████| 118/118 [00:18<00:00,  6.51it/s]
100%|██████████| 40/40 [00:01<00:00, 20.62it/s]
100%|██████████| 118/118 [00:18<00:00,  6.45it/s]
100%|██████████| 40/40 [00:01<00:00, 20.69it/s]
100%|██████████| 118/118 [00:18<00:00,  6.43it/s]
100%|██████████| 40/40 [00:01<00:00, 21.47it/s]
100%|██████████| 118/118 [00:18<00:00,  6.49it/s]
100%|██████████| 40/40 [00:01<00:00, 20.51it/s]
100%|██████████| 118

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,149.0
learning_rate,0.0001
loss,0.56027
step,17699.0
val_loss,0.59591


In [12]:
# Data loaders for experiment 2S, category "G" (M1 setup: inputs double as labels)
datacategory = "G"

datatype = f"experiment_2S_{datacategory}"
data_path = f"{path}/experiment_2S"

batch_size = 128

# Context managers close the pickle files right after reading (the bare
# `pickle.load(open(...))` form leaves the handles open).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f"{data_path}/train_data_{datacategory}.pkl", 'rb') as train_file:
    train_data = pickle.load(train_file)
with open(f"{data_path}/valid_data_{datacategory}.pkl", 'rb') as valid_file:
    valid_data = pickle.load(valid_file)

vocab_size = get_vocab_size(train_data["inputs"]+valid_data["inputs"])

# Context length is read off the first training sequence
ctx_len = len(train_data["inputs"][0])

trainset = LMDataset(train_data["inputs"], labels=train_data["inputs"])
valset = LMDataset(valid_data["inputs"], labels=valid_data["inputs"])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False)

model_type = "M1"

In [13]:
# Model hyperparameters for this run
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): this mutates `vocab_size` from the data cell, so re-running this
# cell adds 5 again. The value is not passed to GPT2Config below (the
# `vocab_size` argument is commented out).
vocab_size = vocab_size + 5 #for special tokens
# NOTE(review): presumably only relevant to the commented-out
# sequence-classification model - confirm.
num_labels = 200

model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
# model = GPT2ForSequenceClassification._from_config(model_config)
model = GPT2LMHeadModel._from_config(model_config)

model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Scheduler name, or None for a constant learning rate. The same variable is
# rebound to the scheduler object below.
lr_scheduler = None

model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 150, "lr": 0.0001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
    else:
        # Fail here instead of handing the raw name to train(), which would
        # call .step() on it.
        raise ValueError(f"Unknown lr_scheduler: {train_config['lr_scheduler']!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

100%|██████████| 118/118 [00:16<00:00,  7.11it/s]
100%|██████████| 40/40 [00:01<00:00, 21.58it/s]
100%|██████████| 118/118 [00:17<00:00,  6.78it/s]
100%|██████████| 40/40 [00:01<00:00, 21.07it/s]
100%|██████████| 118/118 [00:17<00:00,  6.70it/s]
100%|██████████| 40/40 [00:01<00:00, 20.10it/s]
100%|██████████| 118/118 [00:17<00:00,  6.61it/s]
100%|██████████| 40/40 [00:01<00:00, 20.74it/s]
100%|██████████| 118/118 [00:17<00:00,  6.57it/s]
100%|██████████| 40/40 [00:01<00:00, 21.97it/s]
100%|██████████| 118/118 [00:17<00:00,  6.60it/s]
100%|██████████| 40/40 [00:01<00:00, 21.06it/s]
100%|██████████| 118/118 [00:17<00:00,  6.62it/s]
100%|██████████| 40/40 [00:01<00:00, 20.90it/s]
100%|██████████| 118/118 [00:17<00:00,  6.56it/s]
100%|██████████| 40/40 [00:01<00:00, 20.60it/s]
100%|██████████| 118/118 [00:17<00:00,  6.69it/s]
100%|██████████| 40/40 [00:01<00:00, 20.22it/s]
100%|██████████| 118/118 [00:17<00:00,  6.57it/s]
100%|██████████| 40/40 [00:01<00:00, 20.24it/s]
100%|██████████| 118

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▇▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▇▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,149.0
learning_rate,0.0001
loss,0.6195
step,17699.0
val_loss,0.64355


In [18]:
# M3 setup: train a two-class sequence classifier that tells apart the two
# experiment-1 datasets of the chosen category (label 0 = set 1, label 1 = set 2).
batch_size = 128
datatype = "experiment_1"
datacategory = "EF"

# Context managers close the pickle files right after reading (the bare
# `pickle.load(open(...))` form leaves the handles open).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f"{path}/{datatype}/train_data_{datacategory}1.pkl", "rb") as train_file_1:
    train_data_1 = pickle.load(train_file_1)
with open(f"{path}/{datatype}/valid_data_{datacategory}1.pkl", "rb") as valid_file_1:
    val_data_1 = pickle.load(valid_file_1)
with open(f"{path}/{datatype}/train_data_{datacategory}2.pkl", "rb") as train_file_2:
    train_data_2 = pickle.load(train_file_2)
with open(f"{path}/{datatype}/valid_data_{datacategory}2.pkl", "rb") as valid_file_2:
    val_data_2 = pickle.load(valid_file_2)

# NOTE(review): `+` is assumed to concatenate here (i.e. the pickles hold
# Python lists) - confirm.
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2

# One label per sample, in the same order as the concatenated data
len1 = len(train_data_1)
len2 = len(train_data_2)
train_labels = [0]*len1 + [1]*len2
len1 = len(val_data_1)
len2 = len(val_data_2)
val_labels = [0]*len1 + [1]*len2

# Context length is read off the first training sequence
ctx_len = len(train_data[0])

# NOTE(review): computed but not passed to GPT2Config below (the `vocab_size`
# argument is commented out).
vocab_size = get_vocab_size(train_data+val_data)

trainset = LMDataset(train_data, labels=torch.tensor(train_labels))
valset = LMDataset(val_data, labels=torch.tensor(val_labels))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False) 

model_type = f"M3_{datacategory}"

# Model hyperparameters for this run
embedding_dim = 128
n_layer = 12
n_head = 4
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2
tokenizer.pad_token = tokenizer.eos_token
num_labels = 2

model_config = GPT2Config(n_layer = n_layer, n_head = n_head, n_embd = embedding_dim, n_positions = ctx_len, #vocab_size = vocab_size,
                          resid_pdrop=resid_pdrop, embd_pdrop=embd_pdrop, attn_pdrop=attn_pdrop, num_labels=num_labels
                          )
model = GPT2ForSequenceClassification._from_config(model_config)

model.config.pad_token_id = model.config.eos_token_id
model.to(device)

# Scheduler name, or None for a constant learning rate. The same variable is
# rebound to the scheduler object below.
lr_scheduler = None

model_name = f"{datatype}/{model_type}_{embedding_dim}_{n_layer}"
train_config = {"num_epochs": 200, "lr": 0.0000001, "lr_scheduler": lr_scheduler, "batch_size": batch_size, "resid_pdrop": resid_pdrop, "embd_pdrop": embd_pdrop, "n_head": n_head,
                "attn_pdrop": attn_pdrop, "model_name": model_name, "model_type": model_type, "embedding_dim": embedding_dim, "n_layer": n_layer, "ctx_len": ctx_len, "datatype": datatype}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if train_config["lr_scheduler"] is not None:
    if train_config["lr_scheduler"] == "cosine_annealing":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
    elif train_config["lr_scheduler"] == "cosine":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
    elif train_config["lr_scheduler"] == "step":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
    else:
        # Fail here instead of handing the raw name to train(), which would
        # call .step() on it.
        raise ValueError(f"Unknown lr_scheduler: {train_config['lr_scheduler']!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)



0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,██▆▆▆▆▅▅▅▄▃▅▃▄▃▂▂▄▃▄▂▃▂▃▃▁▃▁▂▄▂▂▂▂▃▂▂▁▂▃
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_loss,█▇▄▃▃▃▃▂▂▂▂▂▁▂▁▁▂▂▁▂▁▁▁▂▂▂▁▂▂▂▁▁▁▃▂▂▃▃▂▃

0,1
epoch,107.0
learning_rate,0.0
loss,0.29507
step,17055.0
val_loss,0.53149


100%|██████████| 157/157 [00:09<00:00, 15.96it/s]
100%|██████████| 40/40 [00:00<00:00, 55.39it/s]
100%|██████████| 157/157 [00:09<00:00, 16.08it/s]
100%|██████████| 40/40 [00:00<00:00, 55.40it/s]
100%|██████████| 157/157 [00:09<00:00, 15.92it/s]
100%|██████████| 40/40 [00:00<00:00, 55.33it/s]
100%|██████████| 157/157 [00:09<00:00, 16.05it/s]
100%|██████████| 40/40 [00:00<00:00, 55.22it/s]
100%|██████████| 157/157 [00:09<00:00, 16.06it/s]
100%|██████████| 40/40 [00:00<00:00, 55.18it/s]
100%|██████████| 157/157 [00:09<00:00, 15.93it/s]
100%|██████████| 40/40 [00:00<00:00, 51.62it/s]
100%|██████████| 157/157 [00:09<00:00, 15.91it/s]
100%|██████████| 40/40 [00:00<00:00, 54.73it/s]
100%|██████████| 157/157 [00:09<00:00, 15.76it/s]
100%|██████████| 40/40 [00:00<00:00, 52.51it/s]
100%|██████████| 157/157 [00:09<00:00, 15.89it/s]
100%|██████████| 40/40 [00:00<00:00, 52.85it/s]
100%|██████████| 157/157 [00:09<00:00, 15.87it/s]
100%|██████████| 40/40 [00:00<00:00, 54.89it/s]
100%|██████████| 157

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,▇███▇▇█▇▆▇▆▆▄▆▅▅▅▆▄▆▄▅▃▂▄▃▄▄▂▂▃▂▄▃▅▃▄▄▄▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,███▇▇▆▆▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,199.0
learning_rate,0.0
loss,0.59589
step,31399.0
val_loss,0.53425
