In [1]:
%cd ../..

/home/shapkin/effective-inference


## Import libs

In [2]:
import os

import yaml
import h5py
import numpy as np
import torch
import torch.nn as nn
import json
import seaborn as sns
import matplotlib.pyplot as plt

from utils.dataset_cache import cache_embeddings, get_dataset_for_regression, build_dataset_from_cached, load_cached_dataset
from utils.dataset_cache import build_dict_dataset_from_cached
from utils.prepare_dataset import load_datasets, cut_datasets
from utils.config import ConfigWrapper
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import Tuple, List, Dict, Optional, Union
from numpy.random import shuffle
from sklearn.metrics import r2_score

from IPython.display import clear_output

In [3]:
from utils.attentions.bert.linear import BertWrapperLin, LinearClassifierBertAttention, LinearAttention
from utils.dataset_utils import get_dict_batch, prepare_batches
from utils.train_linear_utils import train_epoch, eval_epoch, plot_history

## Project configuration

In [37]:
config_path = 'config.yaml'

with open(config_path, "r") as f:
    config = ConfigWrapper(yaml.load(f, Loader=yaml.FullLoader))

In [35]:
def train_linear_model(X_train, X_test, y_train, y_test, config, save_pattern='', 
                       use_plots=False, save_final_results=False, 
                       verbose=False, use_pbars=False, save_model=False):
    
    add_ = 0 if len(X_train) % config.attention_config.train_batch_size == 0 else 1
    total_len = (len(X_train) // config.attention_config.train_batch_size) + add_
    
    model = LinearAttention(config.attention_config).to(config.general.device)

    for param_name, param in model.named_parameters():
        print(param_name, param)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=1)
    
    # Learning rate scheduler
    scheduler = None #torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, steps_per_epoch=total_len, epochs=config.general.num_epochs)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    train_log = []
    val_log = []
    for epoch in range(config.general.num_epochs):
        if use_plots:
            clear_output()
            
        train_loss, _ = train_epoch(model, optimizer, criterion, X_train, y_train, config, scheduler=scheduler, use_pbar=use_pbars)
        val_loss, val_preds = eval_epoch(model, criterion, X_test, y_test, config, use_pbar=use_pbars)
        train_log.extend(train_loss)
        steps = len(train_loss)
        val_log.append((steps * (epoch + 1), np.mean(val_loss)))
        
        if use_plots:
            print(f'{epoch} -- VAL R2 score:', r2_score(y_test, val_preds))
            plot_history(train_log, val_log)
        elif verbose:
            print(f'{epoch} -- Mean train loss:', np.mean(train_loss))
            print(f'{epoch} -- Mean val loss:', np.mean(val_loss))
            print(f'{epoch} -- VAL R2 score:', r2_score(y_test, val_preds))
            print()

        if epoch + 1 == config.general.num_epochs and save_final_results and save_pattern != '':
            if not os.path.exists(f'{config.data.data_path}/linear_models'):
                os.makedirs(f'{config.data.data_path}/linear_models')
            if not os.path.exists(f'{config.data.data_path}/linear_models/{save_pattern}'):
                os.makedirs(f'{config.data.data_path}/linear_models/{save_pattern}')
            with open(f'{config.data.data_path}/linear_models/{save_pattern}/preds.json', 'wb') as f:
                np.save(f, val_preds) # json.dump(val_preds, f)
            with open(f'{config.data.data_path}/linear_models/{save_pattern}/true.json', 'wb') as f:
                np.save(f, y_test) # json.dump(y_test, f)

    if save_model:
        if not os.path.exists(f'{config.data.data_path}/linear_models'):
            os.makedirs(f'{config.data.data_path}/linear_models')
        if not os.path.exists(f'{config.data.data_path}/linear_models/{save_pattern}'):
            os.makedirs(f'{config.data.data_path}/linear_models/{save_pattern}')
        model.to('cpu')
        torch.save(model.state_dict(), f'{config.data.data_path}/linear_models/{save_pattern}/model.pth')

    if epoch + 1 == config.general.num_epochs and not verbose and not use_plots:
        print(f'Final val loss:', np.mean(val_loss))
        print(f'Final val R2 score:', r2_score(y_test, val_preds))
    return model
        

In [6]:
tokenizer = AutoTokenizer.from_pretrained(config.model.model_name, max_length=config.general.max_len)
initial_model = AutoModel.from_pretrained(config.model.model_name).to(config.general.device)

In [7]:
train_datasets = load_datasets(config.data.train_datasets, config.data.cut_size)
train_datasets

{'imdb': DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 25000
     })
     unsupervised: Dataset({
         features: ['text', 'label'],
         num_rows: 50000
     })
 })}

## Load data

In [38]:
if config.attention_config.split_heads or config.attention_config.model_for_each_head:
    pbar = tqdm(total=len(config.attention_config.layers_to_train) * len(config.attention_config.heads_to_train), position=0, leave=True)
    for layer_N in config.attention_config.layers_to_train:
        for head_N in config.attention_config.heads_to_train:
            print(f'Training {layer_N} layer, {head_N} head')
            X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(config, train_datasets, layer=layer_N, heads=[head_N], 
                                                                      features=config.attention_config.features, 
                                                                      split_hidden=config.attention_config.split_heads_in_data)
            print('Train size:', len(X_train))
            print(X_train[10]['hidden_to'].shape)
            #train_linear_model(X_train, X_test, y_train, y_test, config, save_pattern=f'{config.data.model_save_pattern}_{layer_N}_{head_N}', 
            #                   use_plots=False, save_final_results=True, 
            #                   verbose=True, use_pbars=False, save_model=True)
            pbar.update(1)

else:
    pbar = tqdm(total=12, position=0, leave=True)
    for layer_N in range(12):
        X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(config, train_datasets, layer=layer_N, heads=[0, 1, 2], 
                                                                      features=config.attention_config.features, 
                                                                      split_hidden=False)
        print('Train size:', len(X_train))
        train_linear_model(X_train, X_test, y_train, y_test, config, save_pattern=f'{config.data.model_save_pattern}_{layer_N}', 
                           use_plots=False, save_final_results=True, 
                           verbose=True, use_pbars=False, save_model=True)
        pbar.update(1)

  0%|          | 0/12 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [88]:
X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(config, train_datasets, layer=0, heads=[0], 
                                                                      features=config.attention_config.features, 
                                                                      split_hidden=False)

In [89]:
def to_vectors(X_train, y_train):
    dataset, target = [], []
    for i in range(len(X_train)):
        fv = []
        for k, v in X_train[i].items():
            try:
                fv += list(v)
            except:
                fv += [v]
        dataset.append(fv)
        target.append(y_train[i])
    return np.array(dataset), np.array(target)

In [90]:
X_train1, y_train1 = to_vectors(X_train, y_train)
X_test1, y_test1 = to_vectors(X_test, y_test)

In [125]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import SGDRegressor

In [150]:
m = Ridge(solver='svd')

In [151]:
m.fit(X_train1, y_train1)

In [152]:
m.coef_

array([ 2.14876008e-02,  9.58622339e-03, -4.41515484e-03, ...,
       -1.20941502e+00,  1.10310109e-01, -1.10338978e-14])

In [156]:
preds = m.predict(X_train1)
r2_score(y_train1, preds)
preds = m.predict(X_test1)
r2_score(y_test1, preds)

0.27447040713421234

In [131]:
w = np.dot(np.dot(np.linalg.inv(np.dot(X_train1.T, X_train1) + np.ones_like(np.dot(X_train1.T, X_train1)) * 0.01), X_train1.T), y_train1)

In [132]:
w

array([ 1.32631248e+06,  1.38765318e+06,  1.43136212e+06, ...,
        3.07740085e+04,  1.27252901e+02, -8.87666222e+05])

In [97]:
preds

array([-5.64006054, -6.48562395, -6.24644707, ..., -6.5668683 ,
       -6.4783497 , -6.37182276])

In [101]:
import sklearn

scaler = sklearn.preprocessing.StandardScaler()
X_train2 = scaler.fit_transform(X_train1)
X_test2 = scaler.transform(X_test1)

In [102]:
X_train2, y_train2 = torch.tensor(X_train2, dtype=torch.float32), torch.tensor(y_train1, dtype=torch.float32) 
X_test2, y_test2 = torch.tensor(X_test2, dtype=torch.float32), torch.tensor(y_test1, dtype=torch.float32) 

In [103]:
criterion(y_train2, torch.tensor(preds))

tensor(0.2152, dtype=torch.float64)

In [104]:
X_train1[0].shape

(1547,)

In [143]:
# Linear model
model = nn.Linear(1547, 1)
model.weight.data = torch.tensor([w], dtype=torch.float32)


  model.weight.data = torch.tensor([w], dtype=torch.float32)


In [144]:
model.weight.data

tensor([[ 1.3263e+06,  1.3877e+06,  1.4314e+06,  ...,  3.0774e+04,
          1.2725e+02, -8.8767e+05]])

In [145]:
model.weight.data

tensor([[ 1.3263e+06,  1.3877e+06,  1.4314e+06,  ...,  3.0774e+04,
          1.2725e+02, -8.8767e+05]])

In [146]:
outputs = model(X_test2)

In [149]:
r2_score(y_test1, outputs.detach().numpy())

-6.836387443022793e+17

In [113]:
# Linear model
model = nn.Linear(1547, 1)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.LBFGS(model.parameters(), lr=1)

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train2)
    loss = criterion(outputs, y_train2)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step(0.0001)

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

TypeError: 'float' object is not callable

In [123]:
outputs = model(X_test2)

In [124]:
r2_score(y_test1, outputs.detach().numpy())

-69.78649707264105

In [56]:
X_train1

tensor([[ 0.1196,  0.2523, -0.0407,  ...,  0.2852,  0.0034,  0.0000],
        [ 0.0077,  0.4746,  0.6170,  ...,  0.5000,  0.0020,  0.0000],
        [-0.7503,  0.0204,  0.5124,  ...,  0.2666,  0.0037,  0.0000],
        ...,
        [-0.2463,  0.9201,  0.0722,  ...,  0.3633,  0.0027,  0.0000],
        [ 0.7329,  0.7134,  0.1441,  ...,  0.3682,  0.0027,  0.0000],
        [-0.7599, -0.7485, -0.3053,  ...,  0.5000,  0.0020,  0.0000]],
       dtype=torch.float64)

In [59]:
model.weight.dtype

torch.float32

In [45]:
preds = m.predict(X_test1)

In [46]:
r2_score(y_test1, preds)

0.27447040713421134

In [None]:
class LinearAttention(nn.Module):
    def __init__(self, config):
        super(LinearAttention, self).__init__()
        self.config = config
        self.features = config['features']
        self.device = config['device']
        self.batch_size = config['batch_size']

        self.dim_size = config['d_model']
        if config.split_heads:
            self.dim_size = config['d_model'] // config['num_heads']
                        
        for k in self.features:                
            if 'hidden' in k:
                learnable_parameters = f'torch.nn.Linear(in_features={self.dim_size}, out_features=1, bias=False)'
                exec(f"self.{k} = {learnable_parameters}")
            else:
                learnable_parameters = f'nn.Parameter(torch.randn(1), requires_grad=True)'
                exec(f"self.{k} = {learnable_parameters}")
                    
            
    def forward(self, seq_len_arg=None, **kwargs):
        if seq_len_arg is not None:
            result = torch.zeros((self.batch_size, seq_len_arg, seq_len_arg), device=self.device)
            
            for arg_name, arg_value in kwargs.items():
                namespace = {'cur_result': None, 'self': self, 'arg_name': arg_name, 'arg_value': arg_value}
                if 'hidden' in arg_name:
                    exec(f"cur_result = self.{arg_name}(arg_value)", namespace)
                else:
                    exec(f"cur_result = self.{arg_name} * arg_value", namespace)
                if 'from' in arg_name:
                    result += namespace['cur_result'].T
                else:
                    result += namespace['cur_result']
        else:
            for arg_name, arg_value in kwargs.items():
                bs = len(arg_value)
                break
            result = torch.zeros((bs, 1), device=self.device, )
            
            for arg_name, arg_value in kwargs.items():
                namespace = {'cur_result': None, 'self': self, 'arg_name': arg_name, 'arg_value': arg_value}
                if 'hidden' in arg_name:
                    exec(f"cur_result = self.{arg_name}(arg_value)", namespace)
                else:
                    exec(f"cur_result = self.{arg_name} * arg_value", namespace)

                result += namespace['cur_result'].view((bs, 1))
            
        return result

In [49]:
model = train_linear_model(X_train, X_test, y_train, y_test, config, save_pattern=f'{config.data.model_save_pattern}_{0}', 
                           use_plots=False, save_final_results=True, 
                           verbose=True, use_pbars=False, save_model=True)

pos_to Parameter containing:
tensor([-0.3278], device='cuda:1', requires_grad=True)
pos_from Parameter containing:
tensor([0.1027], device='cuda:1', requires_grad=True)
relev_pos_from Parameter containing:
tensor([0.8275], device='cuda:1', requires_grad=True)
relev_pos_to Parameter containing:
tensor([-0.4740], device='cuda:1', requires_grad=True)
inv_pos_from Parameter containing:
tensor([-0.5632], device='cuda:1', requires_grad=True)
inv_pos_to Parameter containing:
tensor([0.0237], device='cuda:1', requires_grad=True)
inv_relev_pos_from Parameter containing:
tensor([-0.9353], device='cuda:1', requires_grad=True)
inv_relev_pos_to Parameter containing:
tensor([0.1786], device='cuda:1', requires_grad=True)
seq_len Parameter containing:
tensor([-0.2316], device='cuda:1', requires_grad=True)
inv_seq_len Parameter containing:
tensor([0.6134], device='cuda:1', requires_grad=True)
head_num Parameter containing:
tensor([-2.3959], device='cuda:1', requires_grad=True)
hidden_to.weight Paramete

In [19]:
model.to(config.general.device)

LinearAttention(
  (hidden_to): Linear(in_features=768, out_features=1, bias=False)
  (hidden_from): Linear(in_features=768, out_features=1, bias=False)
)

In [20]:
val_loss, val_preds = eval_epoch(model, nn.MSELoss(), X_test, y_test, config, use_pbar=True)

  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
val_preds

array([[-7.247595 ],
       [-5.6525435],
       [-5.801957 ],
       [-6.010397 ],
       [-6.3579507],
       [-6.4027143],
       [-6.201969 ],
       [-5.9606333],
       [-6.28158  ],
       [-6.4117446],
       [-6.074593 ],
       [-5.958689 ],
       [-3.7018425],
       [-6.09683  ],
       [-4.9902015],
       [-5.9078703],
       [-5.102816 ],
       [-6.173346 ],
       [-6.507085 ],
       [-6.8629436],
       [-7.1392684],
       [-6.2972198],
       [-5.7428403],
       [-6.851408 ],
       [-5.840838 ],
       [-6.2860622],
       [-6.629168 ],
       [-5.484012 ],
       [-5.9730744],
       [-5.4034266],
       [-6.5269356],
       [-6.764206 ],
       [-6.027009 ],
       [-7.2928915],
       [-4.3291903],
       [-6.3096848],
       [-6.346401 ],
       [-6.515431 ],
       [-4.9723926],
       [-5.4534526],
       [-6.667586 ],
       [-6.984903 ],
       [-5.9428144],
       [-6.340772 ],
       [-6.461209 ],
       [-5.121102 ],
       [-5.3070703],
       [-5.35

In [22]:
r2_score(y_test1, val_preds)

-0.8787578157832978

In [23]:
r2_score(y_test1, val_preds.squeeze(1))

-0.8787578157832978

In [24]:
for param_name, param in model.named_parameters():
    print(param_name, param)

pos_to Parameter containing:
tensor([-1.4713], device='cuda:1', requires_grad=True)
pos_from Parameter containing:
tensor([0.2917], device='cuda:1', requires_grad=True)
relev_pos_from Parameter containing:
tensor([-0.4342], device='cuda:1', requires_grad=True)
relev_pos_to Parameter containing:
tensor([-0.3354], device='cuda:1', requires_grad=True)
inv_pos_from Parameter containing:
tensor([-0.1581], device='cuda:1', requires_grad=True)
inv_pos_to Parameter containing:
tensor([1.2451], device='cuda:1', requires_grad=True)
inv_relev_pos_from Parameter containing:
tensor([0.4964], device='cuda:1', requires_grad=True)
inv_relev_pos_to Parameter containing:
tensor([-0.5488], device='cuda:1', requires_grad=True)
seq_len Parameter containing:
tensor([-0.6138], device='cuda:1', requires_grad=True)
inv_seq_len Parameter containing:
tensor([-0.1583], device='cuda:1', requires_grad=True)
head_num Parameter containing:
tensor([-0.2370], device='cuda:1', requires_grad=True)
hidden_to.weight Parame

In [32]:
model.seq_len

Parameter containing:
tensor([0.9457], device='cuda:1', requires_grad=True)

## Approach 2

In [70]:
layer_N, head_N = 0, 0
X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(config, train_datasets, layer=layer_N, heads=[head_N], 
                                                                  features=config.attention_config.features, 
                                                                  split_hidden=True)

In [71]:
linear_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, config, layer_nums=[6, 7, 8, 9, 10, 11])

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [73]:
for param_name, param in linear_model.named_parameters():
    if 'LinearAttention' in param_name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [94]:
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = nn.Linear(768, 2)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        return self.fc(x)

In [103]:
linear_head = SimpleNet().to(config.general.device)

dataset_names = list(train_datasets)
dataset_name = 'imdb'
dataset_idx = 0

for ex_idx, ex in tqdm(enumerate(train_datasets[dataset_name]['train'])):
    if ex_idx > config.data.cache_cut_size:
        continue
    field1, field2 = config.data.train_datasets_fields[dataset_idx]
    if field2 != '':
        encoded_inputs = tokenizer.encode(
                        ex[field1],
                        ex[field2],
                        truncation=True,
                        return_tensors='pt'
                    ).to(config.general.device)
    else:
        encoded_inputs = tokenizer.encode(
                        ex[field1],
                        truncation=True,
                        return_tensors='pt'
                    ).to(config.general.device)

    emb = linear_model(encoded_inputs).last_hidden_state[:, 0, :]
    pred_label = linear_head(emb)
    gold_label = torch.tensor([ex['label']]).to(config.general.device)

    
    print(pred_label.shape, gold_label.shape)
    loss = criterion(pred_label, gold_label)
    

0it [00:00, ?it/s]

torch.Size([1, 2]) torch.Size([1])


NameError: name 'criterion' is not defined

In [97]:
ex

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [79]:
linear_model.to(config.general.device)
linear_model(encoded_inputs).last_hidden_state[:, 0, :].shape

torch.Size([1, 768])

In [52]:
train_datasets['imdb']['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
def get_cls_embeddings_for_dataset(dataset_name, dataset, 
                                   feature_names, model, tokenizer, 
                                   pbar_func=tqdm_pbar, device=device, CUT_SIZE=CUT_SIZE):
    collected_embeddings = defaultdict(list)

    for split, data in dataset.items():
        
        pbar = pbar_func(data, f"{split} {dataset_name}") if pbar_func is not None else data
        for example in pbar:
            # Encode the input sentences
            if len(feature_names) == 2:
                encoded_inputs = tokenizer.encode(
                    example[feature_names[0]], 
                    example[feature_names[1]], 
                    truncation=True, 
                    return_tensors='pt'
                )
            else:
                encoded_inputs = tokenizer.encode(*list(map(lambda x: example[x] , feature_names)), 
                                              truncation=True, 
                                              return_tensors='pt')
            
            special_token_positions = (encoded_inputs[0] < 103).nonzero().squeeze()
            encoded_inputs = encoded_inputs.to(device)

            # print(f"Encoded inputs: {encoded_inputs}")
            # print(f"Special token positions: {special_token_positions}")
            
            # Forward pass through the model
            with torch.no_grad():
                outputs = model(encoded_inputs)
            # print(f'Outputs: {outputs}')

            # Get the embedding of the [CLS] token
            cls_embedding = outputs.last_hidden_state[:, 0, :]

            # Append the [CLS] embedding to the list
            collected_embeddings[split].append(cls_embedding)
         
    return collected_embeddings