In [1]:
# Step one directory up from wherever the notebook was started
# (presumably to the repository root, so that the `utils.*` imports resolve — TODO confirm).
# NOTE(review): a relative `cd` is not idempotent — re-running this cell moves up one more level.
%cd ..

/home/sasha/effective-inference


## Modules

In [48]:
import yaml
import os
import h5py
import numpy as np
import torch
import argparse, sys
import torch.nn as nn
import json
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from utils.prepare_dataset import load_datasets, cut_datasets
from utils.dataset_cache import cache_embeddings, cache_features_for_regression, build_dataset_from_cached, build_dict_dataset_from_cached,load_cached_dataset
from utils.config import ConfigWrapper
from utils.attentions.bert.linear import BertWrapperLin, LinearClassifierBertAttention, LinearAttention
from utils.dataset_utils import get_dict_batch, prepare_batches
from utils.train_linear_utils import train_epoch, eval_epoch, plot_history

from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import Tuple, List, Dict, Optional, Union
from numpy.random import shuffle
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, max_error, mean_absolute_percentage_error


## Config

In [58]:
# Inline YAML configuration for the whole notebook. It is parsed below into `config`
# and read by every later cell (data paths, dataset lists, model name, training options).
f = """data:
    data_path: '/home/sasha/effective-inference/data'
    
    train_datasets:
        - ['', 'imdb']

    train_datasets_fields: 
        - ['text', '']
    
    eval_datasets:
        - ['glue', 'mrpc']
        - ['glue', 'sst2']
        - ['super_glue', 'wic']

    eval_datasets_fields:
        - ['sentence1', 'sentence2']
        - ['sentence', '']
        - ['sentence1', 'sentence2']
        
    cut_size: 200 # null or uint

    cache_cut_size: 1000
    prob_of_take: 0.01
    cache_train_features: True
    train_features_prefix: 'not_split'
    cache_train_dataset: False
    norm_len_factor: 1024 # 1 vs max_len
    train_prop: 0.95
    
    model_save_pattern: '27_09'

    layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    heads: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    
model:
    model_name: 'bert-base-uncased'
    attention_aproximation: '' # one of linear, vanilla, window
    layers_num: 12 # int or arr with layers
    heads_num: 12 # int or arr with heads

general:
    device: 'cuda:0'
    out_prediction: 'data/outputs.json'
    out_metrics: 'data/metrics.txt'
    max_len: 1024
    batch_size: 1
    d_model: 768
    num_epochs: 60

attention_config: # hidden_to, hidden_from, pos_to, pos_from, 
    d_model: 768
    device: 'cuda:0'
    features:
        - 'hidden_to'
        - 'hidden_from'
        - 'pos_to'
        - 'pos_from'
        - 'relev_pos_from'
        - 'relev_pos_to'
        - 'inv_pos_from'
        - 'inv_pos_to'
        - 'inv_relev_pos_from'
        - 'inv_relev_pos_to'
        - 'seq_len'
        - 'inv_seq_len'
        - 'head_num'
        
    batch_size: 1
    train_batch_size: 128
    num_heads: 12
    split_heads: False
    split_heads_in_data: False
    model_for_each_head: False

    layers_to_train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    heads_to_train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    layers_to_change: [0]

"""
# When True, the dataset-preparation cell also calls cache_features_for_regression.
prepare_dataset_for_regression = False
# Forwarded as the `cache_embeddings=` keyword of the caching helpers in the dataset-preparation cell.
cache = None #!

# The YAML text is the literal defined above (not external input); wrap the parsed
# mapping in ConfigWrapper so later cells can use attribute access (config.data.data_path, ...).
config = ConfigWrapper(yaml.load(f, Loader=yaml.FullLoader))

## Prepare dataset

In [40]:
# Make sure the configured data directory exists before anything is cached into it.
if not os.path.exists(config.data.data_path):
    os.makedirs(config.data.data_path)
    # Fixed: the message used to format `data.data_path`, but no name `data` exists here,
    # so the cell raised NameError right after creating the directory.
    print(f'Data directory {config.data.data_path} created')
else:
    print('This directory already exists')

# Training corpora listed in the config (cut_size is forwarded to load_datasets).
train_datasets = load_datasets(config.data.train_datasets, config.data.cut_size)

# Tokenizer and base model whose embeddings are cached.
tokenizer = AutoTokenizer.from_pretrained(config.model.model_name, max_length=config.general.max_len)
initial_model = AutoModel.from_pretrained(config.model.model_name).to(config.general.device)

cache_embeddings(config, train_datasets, tokenizer, initial_model, cache_embeddings=cache)

if prepare_dataset_for_regression:
    # Use the explicit layer/head lists from the config when given; otherwise fall back to
    # "all of them" based on the model's layer/head counts. Both fallbacks are lists now
    # (the layer fallback used to be a bare range while the head fallback was a list).
    layers = config.data.layers if isinstance(config.data.layers, list) else list(range(config.model.layers_num))
    heads = config.data.heads if isinstance(config.data.heads, list) else list(range(config.model.heads_num))
    cache_features_for_regression(config, train_datasets, tokenizer, initial_model, layers=layers, heads=heads, cache_embeddings=cache)


This directory already exists


## Train Linear Layers

In [51]:
def train_linear_model(X_train, X_test, y_train, y_test, config, layer, head, save_pattern='',
                       use_plots=False, save_final_results=True,
                       verbose=False, use_pbars=False, save_model=True):
    """Train a ``LinearAttention`` model, optionally persist it, and log validation metrics.

    Args:
        X_train, y_train: training inputs/targets, forwarded to ``train_epoch``.
        X_test, y_test: validation inputs/targets, forwarded to ``eval_epoch``; ``y_test``
            is also compared against the predictions with the sklearn regression metrics.
        config: configuration object; this function reads ``attention_config``,
            ``general.device``, ``general.num_epochs``, ``data.data_path``,
            ``data.model_save_pattern`` and ``data.train_datasets``.
        layer, head: identifiers stored in the metrics row (callers pass ``head=None``
            when one model covers all heads).
        save_pattern: sub-directory of ``<data_path>/linear_models`` used for outputs.
            Final predictions are only written when it is non-empty.
        use_plots: clear the cell output each epoch and draw the loss history.
        save_final_results: write last-epoch predictions and targets to disk.
        verbose: print per-epoch losses and R2 (ignored when ``use_plots`` is set).
        use_pbars: forwarded to ``train_epoch`` / ``eval_epoch`` as ``use_pbar``.
        save_model: move the model to CPU and save its ``state_dict``.

    Returns:
        The trained model (on CPU if ``save_model`` is true).
    """
    if use_plots:
        # Imported here because the notebook's top import cell does not provide it;
        # previously use_plots=True failed with NameError unless a later cell had been run.
        from IPython.display import clear_output

    # Number of batches per epoch (ceil division) — OneCycleLR needs the total step count.
    batch_size = config.attention_config.train_batch_size
    steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size

    model = LinearAttention(config.attention_config).to(config.general.device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, steps_per_epoch=steps_per_epoch,
                                                    epochs=config.general.num_epochs)

    model_dir = f'{config.data.data_path}/linear_models/{save_pattern}'

    train_log = []
    val_log = []
    for epoch in range(config.general.num_epochs):
        if use_plots:
            clear_output()

        train_loss, _ = train_epoch(model, optimizer, criterion, X_train, y_train, config, scheduler=scheduler, use_pbar=use_pbars)
        val_loss, val_preds = eval_epoch(model, criterion, X_test, y_test, config, use_pbar=use_pbars)
        train_log.extend(train_loss)
        # x-coordinate of the validation point = number of training steps done so far.
        val_log.append((len(train_loss) * (epoch + 1), np.mean(val_loss)))

        if use_plots:
            print(f'{epoch} -- VAL R2 score:', r2_score(y_test, val_preds))
            plot_history(train_log, val_log)
        elif verbose:
            print(f'{epoch} -- Mean train loss:', np.mean(train_loss))
            print(f'{epoch} -- Mean val loss:', np.mean(val_loss))
            print(f'{epoch} -- VAL R2 score:', r2_score(y_test, val_preds))
            print()

    # Everything below uses the results of the last epoch.
    if save_final_results and save_pattern != '':
        # exist_ok avoids the check-then-create duplication (and race) of the original code.
        os.makedirs(model_dir, exist_ok=True)
        # NOTE: despite the .json suffix these files are written with np.save; read them with np.load.
        with open(f'{model_dir}/preds.json', 'wb') as f:
            np.save(f, val_preds)
        with open(f'{model_dir}/true.json', 'wb') as f:
            np.save(f, y_test)

    if save_model:
        os.makedirs(model_dir, exist_ok=True)
        model.to('cpu')
        torch.save(model.state_dict(), f'{model_dir}/model.pth')

    final_r2 = r2_score(y_test, val_preds)
    if not verbose and not use_plots:
        print('Final val loss:', np.mean(val_loss))
        print('Final val R2 score:', final_r2)

    # One metrics row per trained model, appended to a CSV shared by the whole run.
    cols = ['dataset_name', 'layer', 'head', 'model_name', 'r2', 'mse', 'explained_variance_score', 'max_error',
            'mean_absolute_percentage_error']
    data = {
        'dataset_name': str(config.data.train_datasets),
        'layer': layer,
        'head': head,
        'model_name': str(model),
        'r2': final_r2,
        'mse': mean_squared_error(y_test, val_preds),
        'explained_variance_score': explained_variance_score(y_test, val_preds),
        'max_error': max_error(y_test, val_preds),
        'mean_absolute_percentage_error': mean_absolute_percentage_error(y_test, val_preds),
    }
    metrics_path = f'{config.data.data_path}/{config.data.model_save_pattern}.csv'
    if os.path.exists(metrics_path):
        metrics = pd.read_csv(metrics_path, index_col=[0])
    else:
        # Start a fresh table instead of failing when the driver cell has not created the file.
        metrics = pd.DataFrame([], columns=cols)
    metrics.loc[len(metrics)] = data
    metrics.to_csv(metrics_path)

    return model

In [52]:

# Run-time switches for the training driver cell: per-epoch logging, progress bars,
# and saving of the final predictions and of the trained models. All are enabled.
verbose = use_pbars = save_final_results = save_final_models = True


In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.model.model_name, max_length=config.general.max_len)
initial_model = AutoModel.from_pretrained(config.model.model_name).to(config.general.device)
train_datasets = load_datasets(config.data.train_datasets, config.data.cut_size)

# (Re)create an empty metrics table for this run; train_linear_model appends one row per model.
cols = ['dataset_name', 'layer', 'head', 'model_name', 'r2', 'mse', 'explained_variance_score', 'max_error',
        'mean_absolute_percentage_error']
metrics = pd.DataFrame([], columns=cols)
# Fixed: the path used to end in `_{}.csv`. An empty `{}` in an f-string is a SyntaxError,
# and the name must match the `<data_path>/<model_save_pattern>.csv` file that
# train_linear_model reads and writes.
metrics.to_csv(f'{config.data.data_path}/{config.data.model_save_pattern}.csv')

layers_to_train = config.attention_config.layers_to_train
heads_to_train = config.attention_config.heads_to_train

if config.attention_config.split_heads or config.attention_config.model_for_each_head:
    # One model per (layer, head) pair.
    pbar = tqdm(total=len(layers_to_train) * len(heads_to_train), position=0, leave=True)
    for layer_N in layers_to_train:
        for head_N in heads_to_train:
            print(f'Training {layer_N} layer, {head_N} head')
            X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(
                config, train_datasets, layer=layer_N, heads=[head_N],
                features=config.attention_config.features,
                split_hidden=config.attention_config.split_heads_in_data)
            print('Train size:', len(X_train))
            # Debug output: shape of the 'hidden_to' feature of one training sample.
            print(X_train[10]['hidden_to'].shape)
            train_linear_model(X_train, X_test, y_train, y_test, config, layer=layer_N, head=head_N,
                               save_pattern=f'{config.data.model_save_pattern}_{layer_N}_{head_N}',
                               use_plots=False, save_final_results=save_final_results,
                               verbose=verbose, use_pbars=use_pbars, save_model=save_final_models)
            pbar.update(1)

else:
    # One model per layer, trained on all configured heads together.
    # The bar length follows the configured layers instead of a hard-coded 12.
    pbar = tqdm(total=len(layers_to_train), position=0, leave=True)
    for layer_N in layers_to_train:
        X_train, y_train, X_test, y_test = build_dict_dataset_from_cached(
            config, train_datasets, layer=layer_N,
            heads=heads_to_train,
            features=config.attention_config.features, split_hidden=False)
        print('Train size:', len(X_train))
        train_linear_model(X_train, X_test, y_train, y_test, config, layer=layer_N, head=None,
                           save_pattern=f'{config.data.model_save_pattern}_{layer_N}',
                           use_plots=False, save_final_results=save_final_results,
                           verbose=verbose, use_pbars=use_pbars, save_model=save_final_models)
        pbar.update(1)


## Evaluate Model

In [56]:
import os

import argparse
import yaml
import h5py
import numpy as np
import torch
import json
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from utils.dataset_cache import cache_embeddings, get_dataset_for_regression, build_dataset_from_cached, load_cached_dataset
from utils.dataset_cache import build_dict_dataset_from_cached
from utils.prepare_dataset import load_datasets, cut_datasets
from utils.config import ConfigWrapper
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import Tuple, List, Dict, Optional, Union
from numpy.random import shuffle
from sklearn.metrics import r2_score

from IPython.display import clear_output
from collections import defaultdict

from utils.attentions.bert.linear import BertWrapperLin, LinearClassifierBertAttention, LinearAttention
from utils.dataset_utils import get_dict_batch, prepare_batches
from utils.train_linear_utils import train_epoch, eval_epoch, plot_history

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report

def tqdm_pbar(x, y):
    """Wrap the sized iterable ``x`` in a tqdm progress bar labelled ``y``."""
    return tqdm(x, leave=True, position=0, total=len(x), desc=f'{y}')


def get_cls_embeddings_for_dataset(dataset_idx, dataset_name, dataset, config, tokenizer, model, eval_datasets,
                                   pbar_func=tqdm_pbar):
    """Collect the first-token ([CLS]) embedding of every example, grouped by split.

    Args:
        dataset_idx: index into ``config.data.eval_datasets_fields`` selecting the
            (field1, field2) text fields of this dataset; ``field2 == ''`` means single-text input.
        dataset_name: key into ``eval_datasets``.
        dataset: unused here; the examples are taken from ``eval_datasets[dataset_name]``.
        config: configuration object; ``config.general.device`` receives the encoded inputs.
        tokenizer: object with an ``encode(text[, text_pair], truncation=..., return_tensors=...)`` method.
        model: callable returning an object with a ``last_hidden_state`` tensor.
        eval_datasets: mapping ``dataset_name -> {split -> iterable of examples}``.
        pbar_func: ``f(sequence, description)`` wrapping the iteration in a progress bar,
            or ``None`` to iterate without one.

    Returns:
        ``defaultdict(list)`` mapping each split name to a list of ``last_hidden_state[:, 0, :]`` tensors.
    """
    collected_embeddings = defaultdict(list)
    # The text fields depend only on the dataset, not on the individual example.
    field1, field2 = config.data.eval_datasets_fields[dataset_idx]

    for split, data in eval_datasets[dataset_name].items():
        indexed_examples = list(enumerate(data))
        # Fixed: with pbar_func=None the loop used to iterate the raw examples, so the
        # `(ex_idx, ex)` unpacking below broke. Both paths now yield (index, example) pairs.
        if pbar_func is not None:
            examples = pbar_func(indexed_examples, f"{split} {dataset_name}")
        else:
            examples = indexed_examples

        for ex_idx, ex in examples:
            texts = (ex[field1], ex[field2]) if field2 != '' else (ex[field1],)
            encoded_inputs = tokenizer.encode(
                *texts,
                truncation=True,
                return_tensors='pt'
            ).to(config.general.device)

            with torch.no_grad():
                outputs = model(encoded_inputs)

            # Keep only the embedding of the first ([CLS]) token.
            collected_embeddings[split].append(outputs.last_hidden_state[:, 0, :])
    return collected_embeddings

def train_linear(X_train, y_train):
    """Fit and return a logistic-regression probe (lbfgs solver, up to 3000 iterations)."""
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return LogisticRegression(solver='lbfgs', max_iter=3000).fit(X_train, y_train)

def evaluate_classifier(classifier, X, y=None):
    """Return ``classifier.predict(X)``; ``y`` is accepted but not used."""
    return classifier.predict(X)

def get_metrics_report(y_true, y_pred):
    """Print accuracy and weighted F1 for the predictions and return ``(f1, accuracy)``."""
    acc_value = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    for label, value in (('Weighted F1', weighted_f1), ('Accuracy', acc_value)):
        print(label, value)
    print('-------------------------------')
    return weighted_f1, acc_value


def init_linear_modules(config, linear_model):
    """Load per-head weights into every ``linear_model*`` sub-module of the encoder layers.

    A sub-module is selected when its dotted name contains a '.' and its last component
    contains 'linear_model'. The head number is the integer after the last underscore of
    the name, and the weights come from
    ``<data_path>/linear_models/<model_save_pattern>_<layer>_<head>/model.pth``.
    """
    encoder_layers = linear_model.bert_model.encoder.layer
    for layer_num, bert_att in enumerate(encoder_layers):
        for module_name, module in bert_att.named_modules():
            leaf_name = module_name.split('.')[-1]
            if '.' not in module_name or 'linear_model' not in leaf_name:
                continue
            head_num = int(module_name.rsplit('_', 1)[-1])
            save_pattern = f"{config.data.model_save_pattern}_{layer_num}_{head_num}"
            state = torch.load(f'{config.data.data_path}/linear_models/{save_pattern}/model.pth')
            # strict=False: keys that do not match between checkpoint and module are tolerated.
            module.load_state_dict(state, strict=False)

def init_linear_modules2(config, linear_model):
    """Load per-layer weights into every ``linear_model*`` sub-module of the encoder layers.

    A sub-module is selected when its dotted name contains a '.' and its last component
    contains 'linear_model'. All selected sub-modules of layer ``i`` receive the weights from
    ``<data_path>/linear_models/<model_save_pattern>_<i>/model.pth``.
    """
    encoder_layers = linear_model.bert_model.encoder.layer
    for layer_num, bert_att in enumerate(encoder_layers):
        for module_name, module in bert_att.named_modules():
            leaf_name = module_name.split('.')[-1]
            if '.' not in module_name or 'linear_model' not in leaf_name:
                continue
            save_pattern = f"{config.data.model_save_pattern}_{layer_num}"
            state = torch.load(f'{config.data.data_path}/linear_models/{save_pattern}/model.pth')
            # strict=False: keys that do not match between checkpoint and module are tolerated.
            module.load_state_dict(state, strict=False)

def check_results(custom_model, initial_model, datasets, config, tokenizer, eval_datasets):
    """Probe ``custom_model`` on each evaluation dataset and append the scores to a CSV.

    For every dataset, [CLS] embeddings are collected with ``custom_model``, a logistic
    regression is fitted on the 'train' split, and the 'validation' split is scored. One
    row (dataset name, changed layers, accuracy, weighted F1) is appended to
    ``<data_path>/results_<model_save_pattern>.csv``.

    Args:
        custom_model: model whose embeddings are evaluated.
        initial_model: unused at the moment; kept for interface compatibility (a baseline
            evaluation with it was previously part of this function and is disabled).
        datasets: mapping ``dataset_name -> {split -> examples}``; each example needs a 'label' key.
        config: configuration object (reads ``data.data_path``, ``data.model_save_pattern``
            and ``attention_config.layers_to_change``).
        tokenizer: tokenizer forwarded to ``get_cls_embeddings_for_dataset``.
        eval_datasets: datasets forwarded to ``get_cls_embeddings_for_dataset``.
    """
    cols = ['dataset_name', 'layers', 'accuracy', 'f1']
    results_path = f'{config.data.data_path}/results_{config.data.model_save_pattern}.csv'

    for dataset_idx, (dataset_name, dataset) in enumerate(datasets.items()):
        print(f"{dataset_name}\n")

        print('\nLinear:')

        dataset_embeddings_custom = get_cls_embeddings_for_dataset(
            dataset_idx,
            dataset_name,
            dataset,
            config,
            tokenizer,
            custom_model,
            eval_datasets)

        # Only the splits that are actually used are concatenated. The 'test' embeddings
        # were concatenated before but never used, and torch.cat raised when no 'test'
        # split had been collected.
        train_dataset_embeddings = torch.cat(dataset_embeddings_custom['train'], dim=0)
        valid_dataset_embeddings = torch.cat(dataset_embeddings_custom['validation'], dim=0)

        classif = train_linear(train_dataset_embeddings.cpu(), [el['label'] for el in dataset['train']])
        valid_preds = evaluate_classifier(classif, valid_dataset_embeddings.cpu())
        print('Validation evaluation:\n')
        f1, accuracy = get_metrics_report([el['label'] for el in dataset['validation']], valid_preds)

        data = {
            'dataset_name': dataset_name,
            'layers': str(config.attention_config.layers_to_change),
            'accuracy': accuracy,
            'f1': f1,
        }
        if os.path.exists(results_path):
            results = pd.read_csv(results_path, index_col=[0])
        else:
            # Start a fresh table instead of failing with FileNotFoundError when the
            # results file has not been created beforehand.
            results = pd.DataFrame([], columns=cols)
        results.loc[len(results)] = data
        results.to_csv(results_path)
        

In [None]:
# Load the tokenizer, the base model and the evaluation datasets named in the config.
tokenizer = AutoTokenizer.from_pretrained(config.model.model_name, max_length=config.general.max_len)
initial_model = AutoModel.from_pretrained(config.model.model_name).to(config.general.device)
eval_datasets = load_datasets(config.data.eval_datasets, config.data.cut_size)

# Wrap the base model with LinearClassifierBertAttention for the layers listed in layers_to_change.
linear_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, config, layer_nums=config.attention_config.layers_to_change)

# Load the trained weights: per-head checkpoints when heads are split, per-layer checkpoints otherwise.
if config.attention_config.split_heads or config.attention_config.model_for_each_head:
    init_linear_modules(config, linear_model)
else:
    init_linear_modules2(config, linear_model)

# Move both models to the configured device (initial_model was already moved there above).
initial_model = initial_model.to(config.general.device)
linear_model = linear_model.to(config.general.device)
    
# Disabled evaluation run: it creates an empty results CSV and then calls check_results.
# cols = ['dataset_name', 'layers', 'accuracy', 'f1']
# results = pd.DataFrame([], columns = cols)
# results.to_csv(f'{config.data.data_path}/results_{config.data.model_save_pattern}.csv')
# check_results(linear_model, initial_model, eval_datasets, config, tokenizer, eval_datasets)