In [6]:
%cd ..

/home/sasha/effective-inference


## Modules

In [7]:
from typing import Tuple, List, Dict, Optional, Union
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict

from utils.prepare_dataset import load_datasets, cut_datasets
from utils.attention_patterns.bert_module_linear import LinearClassifierBertAttention, BertWrapperLin
# from utils.attention_patterns.gpt2_module_linear import 

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import torch
#from progressbar import progressbar
from tqdm.auto import tqdm
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
from transformers.models.bert.modeling_bert import BertSelfAttention, BertModel, BaseModelOutputWithPastAndCrossAttentions
from typing import Optional, Tuple, Union, List
from torch import nn
from copy import deepcopy
import joblib


WINDOW_SIZE = 10

class LinearClassifierBertAttention(BertSelfAttention):
    """
    Idea: attention weights are predicted by Linear Classifier
    """
    def set_models(self, coefs, layer):
        # print(coefs.size)
        # print(coefs[768])
        self.hidden_pred = torch.nn.Linear(
            in_features=768,
            out_features=1
        )
        self.hidden_pred.data = torch.tensor(coefs[:768]).view(1, -1).to(DEVICE)
        self.pos_i_coef = coefs[768]
        self.pos_j_coef = coefs[769]
        self.relev_pos_i_coef = coefs[770]
        self.relev_pos_j_coef = coefs[771]
        self.inv_pos_i_coef = coefs[772]
        self.inv_pos_j_coef = coefs[773]
        self.inv_relev_pos_i_coef = coefs[774]
        self.inv_relev_pos_j_coef = coefs[775]
        self.seq_len_coef = coefs[776]
        self.inv_seq_len_coef = coefs[777]
        self.layer_coef = coefs[778]
        self.layer = layer

    def set_window_size(self, window_size):
        self.WINDOW_SIZE = window_size

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
        special_tokens_idxs: Optional[List[int]] = [0]
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
            special_tokens_idxs = (encoder_hidden_states[0] < 103).nonzero().squeeze()
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
            special_tokens_idxs = (encoder_hidden_states[0] < 103).nonzero().squeeze()
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:

            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
    
        seq_len = hidden_states.shape[1]
        hidden_states_attention_features = self.hidden_pred(hidden_states)
        

        predicted_attention = hidden_states_attention_features.repeat(1, 1, seq_len) #.transpose(1, 2) # bs, seq_len, 1        
        positions = torch.arange(seq_len)

        # add i, j pos features
        positions_features_from = positions * self.pos_i_coef # seq_len
        positions_features_out = positions * self.pos_j_coef # seq_len
        predicted_attention += positions_features_from.T
        predicted_attention += positions_features_out

        # add relev i, j pos features
        relev_positions_features_from = (seq_len - positions) * self.relev_pos_i_coef # seq_len
        relev_positions_features_out = (seq_len - positions) * self.relev_pos_j_coef # seq_len
        predicted_attention += relev_positions_features_from.T
        predicted_attention += relev_positions_features_out
       

        inv_positions_features_from = (positions / seq_len) * self.inv_pos_i_coef # seq_len
        inv_positions_features_out = (positions / seq_len) * self.inv_pos_j_coef # seq_len
        predicted_attention += inv_positions_features_from.T
        predicted_attention += inv_positions_features_out
        
        
        inv_relev_positions_features_from = ((seq_len - positions) / seq_len) * self.inv_relev_pos_i_coef # seq_len
        inv_relev_positions_features_out = ((seq_len - positions) / seq_len) * self.inv_relev_pos_j_coef # seq_len
        predicted_attention += inv_relev_positions_features_from.T
        predicted_attention += inv_relev_positions_features_out

        seq_len_feature = self.seq_len_coef * seq_len # 1
        inv_seq_len_feature = self.inv_seq_len_coef * (1 / seq_len) # 1
        layer_feature = self.layer * self.layer_coef # 1
        predicted_attention += seq_len_feature
        predicted_attention += inv_seq_len_feature
        predicted_attention += layer_feature
        
        predicted_attention = torch.clamp(predicted_attention, min=-10, max=10)
        # print(predicted_attention)
        attention_probs = torch.exp(predicted_attention) #torch.nn.functional.softmax(predicted_attention, dim=-1)
        # print(attention_probs)
        # print(value_layer.squeeze(1).size())
        context_layer = torch.matmul(attention_probs, value_layer.squeeze(1))
        # print(context_layer.size())
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

    
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


class BertWrapperLin(nn.Module):
    def __init__(self, model, new_attention_class, final_model_weights_dict,  layer_nums=None, window_size=2):
        super().__init__()

        self.bert_model = deepcopy(model)
        self.layer_nums = layer_nums
        self.final_model_weights_dict = final_model_weights_dict


        # Create a list of modules to modify
        modules_to_modify = []
        for i in final_model_weights_dict.keys():
            if (layer_nums is not None and i in layer_nums) or (layer_nums is None):
                mean_attention = new_attention_class(self.bert_model.config)
                mean_attention.set_window_size(window_size)
                mean_attention.load_state_dict(self.bert_model.encoder.layer[i].attention.self.state_dict())
                # mean_attention.encoder_hidden_states = self.bert_model.encoder.layer[i].attention.self.encoder_hidden_states
                if self.final_model_weights_dict is not None: # and i in final_model_weights_dict.keys():
                    mean_attention.set_models(self.final_model_weights_dict[i], i)
                self.bert_model.encoder.layer[i].attention.self = mean_attention

    def forward(self, *args, **kwargs):
        return self.bert_model(*args, **kwargs)

In [9]:
from transformers import Conv1D
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Model
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union
from copy import deepcopy

import torch
import torch.utils.checkpoint
from torch import nn
from torch.cuda.amp import autocast
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.pytorch_utils import prune_conv1d_layer, find_pruneable_heads_and_indices

WINDOW_SIZE = 5
    
class LinearClassifierGPT2Attention(GPT2Attention):

    def set_models(self, coefs, layer):
        # print(coefs.size)
        # print(coefs[768])
        self.hidden_pred = torch.nn.Linear(
            in_features=768,
            out_features=1
        )
        self.hidden_pred.data = torch.tensor(coefs[:768]).view(1, -1).to(DEVICE)
        self.pos_i_coef = coefs[768]
        self.pos_j_coef = coefs[769]
        self.relev_pos_i_coef = coefs[770]
        self.relev_pos_j_coef = coefs[771]
        self.inv_pos_i_coef = coefs[772]
        self.inv_pos_j_coef = coefs[773]
        self.inv_relev_pos_i_coef = coefs[774]
        self.inv_relev_pos_j_coef = coefs[775]
        self.seq_len_coef = coefs[776]
        self.inv_seq_len_coef = coefs[777]
        self.layer_coef = coefs[778]
        self.layer = layer

    def set_window_size(self, window_size):
        self.WINDOW_SIZE = window_size
    def forward(
            self,
            hidden_states: Optional[Tuple[torch.FloatTensor]],
            layer_past: Optional[Tuple[torch.Tensor]] = None,
            attention_mask: Optional[torch.FloatTensor] = None, 
            head_mask: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.Tensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
            use_cache: Optional[bool] = False,
            output_attentions: Optional[bool] = False) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:

        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
                )

            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        seq_len = hidden_states.shape[1]
        hidden_states_attention_features = self.hidden_pred(hidden_states)
        
        predicted_attention = hidden_states_attention_features.repeat(1, 1, seq_len) # bs, seq_len, 1
        positions = torch.arange(seq_len)

        #special tokens
        # special_tokens_idxs = (encoder_hidden_states[0] < 103).nonzero().squeeze()
        
        # add i, j pos features
        positions_features_from = positions * self.pos_i_coef # seq_len
        positions_features_out = positions * self.pos_j_coef # seq_len
        predicted_attention += positions_features_from.T
        predicted_attention += positions_features_out
        
        # add relev i, j pos features
        relev_positions_features_from = (seq_len - positions) * self.relev_pos_i_coef # seq_len
        relev_positions_features_out = (seq_len - positions) * self.relev_pos_j_coef # seq_len
        predicted_attention += relev_positions_features_from.T
        predicted_attention += relev_positions_features_out
        
        inv_positions_features_from = (positions / seq_len) * self.inv_pos_i_coef # seq_len
        inv_positions_features_out = (positions / seq_len) * self.inv_pos_j_coef # seq_len
        predicted_attention += inv_positions_features_from.T
        predicted_attention += inv_positions_features_out
        
        inv_relev_positions_features_from = ((seq_len - positions) / seq_len) * self.inv_relev_pos_i_coef # seq_len
        inv_relev_positions_features_out = ((seq_len - positions) / seq_len) * self.inv_relev_pos_j_coef # seq_len
        predicted_attention += inv_relev_positions_features_from.T
        predicted_attention += inv_relev_positions_features_out

        
        seq_len_feature = self.seq_len_coef * seq_len # 1
        inv_seq_len_feature = self.inv_seq_len_coef * (1 / seq_len) # 1
        layer_feature = self.layer * self.layer_coef # 1
        predicted_attention += seq_len_feature
        predicted_attention += inv_seq_len_feature
        predicted_attention += layer_feature

        attention_probs = torch.exp(predicted_attention) #torch.nn.functional.softmax(predicted_attention, dim=-1)
        # print(value_layer.squeeze(1).size())
        context_layer = torch.matmul(attention_probs, value.squeeze(1))
        print(attention_probs.size())
        print(value.squeeze(1).size())
        context_layer = context_layer.permute(0, 2, 1).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)
        
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
    


class GPT2WrapperLin(nn.Module):
    def __init__(self, model, new_attention_class, final_model_weights, layer_nums=None, window_size=2):
        super().__init__()

        self.gpt2_model = deepcopy(model)
        self.layer_nums = layer_nums
        self.final_model_weights_dict = final_model_weights


        # Create a list of modules to modify
        modules_to_modify = []

        for i in layer_nums:
            if (layer_nums is not None and i in layer_nums) or (layer_nums is None):
                mean_attention = new_attention_class(self.gpt2_model.config)
                mean_attention.set_window_size(window_size)
                mean_attention.load_state_dict(self.gpt2_model.h[i].attn.state_dict())
                if self.final_model_weights_dict is not None: # and i in final_model_weights_dict.keys():
                    mean_attention.set_models(self.final_model_weights_dict[i], i)
                self.gpt2_model.h[i].attn = mean_attention
           
            

    def forward(self, *args, **kwargs):
        return self.gpt2_model(*args, **kwargs)


## Pipeline

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import classification_report
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import torch
#from progressbar import progressbar
from tqdm.auto import tqdm

from collections import defaultdict

import seaborn as sns
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt
sns.set(style='darkgrid')
import numpy as np
import json
import h5py
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

### Define Parameters

In [11]:
# Define dataset
dataset = {'sst2': ['sentence']}

# Define model (bert or gpt2)
model = 'bert' 
# model = 'gpt2' 

### Load Data

In [12]:
if model == 'bert':
    model_name = 'bert-base-uncased'
elif model == 'gpt2':
    model_name = 'gpt2'
else: model_name = 'bert-base-uncased'

DEBUG_FLAG = True
CUT_SIZE = None if not DEBUG_FLAG else 200

try:
    dataset_name = list(glue_classification.keys())[0]
    datasets = load_datasets('glue', list(dataset), CUT_SIZE)
except:
    dataset = {'mrpc': ['sentence1', 'sentence2']}
    dataset_name = 'mrpc'
    datasets = load_datasets('glue', list(dataset), CUT_SIZE)
    
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=1024)
initial_model = AutoModel.from_pretrained(model_name)
initial_model.eval()

# Define the device
device = 'cpu'#torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == 'cpu':
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
DEVICE=device
initial_model.to(device)

None

In [15]:
for ex_idx, ex in tqdm(enumerate(datasets[dataset_name]['train'])):
    
    encoded_inputs = tokenizer.encode(
                    *[ex[i] for i in list(dataset.values())[0]],
                    truncation=True,
                    return_tensors='pt'
                ).to(device)
    aa = initial_model(encoded_inputs, output_hidden_states=True, output_attentions=True)

    for layer in  range(12):
        if not os.path.exists(f'notebooks/{dataset_name}/layer_{layer}'):
            os.makedirs(f'notebooks/{dataset_name}/layer_{layer}')

        for head_num in range(12): #range(aa.attentions[0].shape[1]):
            if not os.path.exists(f'notebooks/{dataset_name}/layer_{layer}/head_{head_num}'):
                os.makedirs(f'notebooks/{dataset_name}/layer_{layer}/head_{head_num}')

            current_hidden_states = aa.hidden_states[layer][0].detach().cpu().numpy()
            next_hidden_states = aa.hidden_states[layer + 1][0].detach().cpu().numpy()
            attentions = aa.attentions[layer][0][head_num].detach().cpu().numpy() # .item()

            cur_emb = aa.hidden_states[layer][0]

            with h5py.File(f'notebooks/{dataset_name}/layer_{layer}/head_{head_num}/{ex_idx}.hdf5', 'w') as f:
                f.create_dataset("current_hidden_states", data=current_hidden_states)
                f.create_dataset("next_hidden_states", data=next_hidden_states)
                f.create_dataset("attentions", data=attentions)

46it [00:02, 16.94it/s]


KeyboardInterrupt: 

In [16]:
datasets[dataset_name]['train'][:5]

[{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  'label': 1,
  'idx': 0},
 {'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  'label': 0,
  'idx': 1},
 {'sentence1': 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
  'sentence2': "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
  'label': 1,
  'idx': 2},
 {'sentence1': 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
  'sentence2': 'Tab shares jumped 20 cent

In [18]:
def get_dataset_for_regression(prob_of_take=0.01, layer=0, head_num=0):
    X_train, y_train = [], []
    X_test, y_test = [], []
    for ex_idx, ex in tqdm(enumerate(datasets[dataset_name]['train'][:100])):
        if np.random.choice([0, 1], size=1, p=[0.8, 0.2])[0] > 0.2:
            encoded_inputs = tokenizer.encode(
                                        *[ex[i] for i in list(dataset.values())[0]],
                                        truncation=True,
                                        return_tensors='pt'
                                    )
            # special_symbols = (encoded_inputs[0] < 103).nonzero().squeeze()
            for head_num in range(12):
                with h5py.File(f'notebooks/{dataset_name}/layer_{layer}/head_{head_num}/{ex_idx}.hdf5', 'r') as f:
        
                    current_hidden_states = f['current_hidden_states'][()]
                    next_hidden_states = f['next_hidden_states'][()]
                    attentions = f['attentions'][()]
            
                    len_of_seq = attentions.shape[0]
                    for from_ in range(len_of_seq):
                        for to_ in range(len_of_seq):
                            if np.random.choice([0, 1], size=1, p=[1-prob_of_take, prob_of_take])[0] > 0.2:
                                            #feature_vector = []
                                            # (pos form, pos to, is_from_special_token, is_to_special_token,
                                            # len_of_seq, rel_position_from, rel_position_to, rev_pos_from, rev_pos_to, to_emb)
            
            
                                feature_vector = list(current_hidden_states[to_])
                                feature_vector.append(from_) # from_ pos
                                feature_vector.append(to_) # to_ pos
            
                                feature_vector.append(len_of_seq - from_) # from_ pos
                                feature_vector.append(len_of_seq - to_) # to_ pos
            
                                feature_vector.append(from_ / len_of_seq) # from_ pos
                                feature_vector.append(to_ / len_of_seq) # to_ pos
            
                                feature_vector.append((len_of_seq - from_) / len_of_seq) # from_ pos
                                feature_vector.append((len_of_seq - to_) / len_of_seq) # to_ pos
            
                                feature_vector.append(len_of_seq)
                                feature_vector.append(1/len_of_seq)
            
                                feature_vector.append(layer)
            
                                            #X_train.append(feature_vector)
                                            # y_train.append(np.log(max(attentions[from_, to_], 1e-10)))
            
                                X_train.append(feature_vector)
                                y_train.append(np.log(attentions[from_, to_]))
                        
    l = round(len(X_train)*0.95)
    X_test = X_train[l:]
    y_test = y_train[l:]
    X_train = X_train[:l]
    y_train = y_train[:l]
    
    return X_train, y_train, X_test, y_test

### Fit Linear Models

In [19]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from statistics import mean
    
def learn_linear(model, y_scaler, reverse_scaler, X_train, y_train, X_test, y_test, num_examples = 5):
    # y_scaler = StandardScaler()
    y_train_scaled = y_scaler(y_train) 
    y_test_scaled = y_scaler(y_test) 

    print('Y train mean scaled:', np.mean(y_train_scaled), ', Y test mean scaled:', np.mean(y_test_scaled))
    
    model.fit(X_train, y_train_scaled)

    return model

pipe_linear = Ridge()
id_scaler = lambda x: [el for el in x]


In [21]:
# for layer in [11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
    
#     for i in range(5):
#         if not os.path.exists(f'notebooks/data/{dataset_name}/layer_{layer}/'):
#             os.makedirs(f'notebooks/data/{dataset_name}/layer_{layer}/')
#         X_train, y_train, X_test, y_test = get_dataset_for_regression(prob_of_take=0.3, layer=layer, head_num = 0)
#         with h5py.File(f'notebooks/data/{dataset_name}/layer_{layer}/{i}.hdf5', 'w') as f:
#                     f.create_dataset("X_train", data=X_train)
#                     f.create_dataset("y_train", data=y_train)
#                     f.create_dataset("X_test", data=X_test)
#                     f.create_dataset("y_test", data=y_test)

In [26]:
from sklearn.metrics import r2_score, mean_squared_error
import psutil

final_models = {}
final_coefs= {}
for layer in tqdm([11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]):
    final_models[layer] = []
    print(f'Layer {layer}')
    for i in range(5):
        with h5py.File(f'notebooks/data/{dataset_name}/layer_{layer}/{i}.hdf5', 'r') as f:
            X_train = f['X_train'][()]
            y_train = f['y_train'][()]
            X_test = f['X_test'][()]
            y_test = f['y_test'][()]        
        print(f'Train: {len(X_train)}, Test: {len(X_test)}')
        model = learn_linear(Ridge(), id_scaler, id_scaler, X_train, y_train, X_test, y_test)
        preds_scaled = model.predict(X_test)
        preds = id_scaler(preds_scaled)
        score_test = r2_score(y_test, preds)
        print(score_test)
        final_models[layer].append(model)
    final_coefs[layer] = model.coef_

  0%|                                                                                                                                                                                          | 0/12 [00:00<?, ?it/s]

Layer 11
Train: 179237, Test: 9434
Y train mean scaled: -6.1578965 , Y test mean scaled: -6.2634125
0.3818640016782672
Train: 199243, Test: 10486
Y train mean scaled: -6.1604867 , Y test mean scaled: -5.525296
0.3621801232801043
Train: 213749, Test: 11250
Y train mean scaled: -6.0665736 , Y test mean scaled: -6.140334
0.36762481227105936
Train: 277808, Test: 14621
Y train mean scaled: -6.006435 , Y test mean scaled: -6.1426926
0.44589356614612574
Train: 214840, Test: 11307
Y train mean scaled: -6.037857 , Y test mean scaled: -6.1589155


  8%|██████████████▊                                                                                                                                                                   | 1/12 [00:18<03:28, 18.91s/it]

0.3647030610054549
Layer 10
Train: 217785, Test: 11462
Y train mean scaled: -6.4428806 , Y test mean scaled: -6.7527204
0.27877886611711955
Train: 238955, Test: 12577
Y train mean scaled: -6.5305724 , Y test mean scaled: -6.365891
0.24446020582224293
Train: 276695, Test: 14563
Y train mean scaled: -6.521085 , Y test mean scaled: -5.989593
0.2854751188756227
Train: 235964, Test: 12419
Y train mean scaled: -6.638694 , Y test mean scaled: -6.811783
0.2735633834153105
Train: 264796, Test: 13937
Y train mean scaled: -6.6633177 , Y test mean scaled: -6.2943664


 17%|█████████████████████████████▋                                                                                                                                                    | 2/12 [00:46<03:57, 23.73s/it]

0.39433824666289397
Layer 9
Train: 182245, Test: 9592
Y train mean scaled: -6.4518476 , Y test mean scaled: -6.2638755
0.3058360509547411
Train: 235026, Test: 12370
Y train mean scaled: -6.298654 , Y test mean scaled: -6.3035607
0.12158448200269101
Train: 209719, Test: 11038
Y train mean scaled: -6.2896285 , Y test mean scaled: -6.1863465
0.2818339986687004
Train: 186049, Test: 9792
Y train mean scaled: -6.1906633 , Y test mean scaled: -6.266952
0.28761278034514526
Train: 213284, Test: 11226
Y train mean scaled: -6.3013754 , Y test mean scaled: -6.191964


 25%|████████████████████████████████████████████▌                                                                                                                                     | 3/12 [01:19<04:12, 28.08s/it]

0.26160940787369746
Layer 8
Train: 221242, Test: 11644
Y train mean scaled: -6.518686 , Y test mean scaled: -6.6439466
0.12291546847265833
Train: 243522, Test: 12817
Y train mean scaled: -6.550937 , Y test mean scaled: -6.003183
0.2305637289236242
Train: 156122, Test: 8217
Y train mean scaled: -6.5783033 , Y test mean scaled: -6.400243
0.015272022114282469
Train: 196903, Test: 10363
Y train mean scaled: -6.5808277 , Y test mean scaled: -6.550075
0.10903027404833066
Train: 128886, Test: 6784
Y train mean scaled: -6.4751105 , Y test mean scaled: -5.5911617


 33%|███████████████████████████████████████████████████████████▎                                                                                                                      | 4/12 [01:48<03:47, 28.43s/it]

-0.34544539671177854
Layer 7
Train: 260326, Test: 13701
Y train mean scaled: -7.070214 , Y test mean scaled: -8.169417
0.12168676101072318
Train: 220788, Test: 11620
Y train mean scaled: -7.146482 , Y test mean scaled: -7.4065895
0.036475558174698164
Train: 257095, Test: 13531
Y train mean scaled: -6.9103785 , Y test mean scaled: -7.7638535
0.11054247430121167
Train: 216050, Test: 11371
Y train mean scaled: -6.998885 , Y test mean scaled: -7.6898108
0.175963553918285
Train: 213523, Test: 11238
Y train mean scaled: -6.919623 , Y test mean scaled: -7.385061


 42%|██████████████████████████████████████████████████████████████████████████▏                                                                                                       | 5/12 [02:26<03:44, 32.11s/it]

0.08088130161318574
Layer 6
Train: 246583, Test: 12978
Y train mean scaled: -6.9517536 , Y test mean scaled: -7.0349607
0.18905495066130817
Train: 285703, Test: 15037
Y train mean scaled: -7.039995 , Y test mean scaled: -7.6358056
0.17884177108415178
Train: 165715, Test: 8722
Y train mean scaled: -6.9370084 , Y test mean scaled: -7.0364294
0.1558937665584852
Train: 172420, Test: 9075
Y train mean scaled: -7.1216164 , Y test mean scaled: -7.060547
0.16251817007630842
Train: 149901, Test: 7890
Y train mean scaled: -6.9669046 , Y test mean scaled: -7.2712173


 50%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 6/12 [03:01<03:17, 32.90s/it]

0.16214041717993954
Layer 5
Train: 225025, Test: 11843
Y train mean scaled: -7.009394 , Y test mean scaled: -7.025103
0.15100785509410541
Train: 180737, Test: 9512
Y train mean scaled: -6.931515 , Y test mean scaled: -7.54635
0.16002709675090032
Train: 190442, Test: 10023
Y train mean scaled: -7.0926166 , Y test mean scaled: -6.3380527
0.13178930730268512
Train: 166911, Test: 8785
Y train mean scaled: -6.9531384 , Y test mean scaled: -6.607244
0.03267372427669313
Train: 189150, Test: 9955
Y train mean scaled: -7.0320315 , Y test mean scaled: -6.6471334


 58%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 7/12 [03:33<02:42, 32.60s/it]

0.10478077484065518
Layer 4
Train: 275109, Test: 14479
Y train mean scaled: -6.4178677 , Y test mean scaled: -6.757731
0.12918064903661786
Train: 171354, Test: 9019
Y train mean scaled: -6.480999 , Y test mean scaled: -6.003023
-0.9405488933272337
Train: 304090, Test: 16005
Y train mean scaled: -6.4050846 , Y test mean scaled: -6.6637254
0.17148439062649
Train: 219412, Test: 11548
Y train mean scaled: -6.265139 , Y test mean scaled: -7.0308867
-0.021037647405642845
Train: 179647, Test: 9455
Y train mean scaled: -6.388902 , Y test mean scaled: -6.5418086


 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 8/12 [04:12<02:18, 34.67s/it]

0.19134493303768074
Layer 3
Train: 137931, Test: 7260
Y train mean scaled: -6.2949314 , Y test mean scaled: -6.54586
0.10879854658656396
Train: 174905, Test: 9206
Y train mean scaled: -6.2840333 , Y test mean scaled: -6.9218183
0.10782558253749819
Train: 194816, Test: 10253
Y train mean scaled: -6.3242054 , Y test mean scaled: -6.657123
0.07741590339431381
Train: 252977, Test: 13315
Y train mean scaled: -6.429841 , Y test mean scaled: -6.4614825
-0.3658654705553581
Train: 140021, Test: 7370
Y train mean scaled: -6.25741 , Y test mean scaled: -6.5367856


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 9/12 [04:42<01:39, 33.31s/it]

0.10638087750421077
Layer 2
Train: 276925, Test: 14575
Y train mean scaled: -8.313403 , Y test mean scaled: -8.315569
-0.08588414233241459
Train: 213316, Test: 11227
Y train mean scaled: -8.277937 , Y test mean scaled: -7.9477468
-0.07232553310463574
Train: 203594, Test: 10715
Y train mean scaled: -8.23842 , Y test mean scaled: -7.347701
-0.7490255213512425
Train: 241582, Test: 12715
Y train mean scaled: -8.325938 , Y test mean scaled: -7.2572145
-5.1970314469787455
Train: 153595, Test: 8084
Y train mean scaled: -8.193428 , Y test mean scaled: -7.8104696


 83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                             | 10/12 [05:18<01:08, 34.19s/it]

-0.7958219207786603
Layer 1
Train: 186343, Test: 9808
Y train mean scaled: -6.1787515 , Y test mean scaled: -6.319547
0.022353193450617814
Train: 271740, Test: 14302
Y train mean scaled: -6.2045674 , Y test mean scaled: -6.298799
0.02324683737443911
Train: 170729, Test: 8986
Y train mean scaled: -6.290158 , Y test mean scaled: -6.250949
0.03437165339770565
Train: 131560, Test: 6924
Y train mean scaled: -6.074579 , Y test mean scaled: -6.16599
0.028032741017755547
Train: 190930, Test: 10049
Y train mean scaled: -6.1945896 , Y test mean scaled: -6.219161


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 11/12 [05:50<00:33, 33.37s/it]

-0.02625688930120962
Layer 0
Train: 199391, Test: 10494
Y train mean scaled: -5.14934 , Y test mean scaled: -5.295481
-0.0024836534905852137
Train: 196014, Test: 10317
Y train mean scaled: -5.19817 , Y test mean scaled: -4.7565427
0.052327006664451114
Train: 248230, Test: 13065
Y train mean scaled: -5.181726 , Y test mean scaled: -4.8724885
0.032696979785270974
Train: 224251, Test: 11803
Y train mean scaled: -5.2411304 , Y test mean scaled: -5.2886014
-0.015654715548899745
Train: 204465, Test: 10761
Y train mean scaled: -5.1642017 , Y test mean scaled: -5.385914


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [06:25<00:00, 32.14s/it]

0.004310280937733224





In [28]:
import pandas as pd
cols = ['dataset_name', 'dataset_id','layer', 'model_name', 'r2', 'mse', 'explained_variance_score', 'max_error', 
                                     'mean_absolute_percentage_error', 'model']
metrics = pd.DataFrame([], columns = cols)
metrics

Unnamed: 0,dataset_name,dataset_id,layer,model_name,r2,mse,explained_variance_score,max_error,mean_absolute_percentage_error,model


In [34]:
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_percentage_error
def check_metrics(model_list):   
    data = {i: '' for i in cols}
    data['dataset_name'] = list(dataset.keys())[0]
    for layer in tqdm(range(11)):
        data['layer'] = layer
        for i, model in enumerate(final_models[layer]):
            data['dataset_id'] = i
            data['model_name'] = str(model)
            with h5py.File(f'notebooks/data/{dataset_name}/layer_{layer}/{i}.hdf5', 'r') as f:
                X_train = f['X_train'][()]
                y_train = f['y_train'][()]
                X_test = f['X_test'][()]
                y_test = f['y_test'][()]        
            preds_scaled = model.predict(X_test)
            preds = id_scaler(preds_scaled)
            data['r2'] = r2_score(y_test, preds)
            data['mse'] = mean_squared_error(y_test, preds)
            data['explained_variance_score'] = explained_variance_score(y_test, preds)
            data['max_error'] = max_error(y_test, preds)
            data['mean_absolute_percentage_error'] = mean_absolute_percentage_error(y_test, preds)
            data['model'] = model
            metrics.loc[len(metrics)] = data

In [35]:
check_metrics(final_models)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [04:08<00:00, 22.55s/it]


In [39]:
metrics[metrics['layer']==9]

Unnamed: 0,dataset_name,dataset_id,layer,model_name,r2,mse,explained_variance_score,max_error,mean_absolute_percentage_error,model
45,mrpc,0,9,Ridge(),0.305836,3.330985,0.307352,8.8286,0.281896,Ridge()
46,mrpc,1,9,Ridge(),0.121584,3.907439,0.24279,8.292103,0.331089,Ridge()
47,mrpc,2,9,Ridge(),0.281834,3.2015,0.316525,8.745685,0.283262,Ridge()
48,mrpc,3,9,Ridge(),0.287613,3.186917,0.290803,7.624804,0.273508,Ridge()
49,mrpc,4,9,Ridge(),0.261609,3.322623,0.295856,9.154041,0.301652,Ridge()


### Evaluate Results

In [40]:
from sklearn.impute import SimpleImputer

tqdm_pbar = lambda x, y: tqdm(x, leave=True, position=0, total=len(x), desc=f'{y}')
def get_cls_embeddings_for_dataset(dataset_name, dataset, 
                                   feature_names, model, tokenizer, 
                                   pbar_func=tqdm_pbar, device=device, CUT_SIZE=CUT_SIZE):
    collected_embeddings = defaultdict(list)

    for split, data in dataset.items():
        
        # pbar = pbar_func(data, f"{split} {dataset_name}") if pbar_func is not None else data
        for example in data:
            # Encode the input sentences
            if len(feature_names) == 2:
                encoded_inputs = tokenizer.encode(
                    example[feature_names[0]], 
                    example[feature_names[1]], 
                    truncation=True, 
                    return_tensors='pt'
                )
            else:
                encoded_inputs = tokenizer.encode(*list(map(lambda x: example[x] , feature_names)), 
                                              truncation=True, 
                                              return_tensors='pt')
            
            special_token_positions = (encoded_inputs[0] < 103).nonzero().squeeze()
            encoded_inputs = encoded_inputs.to(device)

            # print(f"Encoded inputs: {encoded_inputs}")
            # print(f"Special token positions: {special_token_positions}")
            
            # Forward pass through the model
            with torch.no_grad():
                outputs = model(encoded_inputs)
            # print(f'Outputs: {outputs}')

            # Get the embedding of the [CLS] token
            cls_embedding = outputs.last_hidden_state[:, 0, :]

            # Append the [CLS] embedding to the list
            collected_embeddings[split].append(cls_embedding)
         
    return collected_embeddings

def train_linear(X_train, y_train):
    classifier = LogisticRegression(solver='lbfgs', max_iter=3000)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_classifier(classifier, X, y=None):
    predictions = classifier.predict(X)
    return predictions

def get_metrics_report(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred,  average='weighted')
    print('Weighted F1', f1)
    print('Accuracy', accuracy)
    print('-------------------------------')
    return f1, accuracy

glue_classification = {'mrpc': ['sentence1', 'sentence2'], 'sst2':  ['sentence']}
superglue_classification = {'wic': ['sentence1', 'sentence2']}
all_classification = {'glue': glue_classification, 'superglue': superglue_classification}

glue_datasets = load_datasets('glue', list(glue_classification), 200)
superglue_datasets = load_datasets('super_glue', list(superglue_classification), 200)

all_datasets = {'glue': glue_datasets, 'superglue': superglue_datasets}


In [62]:
cols = ['dataset_name', 'dataset_id','first_layer', 'accurasy', 'f1']
results = pd.DataFrame([], columns = cols)
data = {i: '' for i in cols}
def get_layer_results(initial_model):
    initial_model = initial_model.to(device)
    for layer in range(11):
        layers = list(range(layer, 11))
        coefs = [final_coefs[i] for i in layers]
        if model == 'bert':
            custom_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, coefs, layers).to(device)
        else: 
            custom_model = GPT2WrapperLin(initial_model, LinearClassifierGPT2Attention, coefs,
                            layers).to(device)
        
    

def check_results(custom_model, initial_model):
    data = {}
    for dn, datasets in all_datasets.items():
        for dataset_name, dataset in datasets.items():
            print(f"{dn.upper()} / {dataset_name}\n")
            
            print('Original')
            dataset_embeddings_orig = get_cls_embeddings_for_dataset(
                dataset_name,
                dataset, 
                all_classification[dn][dataset_name], 
                initial_model, 
                tokenizer)
            
            train_dataset_embeddings = torch.cat(dataset_embeddings_orig['train'], dim=0)
            valid_dataset_embeddings = torch.cat(dataset_embeddings_orig['validation'], dim=0)
            test_dataset_embeddings = torch.cat(dataset_embeddings_orig['test'], dim=0)
            
            classif = train_linear(train_dataset_embeddings.cpu(), [el['label'] for el in dataset['train']])
            valid_preds = evaluate_classifier(classif, valid_dataset_embeddings.cpu())
            print('Validation evaluation:\n')
            a, f = get_metrics_report([el['label'] for el in dataset['validation']], valid_preds)
            
            
            
            # print('Linear:')
            
            # dataset_embeddings_custom = get_cls_embeddings_for_dataset(
            #     dataset_name,
            #     dataset, 
            #     all_classification[dn][dataset_name], 
            #     custom_model, 
            #     tokenizer)
            
            # train_dataset_embeddings = torch.cat(dataset_embeddings_custom['train'], dim=0)
            # valid_dataset_embeddings = torch.cat(dataset_embeddings_custom['validation'], dim=0)
            # test_dataset_embeddings = torch.cat(dataset_embeddings_custom['test'], dim=0)


            # classif = train_linear(train_dataset_embeddings.cpu(), [el['label'] for el in dataset['train']])
            # valid_preds = evaluate_classifier(classif, valid_dataset_embeddings.cpu())
            # print('Validation evaluation:\n')
            # f1, acc = get_metrics_report([el['label'] for el in dataset['validation']], valid_preds)
            
            # print('-------------------------------')    
    return [f1, acc]


In [None]:
data['dataset_name'] = dataset_name
for i in range(5):
    data['dataset_id'] = i
    for layer in tqdm(range(8, 11)):
        data['first_layer'] = layer
        coefs = {j : final_models[j][i].coef_ for j in range(layer, 12)}
        custom_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, coefs, [layer]).to(device)
        f, a = check_results(custom_model, initial_model)

  0%|                                                                                                                                                                                           | 0/3 [00:00<?, ?it/s]

GLUE / mrpc

Original
Validation evaluation:

Weighted F1 0.6956793959319341
Accuracy 0.735
-------------------------------
GLUE / sst2

Original


In [74]:
results[results['dataset_id'] == 0]

Unnamed: 0,dataset_name,dataset_id,first_layer,accurasy,f1
0,mrpc,0,0,,0.554766
1,mrpc,0,1,,0.524465
2,mrpc,0,2,,0.538382
3,mrpc,0,3,,0.522357
4,mrpc,0,4,,0.521585
5,mrpc,0,5,,0.524278
6,mrpc,0,6,,0.528868
7,mrpc,0,7,,0.503549
8,mrpc,0,8,,0.519616
9,mrpc,0,9,,0.609883


In [59]:
metrics[metrics['dataset_id'] == 0]

Unnamed: 0,dataset_name,dataset_id,layer,model_name,r2,mse,explained_variance_score,max_error,mean_absolute_percentage_error,model
0,mrpc,0,0,Ridge(),-0.002484,3.29493,0.035103,9.900027,0.349006,Ridge()
5,mrpc,0,1,Ridge(),0.022353,10.513316,0.036862,20.470954,0.530845,Ridge()
10,mrpc,0,2,Ridge(),-0.085884,53.482803,-0.04501,36.082396,8261.937744,Ridge()
15,mrpc,0,3,Ridge(),0.108799,7.7739,0.111246,16.661229,0.714129,Ridge()
20,mrpc,0,4,Ridge(),0.129181,5.322289,0.150011,12.567723,0.332041,Ridge()
25,mrpc,0,5,Ridge(),0.151008,5.724083,0.151073,12.979287,0.670714,Ridge()
30,mrpc,0,6,Ridge(),0.189055,5.510399,0.195094,14.413526,0.428811,Ridge()
35,mrpc,0,7,Ridge(),0.121687,6.908023,0.248233,10.070678,0.341354,Ridge()
40,mrpc,0,8,Ridge(),0.122915,4.856509,0.189245,7.38196,0.346971,Ridge()
45,mrpc,0,9,Ridge(),0.305836,3.330985,0.307352,8.8286,0.281896,Ridge()


In [63]:
custom_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, final_coefs, list(range(6, 12))).to(device)
# else: 
#     custom_model = GPT2WrapperLin(initial_model, LinearClassifierGPT2Attention, linear_layers,
                        # list(linear_layers.keys())).to(device)
check_results(custom_model, initial_model)

GLUE / mrpc

Original
Validation evaluation:

Weighted F1 0.6956793959319341
Accuracy 0.735
-------------------------------
GLUE / sst2

Original
Validation evaluation:

Weighted F1 0.7950051251281284
Accuracy 0.795
-------------------------------
SUPERGLUE / wic

Original
Validation evaluation:

Weighted F1 0.5980676328502416
Accuracy 0.6
-------------------------------


NameError: name 'f1' is not defined

In [73]:
results

Unnamed: 0,dataset_name,dataset_id,first_layer,accurasy,f1
0,mrpc,0,0,,0.554766
1,mrpc,0,1,,0.524465
2,mrpc,0,2,,0.538382
3,mrpc,0,3,,0.522357
4,mrpc,0,4,,0.521585
5,mrpc,0,5,,0.524278
6,mrpc,0,6,,0.528868
7,mrpc,0,7,,0.503549
8,mrpc,0,8,,0.519616
9,mrpc,0,9,,0.609883


In [131]:
custom_model = BertWrapperLin(initial_model, LinearClassifierBertAttention, final_coefs, list(range(10, 12))).to(device)
# else: 
#     custom_model = GPT2WrapperLin(initial_model, LinearClassifierGPT2Attention, linear_layers,
                        # list(linear_layers.keys())).to(device)
check_results(custom_model, initial_model)

GLUE / mrpc

Original
Validation evaluation:

Weighted F1 0.6956793959319341
Accuracy 0.735
-------------------------------
Linear:
Validation evaluation:

Weighted F1 0.6650466200466201
Accuracy 0.71
-------------------------------
-------------------------------
GLUE / sst2

Original
Validation evaluation:

Weighted F1 0.7950051251281284
Accuracy 0.795
-------------------------------
Linear:
Validation evaluation:

Weighted F1 0.8049268585431721
Accuracy 0.805
-------------------------------
-------------------------------
SUPERGLUE / wic

Original
Validation evaluation:

Weighted F1 0.5980676328502416
Accuracy 0.6
-------------------------------
Linear:
Validation evaluation:

Weighted F1 0.5344758135223827
Accuracy 0.535
-------------------------------
-------------------------------


In [202]:
import time
rte = load_dataset('glue', 'qnli')
long_dataset = []
cols = ['question', 'sentence']
for i in rte['train']:
    if sum([len(i[t]) for t in cols]) > 900:
        long_dataset.append(i)
for i in rte['test']:
    if sum([len(i[t]) for t in cols]) > 900:
        long_dataset.append(i)
for i in rte['validation']:
    if sum([len(i[t]) for t in cols]) > 900:
        long_dataset.append(i)

l = round(len(long_dataset)*0.9)
print(f'Dataset size: {len(long_dataset)}') 
l_d = {'train': long_dataset[:l], 'validation':long_dataset[l:]}
print('Original:')
start_time = time.time()
dataset_embeddings_orig = get_cls_embeddings_for_dataset(
                'long_dataset',
                l_d, 
                cols, 
                initial_model, 
                tokenizer)

print(f'Original time: {time.time()- start_time }')

train_dataset_embeddings = torch.cat(dataset_embeddings_orig['train'], dim=0)
valid_dataset_embeddings = torch.cat(dataset_embeddings_orig['validation'], dim=0)
            
classif = train_linear(train_dataset_embeddings.cpu(), [el['label'] for el in l_d['train']])
valid_preds = evaluate_classifier(classif, valid_dataset_embeddings.cpu())
print('Validation evaluation:\n')
get_metrics_report([el['label'] for el in l_d['validation']], valid_preds)
            # print(train_dataset_embeddings.shape)

            
print('Linear:')
start_time = time.time()    
dataset_embeddings_custom = get_cls_embeddings_for_dataset(
                'long_dataset',
                l_d, 
                cols, 
                custom_model, 
                tokenizer)
print(f'Original time: {time.time()- start_time }')
            
train_dataset_embeddings = torch.cat(dataset_embeddings_custom['train'], dim=0)
valid_dataset_embeddings = torch.cat(dataset_embeddings_custom['validation'], dim=0)


classif = train_linear(train_dataset_embeddings.cpu(), [el['label'] for el in l_d['train']])
valid_preds = evaluate_classifier(classif, valid_dataset_embeddings.cpu())
print('Validation evaluation:\n')
get_metrics_report([el['label'] for el in l_d['validation']], valid_preds)
print('-------------------------------')

Dataset size: 30
Original:
Original time: 4.594026327133179
Validation evaluation:

Weighted F1 0.16666666666666666
Accuracy 0.3333333333333333
-------------------------------
Linear:
Original time: 4.907689571380615
Validation evaluation:

Weighted F1 0.16666666666666666
Accuracy 0.3333333333333333
-------------------------------
-------------------------------


In [None]:
llangnickel
/
long-covid-classification-data 

In [217]:
data = load_dataset('THUDM/LongBench', "trec_e")

Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [220]:
long_data_1 = load_dataset('llangnickel/long-covid-classification-data')
long_data_2 = load_dataset('THUDM/LongBench', "trec_e")

Downloading readme:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/674k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [224]:
l = 0
for i in data['train']:
    l+=len(i['text'])
print(l/len(data['train']))

1620.306763285024
