In [1]:
%%html
<style type='text/css'>
.CodeMirror{
    font-size: 16px;
    font-family: Monaco;
}

div.output_area pre {
    font-size: 12px;
}
</style>

In [2]:
import os
# Step up to the parent of the current working directory. This is presumably the
# project root that the relative "./models/..." paths and `project_modules`
# import below are resolved against - confirm.
# os.path.join is used instead of string concatenation: `cwd + "./.."` produces
# "<cwd>./..", which only resolves on Windows (trailing dots are stripped there).
# NOTE: re-running this cell moves up one more level each time.
os.chdir(os.path.join(os.getcwd(), os.pardir))

In [3]:
# import libraries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

### load model

In [19]:
class document_selection_model_config:
    """Static settings for the document-selection model used in this notebook."""

    # Directory holding the saved model; also handed to AutoTokenizer.from_pretrained.
    model_dir = r"./models/twin-albert-for-long-text-pair-classification/"
    # Pickled model file inside model_dir; read with torch.load.
    model_path = model_dir + "pytorch_model.bin"

    # Tokenised window length, batch size and chunk overlap (in tokens).
    # NOTE(review): the helper functions visible in this notebook hard-code
    # 512 and 32 rather than reading these attributes - confirm they stay in sync.
    max_length = 512
    batch_size = 12
    doc_overlap_length = 32

In [15]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
document_selection_model_config.model_path

'./models/twin-albert-for-long-text-pair-classification/pytorch_model.bin'

In [9]:
from transformers import AlbertModel

from transformers.modeling_outputs import SequenceClassifierOutput

class TwinAlBerts(torch.nn.Module):
    """Two ALBERT encoders whose pooled outputs are concatenated and classified.

    Each token input is encoded by its own ``AlbertModel``. The two pooled
    vectors are concatenated, projected back to the encoder hidden size,
    passed through dropout and mapped to ``num_class`` logits.

    Args:
        model_config: object exposing ``model_name`` (checkpoint identifier
            given to ``AlbertModel.from_pretrained``) and ``num_class``
            (number of output logits).
    """

    def __init__(self, model_config):
        super().__init__()

        self.albert_layer_1 = AlbertModel.from_pretrained(model_config.model_name)
        self.albert_layer_2 = AlbertModel.from_pretrained(model_config.model_name)

        # Both encoders come from the same checkpoint, so one hidden size covers
        # both. Reading it from the config (instead of assuming 768) lets the
        # head match larger or smaller ALBERT variants as well.
        hidden_size = self.albert_layer_1.config.hidden_size

        self.pre_classifier = torch.nn.Linear(hidden_size * 2, hidden_size)

        self.dropout = torch.nn.Dropout(0.3)

        # The misspelt attribute name `classifer` is kept on purpose: parameter
        # keys are derived from attribute names, so renaming it would stop
        # previously saved weights from matching.
        self.classifer = torch.nn.Linear(hidden_size, model_config.num_class)

        self.loss_fct = torch.nn.CrossEntropyLoss()

    def forward(self, token_inputs_1, token_inputs_2, labels=None):
        """Classify a pair of tokenised inputs.

        Args:
            token_inputs_1: mapping of keyword arguments unpacked into the
                first encoder.
            token_inputs_2: mapping of keyword arguments unpacked into the
                second encoder.
            labels: optional targets; when given, a cross-entropy loss
                between the logits and ``labels`` is returned as well.

        Returns:
            SequenceClassifierOutput with ``logits`` and ``loss`` (``None``
            when ``labels`` is ``None``); ``hidden_states`` and ``attentions``
            are always ``None``.
        """
        pooled_1 = self.albert_layer_1(**token_inputs_1).pooler_output
        pooled_2 = self.albert_layer_2(**token_inputs_2).pooler_output

        # Join the two pooled vectors along the feature dimension.
        features = torch.cat([pooled_1, pooled_2], dim=1)
        features = self.pre_classifier(features)
        features = self.dropout(features)

        logits = self.classifer(features)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )

In [10]:
# SECURITY: torch.load unpickles the stored object, which can execute arbitrary
# code - only load checkpoint files from a source you trust.
# map_location places the stored tensors on `device` while loading, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
# NOTE: recent PyTorch releases default to weights_only=True, which rejects a
# pickled full module; pass weights_only=False there if loading fails.
model = torch.load(document_selection_model_config.model_path, map_location=device)

### load data

In [17]:
from transformers import AutoTokenizer

In [20]:
# Load the tokenizer from the same directory that holds the saved model.
tokenizer = AutoTokenizer.from_pretrained(document_selection_model_config.model_dir)

In [12]:
from project_modules.utils import get_squad_v2_pandas_dataframe

In [13]:
# Project helper returning the SQuAD v2 train and dev tables (presumably pandas
# DataFrames, judging by the helper name and the .head() call below - confirm).
squad_train_df, squad_dev_df = get_squad_v2_pandas_dataframe()

In [14]:
squad_train_df.head()

Unnamed: 0,id,title,context,question,answer_text,answer_start,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False


In [27]:
# Demo query: a single question that will be paired with every candidate context.
question = "When did Beyonce start becoming popular?"
# Contexts of the first 10 training rows, as a plain Python list.
context_list = squad_train_df.head(10)['context'].to_list()

In [29]:
import pandas as pd

In [33]:
# Start from an empty frame; its columns are attached in the following cells.
df = pd.DataFrame()

In [35]:
# One row per candidate context.
df['context'] = context_list

In [37]:
# Scalar assignment: pandas repeats the single question string on every existing
# row, so this must run after the 'context' column has been filled.
df['question'] = question

In [38]:
df.head()

Unnamed: 0,question,context
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...


In [40]:
def get_token(question, context, max_length=512):
    """Encode a question/context pair as one fixed-length input.

    Uses the notebook-level ``tokenizer``. The pair is padded to exactly
    ``max_length`` tokens; when it is too long, only the context (the second
    sequence) is truncated, so the question is always kept whole.

    Args:
        question: first sequence of the pair.
        context: second sequence of the pair.
        max_length: total length of the encoded pair. Defaults to 512, the
            value that used to be hard-coded here.

    Returns:
        The tokenizer output for the pair, including token type ids. No
        ``return_tensors`` is requested, so the tokenizer's default
        container types are returned.
    """
    return tokenizer(
        text=question,
        text_pair=context,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        return_token_type_ids=True,
        truncation="only_second",
    )

In [41]:
def get_raw_token(question, context):
    """Encode a question/context pair at its natural length, with offsets.

    Unlike ``get_token``, nothing is padded or truncated here, and the
    tokenizer is asked for the offset mapping of every token. Callers use the
    result to measure the pair and to find where to cut an over-long context.

    Args:
        question: first sequence of the pair.
        context: second sequence of the pair.

    Returns:
        The output of the notebook-level ``tokenizer`` for the pair.
    """
    # Every option that would shorten or pad the sequence is switched off.
    encode_options = dict(
        add_special_tokens=True,
        max_length=None,
        padding=False,
        return_token_type_ids=True,
        truncation=False,
        return_offsets_mapping=True,
    )
    return tokenizer(text=question, text_pair=context, **encode_options)

In [42]:
def split_long_token(question, context, raw_token, doc_overlap_length=32):
    """Cut an over-long context into two overlapping pieces and encode both.

    The cut points are taken from ``raw_token['offset_mapping']``, whose
    entries are indexed as ``(start, end)`` pairs and are assumed to be
    character offsets into ``context`` - TODO confirm for tokens that belong
    to the question or are special tokens.

    Args:
        question: question paired with each piece.
        context: full context string to split.
        raw_token: untruncated encoding of the pair; must provide an
            ``'offset_mapping'`` sequence with more than 511 entries.
        doc_overlap_length: how many token positions before index 511 the
            second piece starts at.

    Returns:
        Tuple ``(inputs_1, inputs_2)``: ``get_token`` output for the first
        and for the second piece.
    """
    last_window_index = 511  # final token position of a 512-token window
    offsets = raw_token['offset_mapping']

    # First piece: everything up to one character short of the end of the
    # token at position 511. get_token truncates again, so this is a pre-trim.
    first_context_end_pos = offsets[last_window_index][1] - 1
    context_1 = context[:first_context_end_pos]

    # Second piece: starts one character before the token that sits
    # doc_overlap_length positions earlier. Clamp at 0: an offset start of 0
    # would otherwise give -1, and context[-1:] is only the last character.
    second_char_start_pos = max(offsets[last_window_index - doc_overlap_length][0] - 1, 0)
    context_2 = context[second_char_start_pos:]

    return get_token(question, context_1), get_token(question, context_2)

In [43]:
def prepare_feature(example, doc_overlap_length=32):
    """Build the two encoded inputs for one question/context row.

    The pair is first encoded without truncation to measure its length. If it
    has at most 512 token ids, the same encoding is used for both inputs;
    otherwise the context is split into two overlapping pieces.

    Args:
        example: mapping (for instance a DataFrame row) with ``'context'``
            and ``'question'`` entries.
        doc_overlap_length: overlap forwarded to ``split_long_token`` for
            long pairs. Defaults to 32, the value previously hard-coded here.

    Returns:
        Tuple ``(token_inputs_1, token_inputs_2)``.
    """
    context = example['context']
    question = example['question']

    # Untruncated encoding: used for the length check and for its offsets.
    raw_token = get_raw_token(question, context)

    if len(raw_token['input_ids']) <= 512:
        # Fits in one window: both inputs receive the same encoding.
        return duplicate_token(question, context)

    return split_long_token(
        question, context, raw_token, doc_overlap_length=doc_overlap_length
    )

In [45]:
def duplicate_token(question, context):
    """Encode the pair once and return that encoding for both inputs.

    Both tuple elements refer to the same object; the pair is not encoded
    twice and no copy is made.
    """
    return (get_token(question, context),) * 2

In [51]:
# Encode every (question, context) row; each result is a (left, right) pair.
token_pairs_list = df.apply(prepare_feature, axis=1)

# Separate the pairs into one list per model input ...
tokens_left = [left for left, _right in token_pairs_list]
tokens_right = [right for _left, right in token_pairs_list]

# ... and turn each list into a table with one row per example.
tokens_left_df = pd.DataFrame(tokens_left)
tokens_right_df = pd.DataFrame(tokens_right)

In [53]:
class LongTextPairDataSet(Dataset):
    """Dataset over two aligned tables of tokenised inputs, with optional labels.

    Row ``i`` of ``df_pair_1`` and row ``i`` of ``df_pair_2`` form one example.
    Indexing with an integer yields a single example; indexing with a slice
    yields all selected rows stacked into batched tensors. Tensors are created
    on ``device`` at access time.

    Args:
        df_pair_1: table for the first model input, one column per field.
        df_pair_2: table for the second model input, aligned with ``df_pair_1``.
        label_list: optional labels, indexable the same way as the tables.
        device: device the returned tensors are moved to.
    """

    def __init__(self, df_pair_1, df_pair_2, label_list=None, device="cpu"):
        self.df_pair_1 = df_pair_1
        self.df_pair_2 = df_pair_2
        self.label_list = label_list
        self.device = device
        self.len = len(df_pair_1)

    def _as_tensors(self, rows, single_row):
        """Convert one selected row (or block of rows) to a dict of tensors."""
        columns = rows.to_dict() if single_row else rows.to_dict(orient="list")
        return {
            name: torch.tensor(values).to(self.device)
            for name, values in columns.items()
        }

    def __getitem__(self, index):
        rows_1 = self.df_pair_1.iloc[index]
        rows_2 = self.df_pair_2.iloc[index]
        has_labels = self.label_list is not None
        if has_labels:
            labels = self.label_list[index]

        # An integer index gives a Series (one example); a slice gives a
        # DataFrame (a batch). The first table decides for both.
        single_row = isinstance(rows_1, pd.Series)

        item = {
            "token_inputs_1": self._as_tensors(rows_1, single_row),
            "token_inputs_2": self._as_tensors(rows_2, single_row),
        }
        if has_labels:
            item["labels"] = torch.tensor(labels).to(self.device)
        return item

    def __len__(self):
        return self.len

In [58]:
# No labels are passed, so each item holds only the two token-input dicts;
# their tensors are created on `device`.
pred_dataset = LongTextPairDataSet(tokens_left_df,tokens_right_df,device = device)

In [59]:
pred_dataset[:4]

{'token_inputs_1': {'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
  'input_ids': tensor([[  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   0,   0]], device='cuda:0'),
  'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')},
 'token_inputs_2': {'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
  'input_ids': tensor([[  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   0,   0],
          [  2,  76, 144,  ...,   0,   

In [61]:
# Slice indexing returns one dict whose tensors stack the first 6 examples.
sample = pred_dataset[:6]

In [62]:
# Move the model to the selected device. Rebinding `model` suppresses the cell
# output just like assigning to `_` did, without overwriting IPython's
# last-output variable `_`.
model = model.to(device)

In [63]:
# Put the model in evaluation mode so its dropout layer is inactive;
# torch.no_grad() only turns off gradient tracking and does not do that.
model.eval()
with torch.no_grad():
    # `sample` holds the two token-input dicts, unpacked as keyword arguments.
    outputs = model(**sample)

In [64]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.0678,  2.8800],
        [-3.0678,  2.8800],
        [-3.0678,  2.8800],
        [-3.0678,  2.8800],
        [-3.0678,  2.8800],
        [-3.0678,  2.8800]], device='cuda:0'), hidden_states=None, attentions=None)