In [1]:
%%html
<style type='text/css'>
.CodeMirror{
    font-size: 16px;
    font-family: Monaco;
}

div.output_area pre {
    font-size: 12px;
}
</style>

In [2]:
import os
os.chdir(os.getcwd()+"./..")

In [3]:
import pandas as pd
import numpy as np

In [4]:
from torch.utils.data import Dataset,DataLoader
import torch

In [5]:
qnli_model_path = r"E:\MyFiles\WorkSpace\BertModels\QNLI\albert-base-v2-fine-tuned-qnli-sample"

In [6]:
def read_qnli_data(file_name,data_dir=r"E:\MyFiles\WorkSpace\Data\QNLIv2\QNLI"):
    path = os.path.join(data_dir,file_name)
    with open(path,encoding = 'utf-8-sig') as f:
        text = f.readlines()

    header = text[0].strip().split("\t")
    lines = [line.strip().split("\t") for line in text[1:]]

    df = pd.DataFrame(lines,columns = header)
    return df

In [7]:
qnli_train_df = read_qnli_data("train.tsv")
qnli_dev_df = read_qnli_data("dev.tsv")

In [8]:
qnli_train_df['label'] = np.where(qnli_train_df['label']=='entailment',1,0)
qnli_dev_df['label'] = np.where(qnli_dev_df['label']=='entailment',1,0)

In [9]:
qnli_train_df.head(9)

Unnamed: 0,index,question,sentence,label
0,0,When did the third Digimon series begin?,Unlike the two seasons before it and most of t...,0
1,1,Which missile batteries often have individual ...,"When MANPADS is operated by specialists, batte...",0
2,2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,1
3,3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",1
4,4,What famous palace is located in London?,London contains four World Heritage Sites: the...,0
5,5,When is the term 'German dialects' used in reg...,"When talking about the German language, the te...",1
6,6,What was the name of the island the English tr...,"At the end of the Second Anglo-Dutch War, the ...",1
7,7,How were the Portuguese expelled from Myanmar?,"From the 1720s onward, the kingdom was beset w...",0
8,8,What does the word 'customer' properly apply to?,The bill also required rotation of principal m...,1


In [10]:
example_id = 10

question= qnli_train_df['question'].iloc[example_id]
sentence_list = qnli_train_df['sentence'].iloc[:example_id].to_list()
label_list = qnli_train_df['label'].iloc[:example_id].to_list()

In [11]:
question= qnli_train_df['question'].iloc[example_id]
sentence_list = qnli_train_df['sentence'].iloc[:example_id].to_list()
label_list = qnli_train_df['label'].iloc[:example_id].to_list()

In [12]:
class TextPairsDataset(Dataset):
    
    def __init__(self,question_list,sentence_list,tokenizer,max_length,device):
        self.len = len(question_list)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device
        self.question_list = question_list
        self.sentence_list = sentence_list

    def __len__(self):
        return self.len
    
    def __getitem__(self,index):
        question = self.question_list[index]
        sentence = self.sentence_list[index]
        
        inputs = self.tokenizer(
            text = question,
            text_pair = sentence,
            
            max_length = self.max_length,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = "only_second",
            )
        
        inputs= {
            'input_ids':torch.tensor(inputs['input_ids']),
            'attention_mask':torch.tensor(inputs['attention_mask']),
        }

        inputs = {k:v.to(self.device) for (k,v) in inputs.items()}

        return inputs

In [13]:
from transformers import AutoTokenizer,AlbertForSequenceClassification

In [14]:
class model_config():
    model_path = r"E:\MyFiles\WorkSpace\BertModels\QNLI\albert-base-v2-fine-tuned-qnli-sample"
    max_length = 512
    batch_size = 12

In [15]:
class SentenceSelectionModelLoder(object):

    def __init__(self, model_config):

        from transformers import AlbertForSequenceClassification as model_structure

        self.device = device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = model_structure.from_pretrained(
            model_config.model_path).to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_config.model_path)

        self.max_length = model_config.max_length

        self.batch_size = model_config.batch_size

    def prediction_from_loader(self, model, data_loader):

        pred_list = []
        prob_list = []

        model.eval()

        for sample in data_loader:

            with torch.no_grad():
                outputs = model(**sample)

                logits = outputs.logits
                probs = torch.sigmoid(logits)

                pred = torch.argmax(logits, axis=1)
                pred = pred.detach().cpu().numpy()
                pred_list.extend(pred)

                prob = probs[:, 1]
                prob = prob.detach().cpu().numpy()
                prob_list.extend(prob)

        return {'pred': pred_list, 'prob': prob_list}

    def predict(self, question, sentence_list):

        if isinstance(question, str):
            question_list = [question] * len(sentence_list)
        elif isinstance(question, list):
            question_list = question

        predict_datasets = TextPairsDataset(
            question_list, sentence_list, self.tokenizer, self.max_length, self.device)

        predict_datasets_loader = DataLoader(
            predict_datasets, batch_size=self.batch_size, shuffle=False)

        return self.prediction_from_loader(self.model, predict_datasets_loader)

In [16]:
# sentence_selection_model  = SentenceSelectionModelLoder(model_config  = model_config)

In [17]:
from transformers import AlbertForSequenceClassification as model_structure

In [18]:
device = device = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
model  = model_structure.from_pretrained(model_config.model_path).to(device)

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_config.model_path)

In [21]:
example_id = 10
question_list= qnli_train_df['question'].iloc[:example_id].to_list()
sentence_list = qnli_train_df['sentence'].iloc[:example_id].to_list()
label_list = qnli_train_df['label'].iloc[:example_id].to_list()

In [22]:
question_list

['When did the third Digimon series begin?',
 'Which missile batteries often have individual launchers several kilometres from one another?',
 "What two things does Popper argue Tarski's theory involves in an evaluation of truth?",
 'What is the name of the village 9 miles north of Calafat where the Ottoman forces attacked the Russians?',
 'What famous palace is located in London?',
 "When is the term 'German dialects' used in regard to the German language?",
 'What was the name of the island the English traded to the Dutch in return for New Amsterdam?',
 'How were the Portuguese expelled from Myanmar?',
 "What does the word 'customer' properly apply to?",
 'What did Arsenal consider the yellow and blue colors to be after losing a FA Cup final wearing red and white?']

In [23]:
predict_datasets = TextPairsDataset(question_list,sentence_list,tokenizer,512,device)

In [24]:
sample = predict_datasets[:10]

In [25]:
with torch.no_grad():
        pred_list = []
        prob_list = []
        
#         model.eval()
        
        outputs = model(**sample)

        logits = outputs.logits
        probs = torch.sigmoid(logits)

        pred = torch.argmax(logits,axis = 1)
        pred = pred.detach().cpu().numpy()

        prob = probs[:,1]
        prob = prob.detach().cpu().numpy()

In [26]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.1512, -2.1581],
        [ 0.7795, -1.6949],
        [-1.0292,  0.7467],
        [-2.3284,  1.9095],
        [-1.6990,  1.2644],
        [ 0.7529, -1.5850],
        [-1.6217,  1.5169],
        [ 0.8757, -1.6522],
        [-2.0132,  1.6940],
        [ 0.5792, -1.5447]], device='cuda:0'), hidden_states=None, attentions=None)

In [27]:
label_list

[0, 0, 1, 1, 0, 1, 1, 0, 1, 1]

In [28]:
pred

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 0], dtype=int64)

In [29]:
prob

array([0.10358039, 0.15513778, 0.6784646 , 0.8709625 , 0.7797845 ,
       0.17008494, 0.82007486, 0.16081364, 0.8447452 , 0.17585221],
      dtype=float32)

In [30]:
sentence_selection_model  = SentenceSelectionModelLoder(model_config  = model_config)

In [31]:
question_list[0]

'When did the third Digimon series begin?'

In [32]:
sentence_selection_model.predict(question_list,sentence_list)

{'pred': [0, 0, 1, 1, 1, 0, 1, 0, 1, 0],
 'prob': [0.10358039,
  0.15513778,
  0.6784646,
  0.8709625,
  0.7797845,
  0.17008494,
  0.82007486,
  0.16081364,
  0.8447452,
  0.17585221]}

In [33]:
sentence_selection_model.predict(question_list[2],sentence_list)

{'pred': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 'prob': [0.12281186,
  0.1328893,
  0.6784646,
  0.10919345,
  0.35432395,
  0.12000906,
  0.12984481,
  0.34810072,
  0.1259087,
  0.10937402]}