In [4]:
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AlbertForQuestionAnswering
import collections
import math
import json
import pandas as pd
data = pd.read_csv("es_dev.csv")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cls_modelPath = "./cls_model"
mrc_modelPath = "./model4"

tokenizer = AlbertTokenizer.from_pretrained(mrc_modelPath)
cls_model = AlbertForSequenceClassification.from_pretrained(cls_modelPath).to(device)
cls_model.eval()
mrc_model = AlbertForQuestionAnswering.from_pretrained(mrc_modelPath).to(device)
mrc_model.eval()

Some weights of the model checkpoint at ./model4 were not used when initializing AlbertForQuestionAnswering: ['has_ans.1.bias', 'has_ans.1.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=4096, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((4096,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=4096, out_features=4096, bias=True)
                (key): Linear(in_features=4096, out_features=4096, bias=True)
                (value): Linear(in_features=4096, out_features=40

In [5]:
def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes

def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []

    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs

def get_qa_nbest(input_ids, start_logits, end_logits, seq_len, n_best_size=20, max_answer_length=30):
    score_null = 1000000  # large and positive
    prelim_predictions = []
    null_start_logit = 0  # the start logit at the slice with min null score
    null_end_logit = 0  # the end logit at the slice with min null score
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["start_index", "end_index", "start_logit", "end_logit"])
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])
    
    start_indexes = _get_best_indexes(start_logits, n_best_size)
    end_indexes = _get_best_indexes(end_logits, n_best_size)

    feature_null_score = start_logits[0] + end_logits[0]
    if feature_null_score < score_null:
        score_null = feature_null_score
    for start_index in start_indexes:
        for end_index in end_indexes:
            if end_index < start_index:
                continue
            length = end_index - start_index + 1
            if length > max_answer_length:
                continue
            if start_index >= seq_len:
                        continue
            if end_index >= seq_len:
                continue
            prelim_predictions.append(
                _PrelimPrediction(
                    start_index=start_index,
                    end_index=end_index,
                    start_logit=start_logits[start_index],
                    end_logit=end_logits[end_index]))
    prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.start_logit),
            reverse=True)
    
    seen_predictions = {}
    nbest = []
    for pred in prelim_predictions:
        if len(nbest) >= n_best_size:
            break
        
        if pred.start_index > 0:  # this is a non-null prediction\
            predict_answer_tokens = input_ids[0, pred.start_index: (pred.end_index + 1)]
            final_text = tokenizer.decode(predict_answer_tokens)
            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
        else:
            final_text = ""
            seen_predictions[final_text] = True
        

        nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))
    if "" not in seen_predictions:
        nbest.append(
            _NbestPrediction(
                text="",
                start_logit=null_start_logit,
                end_logit=null_end_logit))

            # In very rare edge cases we could only have single null prediction.
            # So we just create a nonce prediction in this case to avoid failure.
    if len(nbest) == 1:
        nbest.insert(0,
            _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

    # In very rare edge cases we could have no valid predictions. So we
    # just create a nonce prediction in this case to avoid failure.
    if not nbest:
        nbest.append(
            _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))


    total_scores = []
    best_non_null_entry = None
    for entry in nbest:
        total_scores.append(entry.start_logit + entry.end_logit)
        if not best_non_null_entry:
            if entry.text:
                best_non_null_entry = entry

    probs = _compute_softmax(total_scores)
    nbest_json = []
    for (i, entry) in enumerate(nbest):
        output = collections.OrderedDict()
        output["text"] = entry.text
        output["probability"] = probs[i]
        output["start_logit"] = entry.start_logit
        output["end_logit"] = entry.end_logit
        nbest_json.append(output)

    score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
    
    return nbest_json, score_diff

def inference(context, question, reference):
    inputs = tokenizer(
        question,
        context,
        add_special_tokens=True,
        pad_to_max_length=True,
        max_length=512,
        return_tensors="pt"
    )
    
    # Move inputs to the GPU
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    try:
        seq_len = inputs['input_ids'][0].tolist().index(0)
    except ValueError:
        seq_len = len(inputs['input_ids'][0].tolist())  # Use full length if padding token not found
    
    with torch.no_grad():
        cls_outputs = cls_model(**inputs)
        qa_outputs = mrc_model(**inputs)

    cls_logits = cls_outputs.logits[0]
    cls_divide = cls_logits[1] - cls_logits[0]

    # Get n-best predictions and score difference
    nbest, score_diff = get_qa_nbest(
        inputs['input_ids'], qa_outputs.start_logits[0], qa_outputs.end_logits[0], seq_len=seq_len
    )

    thresh = -1.246073067188263

    print(cls_divide, score_diff)

    # Calculate NA score
    na_score = (0.5 * cls_divide + 0.5 * score_diff) * 0.5

    # Determine final answer based on threshold
    if na_score > thresh:
        final_answer = f"<No Answer>. The question is not answerable according to the context."
    else:
        final_answer = nbest[0]["text"]
    return final_answer

In [6]:
def save(data, fname):
    with open(fname, 'w') as convert_file: 
     convert_file.write(json.dumps(data))

In [12]:
try:
    with open('P-U-retro-es-dev.json', 'r') as file:
        final = json.load(file)
except:
    final = {}
counter = 0
from tqdm import tqdm
fh = open('output.txt', 'w')
for index, row in tqdm(data.iterrows(),total=data.shape[0], file=fh):
    id = row["Question_Id"]
    if id not in final:
        counter = 1
        final[row["Question_Id"]] = []
        final[row["Question_Id"]].append(inference(row["Original_Passage"], row["Modified_Question"],""))
        save(final,'P-U-retro-es-dev.json')
    else:
        shape = (data.loc[data['Question_Id'] == id]).shape[0]
        print(shape)
        print(len(final[row["Question_Id"]]))
        if len(final[row["Question_Id"]]) < shape:
            if (counter) < len(final[row["Question_Id"]]):
                counter+=1
            else:
                final[row["Question_Id"]].append(inference(row["Original_Passage"], row["Modified_Question"],""))
                save(final,'P-U-retro-es-dev.json')

  0%|                                                                                        | 0/11399 [00:00<?, ?it/s]

2
2
2
2
1
1


  0%|                                                                             | 4/11399 [00:20<15:58:57,  5.05s/it]

tensor(10.1323) tensor(11.2401)


  0%|                                                                             | 5/11399 [00:40<28:36:17,  9.04s/it]

tensor(10.4951) tensor(11.0171)


  0%|                                                                             | 6/11399 [01:01<39:09:27, 12.37s/it]

tensor(10.2989) tensor(11.3047)
2
1


  0%|                                                                             | 7/11399 [01:23<47:15:56, 14.94s/it]

tensor(8.8734) tensor(11.4262)


  0%|                                                                             | 8/11399 [01:44<52:38:19, 16.64s/it]

tensor(10.1247) tensor(10.1554)


  0%|                                                                             | 8/11399 [01:46<42:18:09, 13.37s/it]


KeyboardInterrupt: 

In [5]:
data = pd.read_csv("es_train.csv")
try:
    with open('P-U-retro-es-train.json', 'r') as file:
        final = json.load(file)
except:
    final = {}
from tqdm import tqdm
fh = open('output.txt', 'w')
for index, row in tqdm(data.iterrows(),total=data.shape[0], file=fh):
    id = row["Question_Id"]
    if id not in final:
        counter = 1
        final[row["Question_Id"]] = []
        final[row["Question_Id"]].append(inference(row["Original_Passage"], row["Modified_Question"],""))
        save(final,'P-U-retro-es-train.json')
    else:
        shape = (data.loc[data['Question_Id'] == id]).shape[0]
        print(shape)
        print(len(final[row["Question_Id"]]))
        if len(final[row["Question_Id"]]) < shape:
            if (counter) < len(final[row["Question_Id"]]):
                counter+=1
            else:
                final[row["Question_Id"]].append(inference(row["Original_Passage"], row["Modified_Question"],""))
                save(final,'P-U-retro-es-train.json')

  0%|                                                                            | 1/86821 [00:18<456:30:58, 18.93s/it]

tensor(-3.9935) tensor(9.0278)


  0%|                                                                            | 1/86821 [00:22<553:11:08, 22.94s/it]


KeyboardInterrupt: 