### Loading of data sources

In [1]:
def load_context(file_path, encoding="utf-8"):
    """Read a text file and return its content as a single newline-free string.

    Lines are concatenated directly, with no separator inserted between them,
    so the last word of one line is glued to the first word of the next.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The file content with every newline character removed.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        filedata = file.read()
    # Equivalent to splitting on newlines and joining the pieces with ''.
    return filedata.replace('\n', '')

In [2]:
# Load each .mmd source document as one newline-free string (see load_context).
# The three Sinch documents are used as contexts by the question cells below.
sinch_node_red = load_context("mmd/sinch_doc_node_red.mmd")
sinch_webhook = load_context("mmd/sinch_doc_how_to_webhook.mmd")
sinch_overview = load_context("mmd/sinch_doc_overview.mmd")
nougat_context = load_context("mmd/nougat.mmd")

### Load the fine-tuned model

In [4]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import os
import numpy as np
import pandas as pd

# The tokenizer is loaded by the identifier of the base DeBERTa-v3 checkpoint,
# while the question-answering weights are read from a relative output
# directory (presumably the SQuAD 2.0 fine-tuning run, judging by the path --
# TODO confirm). Both names are used as module-level globals by inference().
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
model = AutoModelForQuestionAnswering.from_pretrained("../../extractive_qa/squad_experiments/code/outputs/squad2_microsoft/deberta-v3-base/")



In [5]:
def inference(question, context, n_best=10, max_answer_length=30, no_answer_threshold=-2):
    """Extract the best answer span for ``question`` from ``context``.

    The question/context pair is tokenized into overlapping windows of at most
    512 tokens (stride 128, only the context is truncated). Each window is run
    through the module-level ``model``; the ``n_best`` highest start logits and
    end logits are combined into candidate spans, and the candidate with the
    highest summed logit over all windows is returned.

    Args:
        question: Question text.
        context: Text in which the answer is searched.
        n_best: Number of top start positions and top end positions that are
            combined per window.
        max_answer_length: Maximum span length in tokens; longer spans are
            discarded.
        no_answer_threshold: A candidate is kept only when its
            ``"no_answer_probability"`` value is strictly below this value.

    Returns:
        A dict with the keys ``"text"``, ``"logit_score"`` (start logit plus
        end logit of the span) and ``"no_answer_probability"``. Despite its
        name, the latter is not a probability: it is the null score (start
        plus end logit of the first token) minus the span's ``logit_score``.
        When no candidate survives the filters, the dict
        ``{"text": "No answer found", "logit_score": 0.0,
        "no_answer_probability": 0.0}`` is returned.
    """
    inputs = tokenizer.encode_plus(
        question,
        context,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    input_ids = inputs.pop("input_ids")  # Token ids, one list per window
    offsets = inputs.pop("offset_mapping")  # Character offsets of each token in the original text
    attention_mask = inputs.pop("attention_mask")  # Mask so pad tokens are not considered as input

    answers = []
    for i in range(len(input_ids)):
        input_id = torch.tensor(input_ids[i]).unsqueeze(0)
        attention_mask_ = torch.tensor(attention_mask[i]).unsqueeze(0)
        with torch.no_grad():
            outputs = model(input_id, attention_mask=attention_mask_)

        start_logits = outputs.start_logits.cpu().detach().numpy()[0]
        end_logits = outputs.end_logits.cpu().detach().numpy()[0]

        # Keep offsets only for context tokens (sequence id 1); question
        # tokens (sequence id 0) and everything else are masked with None.
        sequence_ids = inputs.sequence_ids(i)
        offset = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offsets[i])
        ]

        # Indexes of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best - 1 : -1].tolist()

        # The null-answer score is the sum of the start and end logit of the
        # first token; it is constant for the whole window.
        null_score = start_logits[0] + end_logits[0]

        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offset[start_index] is None or offset[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length
                if (end_index < start_index) or (end_index - start_index + 1 > max_answer_length):
                    continue
                span_score = start_logits[start_index] + end_logits[end_index]
                answer = {
                    # The answer is the text between the start and end character offsets
                    "text": context[offset[start_index][0] : offset[end_index][1]],
                    "logit_score": span_score,
                    "no_answer_probability": null_score - span_score,
                }
                if answer["no_answer_probability"] < no_answer_threshold:
                    answers.append(answer)

    if len(answers) > 0:
        # Return the answer with the highest logit score
        best_answer = max(answers, key=lambda x: x["logit_score"])
    else:
        # No answer found
        best_answer = {"text": "No answer found", "logit_score": 0.0, "no_answer_probability": 0.0}
    return best_answer

### Sinch doc

In [6]:
# Empty results table; the question cells below append one row each.
df = pd.DataFrame(
    columns=[
        'question',
        'answer',
        'logit_score',
        'no_answer_probability',
    ]
)

In [189]:
question = "What is Node RED ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

  df = pd.concat([df, pd.DataFrame([[question, output['text'], output['logit_score'], output['no_answer_probability']]], columns=['question', 'answer', 'logit_score', 'no_answer_probability'])])


In [190]:
question = "In few words, What is Node RED ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [191]:
question = "What are the supported channels of Node RED ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [192]:
question = "In which cases can I use Node RED ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [193]:
question = "What are the differents nodes of Sinch Messaging ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [194]:
question = "When was Node RED released ? "
context = sinch_node_red
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [195]:
question = "Give me the different steps to add a webhook to my app ? "
context = sinch_webhook
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [196]:
question = "What is the Sinch Conversation API ?"
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [197]:
question = "Can I use the Sinch Conversation API with Viber Business ? "
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [198]:
question = "Can I use the Sinch Conversation API with Outlook ? "
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [199]:
question = "Where are the hosting locations for the Conversation API ? "
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [200]:
question = "What are the specific pricing details for using the Sinch Conversation API ? "
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [201]:
question = "How does the Sinch Conversation API handle multimedia content like images and videos ? "
context = sinch_overview
output = inference(question, context)
# Append the result as one new row. ignore_index=True renumbers the rows
# 0..n-1; without it every appended row keeps the label 0.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            [[question, output['text'], output['logit_score'], output['no_answer_probability']]],
            columns=['question', 'answer', 'logit_score', 'no_answer_probability'],
        ),
    ],
    ignore_index=True,
)

In [202]:
# Keep only the question and answer columns; the two score columns are dropped.
df = df.drop(columns=['logit_score', 'no_answer_probability'])

In [203]:
# Render the table as LaTeX with two fixed-width paragraph columns.
latex_code = df.to_latex(index=False, column_format="|p{5cm}|p{10cm}|", float_format=(lambda x: "%.3f" % x))
# Swap the top and bottom rules for a plain \hline. The backslashes are
# written escaped ('\\hline') because '\h' is an invalid escape sequence.
latex_code = latex_code.replace('\\toprule', '\\hline')
latex_code = latex_code.replace('\\bottomrule', '\\hline')
# '\\\n' is one backslash followed by a newline (presumably the tail of each
# row terminator emitted by to_latex); the replacement adds an \hline after it.
latex_code = latex_code.replace('\\\n', '\\ \\hline\n')

# Explicit encoding so non-ASCII answer text is written the same way on
# every platform.
with open('../outputs/extractive_qa.tex', 'w', encoding='utf-8') as file:
    file.write(latex_code)