In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import spacy
from rank_bm25 import BM25Okapi

In [13]:
nlp = spacy.load("en_core_web_sm")

In [14]:
dataset = pd.read_json('train.json')

## Text Preprocess

In [15]:
def preprocess(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip()
    return text

### Cut text to between 0 - 512 word because model cant read more than 512 words

In [16]:
def cut_text_to_512_words(text):
    doc = nlp(text)
    words = [token.text for token in doc]
    if len(words) > 256:
        cut_words = words[:450]
        cut_text = " ".join(cut_words)
        return cut_text
    else:
        return text

In [17]:
dataset['support'] = dataset['support'].apply(preprocess)
dataset['question'] = dataset['question'].apply(preprocess)

In [18]:
context = list(dataset['support'])

### Rank BM25 for Passage ranking

In [19]:
tokenized_corpus = [doc.split(" ") for doc in context]
bm25 = BM25Okapi(tokenized_corpus)

### TF-IDF for Passage ranking

In [20]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(context)

### Function to find context based on question  return context1 : TF-IDF and context2 : RankBM25

In [21]:
def find_context(question):
    tokenized_question = question.split(" ")
    question_vector = vectorizer.transform([question])
    similiarity = cosine_similarity(question_vector, tfidf_matrix)
    n = 1
    top_indices = similiarity.argsort()[0][-n:][::-1]
    top_contexts = [context[idx] for idx in top_indices]
    
    context1 = cut_text_to_512_words(top_contexts[0])
    context2 = cut_text_to_512_words(bm25.get_top_n(tokenized_question, context, n=n)[0])
    
    return [context1,context2]

In [22]:
find_context('What is the short name of Oxygen')

['all oxygen atoms have eight protons and most have eight neutrons as well what is the mass number of an oxygen isotope that has nine neutrons what is the name of this isotope',
 'molecules with a carboxyl group are called carboxylic acids as with aldehydes the functional group in carboxylic acids is at the end of a carbon chain also as with aldehydes the c atom in the functional group is counted as one of the c atoms that defines the parent hydrocarbon name to name carboxylic acids the parent name of the hydrocarbon is used but the suffix oic acid is added']

In [23]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

## Bert question answering using pretrained

In [24]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [25]:
### train ###


## Extract answer from context function

In [26]:
def question_answer(question, text):
    answer = ''
    #tokenize question and text as a pair
    input_ids = tokenizer.encode(question, text)
    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    #segment IDs
    #first occurence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    #number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    #list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    #model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    #reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
              
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    print("Question : "+ question + "?")
    print("Predicted answer:\n{}".format(answer.capitalize()))
    return answer

In [27]:
answer_1 = question_answer('How many protons Oxygen have',(find_context('How many proton Oxygen have ?')))

Question : How many protons Oxygen have?
Predicted answer:
Protons


### Evaluation just for test score you can delete all below

In [28]:
test_1 = []
test_2 = []
score_1 = 0
score_2 = 0

In [29]:
for index ,row in dataset.iterrows():
    print(index)
    context_lst = find_context(dataset['question'][index])
    answer_1 = question_answer(dataset['question'][index],context_lst[0])
    answer_2 = question_answer(dataset['question'][index],context_lst[1])
    if answer_1 == dataset['correct_answer'][index]:
        score_1 +=1
    elif dataset['correct_answer'][index] in answer_1:
        test_1.append([dataset['correct_answer'][index],answer_1])
    if answer_2 == dataset['correct_answer'][index]:
        score_2 +=1
    elif dataset['correct_answer'][index] in answer_2:
        test_2.append([dataset['correct_answer'][index],answer_2])

0
Question : what type of organism is commonly used in preparation of foods such as cheese and yogurt?
Predicted answer:
Bacteria
Question : what type of organism is commonly used in preparation of foods such as cheese and yogurt?
Predicted answer:
Mesophilic
1
Question : what phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Predicted answer:
Coriolis
Question : what phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?
Predicted answer:
Coriolis
2
Question : changes from a lessordered state to a moreordered state such as a liquid to a solid are always what?
Predicted answer:
Exothermic
Question : changes from a lessordered state to a moreordered state such as a liquid to a solid are always what?
Predicted answer:
Exothermic
3
Question : what is the lea

KeyboardInterrupt: 