In [67]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


file_path = 'news_dataset.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')


original_columns = ['id', 'author', 'date', 'year', 'month', 'topic', 'article']


def clean_text(text):
    text = text.lower()
    text = re.sub(r'\?', '', text)  
    text = re.sub(r'[^\w\s]', '', text)  
    return text

df['clean_article'] = df['article'].apply(lambda x: clean_text(x) if pd.notnull(x) else x)
df = df.dropna(subset=['clean_article'])


stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    words = word_tokenize(text)
    words = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['clean_article'] = df['clean_article'].apply(preprocess_text)

processed_data = df[original_columns + ['clean_article']]


In [None]:
processed_data

In [64]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer, BertModel


qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


embed_model = BertModel.from_pretrained('bert-base-uncased')
embed_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [68]:
import spacy


nlp = spacy.load("en_core_web_sm")


def basic_coreference_resolution(text):
    doc = nlp(text)
    resolved_text = text
    entities = [(ent.text, ent.start_char, ent.ead_char, ent.label_) for ent in doc.ents]
    

    pronouns = [(token.text, token.idx) for token in doc if token.pos_ == 'PRON']
    
    for pronoun, pronoun_idx in pronouns:

        for entity_text, start, end, label in reversed(entities):
            if pronoun.lower() == "he" and label == "PERSON":
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() == "she" and label == "PERSON":
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() == "it" and label in {"ORG", "GPE"}:
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() in {"they", "them"}:
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
    
    return resolved_text


In [61]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_sentence_embedding(sentence):
    inputs = embed_tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding=True)
    outputs = embed_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy()

def find_most_relevant_sentence(question, article):
    question_embedding = get_sentence_embedding(question)
    sentences = article.split('. ')
    max_similarity = -1
    most_relevant_sentence = ""
    
    for sentence in sentences:
        sentence_embedding = get_sentence_embedding(sentence)
        similarity = cosine_similarity(question_embedding, sentence_embedding)
        
        if similarity > max_similarity:
            max_similarity = similarity
            most_relevant_sentence = sentence
    
    return most_relevant_sentence


In [None]:
def answer_question(article_text, question):
    '''
    Takes a `question` string and an `article_text` string (which contains the
    answer), and identifies the words within the `article_text` that are the
    answer. Prints them out.
    '''

    resolved_article_text = basic_coreference_resolution(article_text)
    

    inputs = tokenizer.encode_plus(question, resolved_article_text, return_tensors='pt', max_length=512, truncation=True)
    

    outputs = model(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1


    input_ids = inputs['input_ids'].tolist()[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end])

    return answer


article_id = 17574
question = "Who is the vice chairman of Samsung?"


article_text = df[df['id'] == article_id]['article'].values[0]


answer = answer_question(article_text, question)
print("Answer:", answer)


In [66]:
def answer_question(article_text, question):
    '''
    Takes a `question` string and an `article_text` string (which contains the
    answer), and identifies the words within the `article_text` that are the
    answer. Prints them out.
    '''

    resolved_article_text = basic_coreference_resolution(article_text)
    

    most_relevant_sentence = find_most_relevant_sentence(question, resolved_article_text)
    

    inputs = qa_tokenizer.encode_plus(question, most_relevant_sentence, return_tensors='pt', max_length=512, truncation=True)
    

    outputs = qa_model(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1


    input_ids = inputs['input_ids'].tolist()[0]
    tokens = qa_tokenizer.convert_ids_to_tokens(input_ids)
    answer = qa_tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end])

    return most_relevant_sentence, answer


article_id = 17574
question = "Who is the vice chairman of Samsung?"


article_text = df[df['id'] == article_id]['article'].values[0]

most_relevant_sentence, answer = answer_question(article_text, question)
print("Most Relevant Sentence:", most_relevant_sentence)
print("Answer:", answer)


Most Relevant Sentence: Lee effectively runs Samsung, South Korea?s largest conglomerateLeee is the son of its chairman, Lee   who has been incapacitated with health problemsLeeHe is expected to be asked whether   donations that Samsung made to two foundations controlled by Choi   a longtime friend of the president, amounted to bribes, and what role, if anLee he played in the decision to give the money
Answer: lee


In [80]:
from sklearn.metrics import precision_score, recall_score, f1_score

def compute_f1(predicted_answer, true_answer):
    predicted_tokens = set(predicted_answer.lower().split())
    true_tokens = set(true_answer.lower().split())
    
    common_tokens = predicted_tokens.intersection(true_tokens)
    
    if len(common_tokens) == 0:
        return 0, 0, 0
    
    precision = len(common_tokens) / len(predicted_tokens)
    recall = len(common_tokens) / len(true_tokens)
    
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1


In [81]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer
import spacy


qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


nlp = spacy.load("en_core_web_sm")


def basic_coreference_resolution(text):
    doc = nlp(text)
    resolved_text = text
    entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    
    
    pronouns = [(token.text, token.idx) for token in doc if token.pos_ == 'PRON']
    
    for pronoun, pronoun_idx in pronouns:
        
        for entity_text, start, end, label in reversed(entities):
            if pronoun.lower() == "he" and label == "PERSON":
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() == "she" and label == "PERSON":
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() == "it" and label in {"ORG", "GPE"}:
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
            elif pronoun.lower() in {"they", "them"}:
                resolved_text = resolved_text[:pronoun_idx] + entity_text + resolved_text[pronoun_idx + len(pronoun):]
                break
    
    return resolved_text


def answer_question(article_text, question):
    
    resolved_article_text = basic_coreference_resolution(article_text)
    

    inputs = qa_tokenizer.encode_plus(question, resolved_article_text, return_tensors='pt', max_length=512, truncation=True)
    

    outputs = qa_model(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1

    input_ids = inputs['input_ids'].tolist()[0]
    tokens = qa_tokenizer.convert_ids_to_tokens(input_ids)
    answer = qa_tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end])

    return answer


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [82]:
def test_qa_system(test_data):
    total_precision = 0
    total_recall = 0
    total_f1 = 0
    count = 0
    
    for data in test_data:
        article_id = data["article_id"]
        question = data["question"]
        true_answer = data["answer"]
        
  
        article_text = df[df['id'] == article_id]['article'].values[0]
        
        
        predicted_answer = answer_question(article_text, question)
        
        # F1-score
        precision, recall, f1 = compute_f1(predicted_answer, true_answer)
        
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        count += 1
        
        print(f"Question: {question}")
        print(f"Predicted Answer: {predicted_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}\n")
    
    avg_precision = total_precision / count if count > 0 else 0
    avg_recall = total_recall / count if count > 0 else 0
    avg_f1 = total_f1 / count if count > 0 else 0
    
    print(f"Average Precision: {avg_precision}")
    print(f"Average Recall: {avg_recall}")
    print(f"Average F1-Score: {avg_f1}")


test_data = [
    {
        "article_id": 17574,
        "question": "Who is the vice chairman of Samsung?",
        "answer": "Jay Y. Lee"
    },

]


test_qa_system(test_data)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Question: Who is the vice chairman of Samsung?
Predicted Answer: jay y . lee
True Answer: Jay Y. Lee
Precision: 0.5, Recall: 0.6666666666666666, F1-Score: 0.5714285714285715

Average Precision: 0.5
Average Recall: 0.6666666666666666
Average F1-Score: 0.5714285714285715


## B. References

In [None]:
(based on https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/ )