# Assignment 2 
### Kusal Bista

In [1]:
!python --version

Python 3.7.16


In [1]:
# Libraries for data reading
import glob
import json
import random
import numpy as np
import pandas as pd 
from tqdm import tqdm
import neuralcoref
import stanza
# Libraries for pre-processing
import re
import nltk
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

# Libraries for question answering
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import torch
import time
from transformers import RobertaTokenizer, RobertaModel

nlp = spacy.load('en_core_web_sm')



In [3]:
# pip install tabulate

In [4]:
# !python -m spacy download en_core_web_sm

In [5]:

# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/poojakc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/poojakc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/poojakc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/poojakc/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### 1 Reading dataset and pre-processing

In [3]:
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [4]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [6]:
# selecting 100 sample 
test_article = news_dataset[news_dataset['id'] == 17574]
sample_size = 100
if news_dataset.shape[0] >= sample_size:
    news_dataset = news_dataset.sample(n=sample_size, random_state=42)  # Adjusting random_state for reproducibility
    news_dataset.reset_index(drop=True, inplace=True) 
    print("Sampled dataset shape:", news_dataset.shape)
else:
    print("Dataset size is less than the sample size. Cannot perform sampling.")
    
news_dataset = pd.concat([news_dataset, test_article], ignore_index=True)

Sampled dataset shape: (100, 7)


In [7]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       101 non-null    int64 
 1   author   100 non-null    object
 2   date     101 non-null    object
 3   year     101 non-null    object
 4   month    101 non-null    object
 5   topic    101 non-null    object
 6   article  101 non-null    object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB


### 1.2 Handling missing value

In [8]:
print("Missing value")
print(news_dataset.isnull().sum())

Missing value
id         0
author     1
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [9]:
# Handling missing value
news_dataset['author'] = news_dataset['author'].fillna('No author')
# checking missing value after handling missing value

In [10]:
print("After handling missing value")
print(news_dataset.isnull().sum())

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [11]:
news_dataset = news_dataset.drop_duplicates(subset=['article'], keep='first').reset_index(drop=True)
news_dataset.head(12)

Unnamed: 0,id,author,date,year,month,topic,article
0,17663,"Hannah Berkeley Cohen, Azam Ahmed and Frances ...",14/01/2017,2017,1,business,HAVANA ? Andr?s Ivÿn and his girlfriend gre...
1,17361,No author,15/01/2017,2017,1,lifestyle,"For the 12th straight year, the Travel section..."
2,18058,Rana F. Sweis,29/01/2017,2017,1,entertainment,"MAFRAQ, Jordan ? Nisreen thought the wors..."
3,18315,Andrew Higgins,12/02/2017,2017,2,politics,"NICOSIA, Cyprus ? As the United Nations gea..."
4,17612,Jennifer Senior,12/01/2017,2017,1,politics,"Let?s just get this out of the way, shall we? ..."
5,17294,John Schwartz,5/01/2017,2017,1,science,"THOMPSONS, Tex. ? Can one of the most promi..."
6,17728,Nelson D. Schwartz and Bill Vlasic,20/01/2017,2017,1,business,Donald J. Trump won?t be sworn in until Friday...
7,17637,Mike Hale,14/01/2017,2017,1,business,"?Homeland,? a series always conscious of curre..."
8,18253,Shivani Vora,14/02/2017,2017,2,entertainment,Beach and spa vacations are typically associat...
9,17904,Brooks Barnes,23/01/2017,2017,1,entertainment,Nominations for the 89th Academy Awards will b...


### 1.3 Data pre-processing

In [14]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)


def pre_process(data):
    stop_words = set(stopwords.words('english'))
    stop_words.update(["This", "The", "the"])
    s = " \[(?=.*\d).*?\]" 
    # Lemmatization and removal of stopwords
    lemmatizer = WordNetLemmatizer()

#     print(data)
    result = []
    for text in data:
        doc = nlp(text)
        text = doc._.coref_resolved
        # Clean text
        # Remove non-ASCII characters
        text = ''.join([char for char in text if ord(char) < 128])
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        # Remove question mark problems
        text = re.sub(r'(\s\?)',' ',text)
        text = re.sub(r"\b\?\b", "\'", text)
        text = re.sub(r"(,\?)",",", text)
        text = re.sub(r"\?+", "?", text)
        text = text.strip()
        # Lemmatization and removal of stopwords
        processed_text = " ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", text).split() if word.lower() not in stop_words])
        result.append(processed_text)
    return result

In [13]:
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [13]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17904,Brooks Barnes,23/01/2017,2017,1,entertainment,Nominations for the 89th Academy Awards will b...,Nominations 89th Academy Awards announced Tues...
1,18166,Matt Flegenheimer,2/02/2017,2017,2,business,"WASHINGTON ? President Trump, seeming to re...","WASHINGTON Mr. Trump, encouraged Mitch, Wednes..."
2,18169,Somini Sengupta,2/02/2017,2017,2,business,UNITED NATIONS ? The new secretary general ...,UNITED NATIONS new secretary general United Na...
3,18055,Emily Palmer,26/01/2017,2017,1,lifestyle,"On a chilly October morning, Talea Childs, 4, ...","chilly October morning, Talea Childs, 4, still..."
4,17776,Liam Stack,20/01/2017,2017,1,entertainment,celebrities may be staying away from Donald J....,celebrity may staying away Donald J. Trump's i...


In [None]:
# !python -m spacy download en_core_web_md

#### 2 Classical retrivel

In [15]:
nlp_md = spacy.load("en_core_web_md")

class TextMatchingUtility:
    def __init__(self, dataset):
        self.data = dataset  # Dataset
        self.nlp = spacy.load("en_core_web_sm")

    def preprocess_query(self, query):
        # Regular expression to match text patterns
        s = " \[(?=.*\d).*?\]"
        # Removing stopwords and Lemmatization
        stop_words = stopwords.words('english')
        stop_words.extend(["This", "The", "the"])
        lemmatizer = WordNetLemmatizer()
        result = [" ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", query).split() if word not in stop_words])]
        return result


    def tf_idf_score(self, query, articles):    
        vectorizer = TfidfVectorizer()
        # Convert to word vector
        articles_wv = vectorizer.fit_transform(articles)
        # Convert to word vector
        query_wv = vectorizer.transform([query]) 
        # Calculate similarity
        similarities = cosine_similarity(query_wv, articles_wv)[0]
        return similarities

    def spacy_score(self, query, articles):
        # Convert to word vector
        query_nlp = nlp_md(str(query))
        # Convert to word vector
        articles_nlp = [nlp_md(article) for article in articles]
        # Calculate similarity
        similarities = [query_nlp.similarity(article_nlp) for article_nlp in articles_nlp]
        return similarities
    
    def get_best_sentences(self, query, article_id, word_vector, top_n=3):
        article = self.data.loc[self.data['id'] == article_id, 'processed_article'].iloc[0]
        # Convert text into sentences
        sentences_clean = tokenize.sent_tokenize(article)
        # Calculate similarity
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, sentences_clean)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, sentences_clean)
        # Get the indices of top N scores
        top_indices = np.argsort(similarities)[::-1][:top_n]
        # Get the top N sentences and their scores
        top_sentences = [(sentences_clean[i], similarities[i]) for i in top_indices]
        return top_sentences 

    def get_best_sentence(self, query, article_id, word_vector):
        article = self.data.loc[self.data['id'] == article_id, 'processed_article'].iloc[0]
        # Convert text into sentences
        sentences_clean = tokenize.sent_tokenize(article)
        # Calculate similarity
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, sentences_clean)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, sentences_clean)
    
        # Get the maximum score index position
        best_idx = np.array(similarities).argmax()
        # Get the best score
        best_score = max(similarities)
        # Get original data
        for j in range(len(self.data['id'])):
            if self.data['id'][j] == article_id:
                topic = self.data['article'][j]
        sentences_topic = tokenize.sent_tokenize(topic)
        answer = sentences_topic[best_idx]
        print("Article ID ", article_id)  
        print("Question:", query)
        if best_score < 0.3:
            print("No answer found")
        else:
            print("Answer:", answer)
            print("Score", best_score,"\n")

### 2.1 tf-idf

In [16]:
tm_utility = TextMatchingUtility(news_dataset)

# Sample question
question = "Who is the vice president of Samsung?"
article_id = 17574
word_vector = 'tf-idf'

top_results = tm_utility.get_best_sentences(question, article_id, word_vector, top_n=3)

print("Question:", question)
for i, (answer, score) in enumerate(top_results, 1):
    print(f"Answer {i}: {answer}")
    print("Score:", score)
    print()

Question: Who is the vice president of Samsung?
Answer 1: special prosecutor's office said special prosecutor's office evidence Mr. Lee received request bribery president ordered Samsung subsidiary send bribe destination designated president.
Score: 0.3697214925441742

Answer 2: SEOUL, South Korea special prosecutor investigating corruption scandal led President Park impeachment summoned de facto head Samsung questioning Wednesday, calling bribery suspect.
Score: 0.21046949811335275

Answer 3: Mr. Lee expected asked whether donation Samsung made two foundation controlled Choi longtime friend president, amounted bribes, role, any, Mr. Lee played decision give money.
Score: 0.19505615219055755



### 2.2 tf-idf

In [17]:
tm_utility = TextMatchingUtility(news_dataset)

# Sample question
question = "Who is the vice chairman of Samsung?"
article_id = 17574
word_vector = 'spaCy'

top_results = tm_utility.get_best_sentences(question, article_id, word_vector, top_n=3)

print("Question:", question)
for i, (answer, score) in enumerate(top_results, 1):
    print(f"Answer {i}: {answer}")
    print("Score:", score)
    print()

Question: Who is the vice chairman of Samsung?
Answer 1: special prosecutor's office said special prosecutor's office evidence Mr. Lee received request bribery president ordered Samsung subsidiary send bribe destination designated president.
Score: 0.8260947373005356

Answer 2: Mr. Lee expected asked whether donation Samsung made two foundation controlled Choi longtime friend president, amounted bribes, role, any, Mr. Lee played decision give money.
Score: 0.8260032233848222

Answer 3: national pension fund's support crucial merger, analyst said helped Mr. Lee inherit control Samsung Mr. Lee father.
Score: 0.7937786586865135



### Sample Questions

In [18]:
 # Test questions
test_questions = [
    {'query': 'Who is the vice chairman of Samsung?', 'answer': 'Jay Y. Lee'},
    {'query': 'Who is the de facto head of Samsung being questioned for bribery?', 'answer': 'Jay Y. Lee'},
    {'query': 'What scandal led to President Park\'s impeachment?', 'answer': 'corruption scandal'},
    {'query': 'What is the name of Samsung\'s vice chairman?', 'answer': 'Jay Y. Lee'},
    {'query': 'What is the name of the special prosecutor investigating the corruption scandal?', 'answer': '[SEP]'},
    {'query': 'Who is on trial at the Constitutional Court?', 'answer': 'Ms. Park'},
    {'query': 'What is the name of the special prosecutor\'s office spokesman?', 'answer': 'Lee'},
    {'query': 'What charges were filed against Ms. Choi by state prosecutors?', 'answer': 'coercing 53 big businesses'},
    {'query': 'What organization\'s support was crucial for the merger of two Samsung affiliates?', 'answer': 'National Pension Service'},
    {'query': 'What amount did Samsung contribute to Ms. Choi\'s winter sports program?', 'answer': '$1.3 million'}
    ]

In [20]:
class TestUtility:
    def __init__(self, text_matching_utility, test_questions):
        self.text_matching_utility = text_matching_utility
        self.test_questions = test_questions

    def evaluate_mrr(self, article_id, word_vector):
        reciprocal_ranks = []

        for question_data in self.test_questions:
            query = question_data['query']
            true_answer = question_data['answer']

            # Get the best sentence from the text matching utility
            best_sentence = self.text_matching_utility.get_best_sentence(query, article_id, word_vector)

            # If no answer found, skip this question
            if not best_sentence:
                continue

            # Check if the true answer is in the best sentence
            if true_answer in best_sentence:
                rank = best_sentence.index(true_answer) + 1  # Rank of the true answer
                reciprocal_ranks.append(1 / rank)

        # Calculate MRR
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0
        return mrr

In [21]:
# Create an instance of TextMatchingUtility
tm_utility = TextMatchingUtility(news_dataset)

# Create an instance of TestUtility
test_utility = TestUtility(tm_utility, test_questions)

# Test with spaCy word vector and article id
article_id = 17574  # Replace with appropriate article ID
mrr = test_utility.evaluate_mrr(article_id, word_vector='tf-idf')
print("Mean Reciprocal Rank (MRR) with spaCy word vector:", mrr)


Article ID  17574
Question: Who is the vice chairman of Samsung?
No answer found
Article ID  17574
Question: Who is the de facto head of Samsung being questioned for bribery?
Answer: SEOUL, South Korea  ?
Score 0.4294359000773963 

Article ID  17574
Question: What scandal led to President Park's impeachment?
Answer: SEOUL, South Korea  ?
Score 0.4610361928916077 

Article ID  17574
Question: What is the name of Samsung's vice chairman?
No answer found
Article ID  17574
Question: What is the name of the special prosecutor investigating the corruption scandal?
Answer: SEOUL, South Korea  ?
Score 0.4376736307642736 

Article ID  17574
Question: Who is on trial at the Constitutional Court?
Answer: Allegations that Ms. Park helped Ms. Choi extort millions in bribes from Samsung and other companies are at the heart of the corruption scandal that led to the National Assembly?s vote to impeach her last month.
Score 0.47337877099319725 

Article ID  17574
Question: What is the name of the speci

In [22]:
# Create an instance of TextMatchingUtility
tm_utility = TextMatchingUtility(news_dataset)

# Create an instance of TestUtility
test_utility = TestUtility(tm_utility, test_questions)

# Test with spaCy word vector and article id
article_id = 17574  # Replace with appropriate article ID
mrr = test_utility.evaluate_mrr(article_id, word_vector='spaCy')
print("Mean Reciprocal Rank (MRR) with spaCy word vector:", mrr)


Article ID  17574
Question: Who is the vice chairman of Samsung?
Answer: The reference on Wednesday to possible perjury charges against Mr. Lee stemmed from that testimony.
Score 0.8260947373005356 

Article ID  17574
Question: Who is the de facto head of Samsung being questioned for bribery?
Answer: SEOUL, South Korea  ?
Score 0.8305692713275632 

Article ID  17574
Question: What scandal led to President Park's impeachment?
Answer: SEOUL, South Korea  ?
Score 0.8743210249797997 

Article ID  17574
Question: What is the name of Samsung's vice chairman?
Answer: The reference on Wednesday to possible perjury charges against Mr. Lee stemmed from that testimony.
Score 0.8286177302079136 

Article ID  17574
Question: What is the name of the special prosecutor investigating the corruption scandal?
Answer: In November, state prosecutors indicted Ms. Choi on charges of coercing 53 big businesses, including Samsung, to contribute $69 million to her two foundations.
Score 0.8390553654061038 

Ar

In [23]:
class QuestionAnsweringSystem:
    def __init__(self, model_name='deepset/bert-base-cased-squad2'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def answer_question(self, question, article_id):
        passage = news_dataset.loc[news_dataset['id'] == article_id, 'processed_article'].iloc[0]
        inputs = self.tokenizer.encode_plus(question, passage, return_tensors='pt', max_length=512, truncation=True, truncation_strategy='longest_first')

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        outputs = self.model(input_ids, attention_mask=attention_mask, return_dict=True)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits) + 1

        input_tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
        answer_tokens = input_ids[0][start_index:end_index]

        for i, token in enumerate(answer_tokens):
            if token == self.tokenizer.cls_token_id:
                start_index += 1
            elif token == self.tokenizer.sep_token_id:
                end_index -= 1
        answer_tokens = input_ids[0][start_index:end_index]

        answer = self.tokenizer.decode(answer_tokens)
        return answer

In [24]:
class TestUtility:
    def __init__(self, test_questions):
        self.test_questions = test_questions

    @staticmethod
    def accuracy(test_questions, predicted_labels):
        if len(test_questions) != len(predicted_labels):
            raise ValueError("Length of test_questions and predicted_labels must be the same.")

        correct = 0
        total = len(test_questions)
        for i in range(total):
            correct_answer = test_questions[i]['answer'].lower()
            predicted_label = predicted_labels[i].lower()
            correct_answer = correct_answer.replace(" ", "")
            print(correct_answer,predicted_label)
            if correct_answer in predicted_label:
                correct += 1
        if total == 0:
            return 0  # Return 0 if there are no test questions
        return correct / total

    @staticmethod
    def mean_reciprocal_rank(test_questions, predicted_labels):
        rr_sum = 0
        total = len(test_questions)
        for i in range(total):
            correct_answer = test_questions[i]['answer'].lower()
            predicted_labels_lower = predicted_labels[i].lower()
            correct_answer = correct_answer.replace(" ", "")
            if correct_answer in predicted_labels_lower:
                rr_sum += 1 / (predicted_labels_lower.index(correct_answer) + 1)
        return rr_sum / total if total != 0 else 0

    def evaluate_performance(self, predicted_labels):
        acc = self.accuracy(self.test_questions, predicted_labels)
        mrr = self.mean_reciprocal_rank(self.test_questions, predicted_labels)

        print("Accuracy:", acc)
        print("Mean Reciprocal Rank:", mrr)

    def get_predicted_labels(self, article_id, qa_model):
        start_time = time.time()
        answers = []
        print_query = []
        for index, question in enumerate(self.test_questions):
            query = qa_model.answer_question(question['query'], article_id) 
            predicted_labels = query.replace(" ", "")
            answers.append(predicted_labels)
            print_query.append(query)
        end_time = time.time()
        execution_time = end_time - start_time
        print("Execution time:", execution_time, "seconds")
        return answers


In [26]:
article_id = 17574
question = "Who is the vice chairman of Samsung?"

qa_system = QuestionAnsweringSystem()
answer = qa_system.answer_question(question, article_id)

print("Question:", question)
print("Answer:", answer)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4053a733-fbb5-406f-958b-8b3e34f1f54c)')' thrown while requesting HEAD https://huggingface.co/deepset/bert-base-cased-squad2/resolve/main/tokenizer_config.json


Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Question: Who is the vice chairman of Samsung?
Answer: $ 69 million


In [27]:
test_questions 

[{'query': 'Who is the vice chairman of Samsung?', 'answer': 'Jay Y. Lee'},
 {'query': 'Who is the de facto head of Samsung being questioned for bribery?',
  'answer': 'Jay Y. Lee'},
 {'query': "What scandal led to President Park's impeachment?",
  'answer': 'corruption scandal'},
 {'query': "What is the name of Samsung's vice chairman?",
  'answer': 'Jay Y. Lee'},
 {'query': 'What is the name of the special prosecutor investigating the corruption scandal?',
  'answer': '[SEP]'},
 {'query': 'Who is on trial at the Constitutional Court?',
  'answer': 'Ms. Park'},
 {'query': "What is the name of the special prosecutor's office spokesman?",
  'answer': 'Lee'},
 {'query': 'What charges were filed against Ms. Choi by state prosecutors?',
  'answer': 'coercing 53 big businesses'},
 {'query': "What organization's support was crucial for the merger of two Samsung affiliates?",
  'answer': 'National Pension Service'},
 {'query': "What amount did Samsung contribute to Ms. Choi's winter sports pr

In [28]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system)

# Now you can pass these predicted labels to evaluate_performance method to evaluate performance
test_utility.evaluate_performance(predicted_labels)

Execution time: 23.92566180229187 seconds
jayy.lee $69million
jayy.lee coercing53bigbusinesses,includingsamsung,contribute$69million
corruptionscandal coercing53bigbusinesses,includingsamsung,contribute$69million
jayy.lee coercing53bigbusinesses,includingsamsung,contribute$69million
[sep] coercing53bigbusinesses,includingsamsung,contribute$69million
ms.park $69million
lee coercing53bigbusinesses,includingsamsung,contribute$69million
coercing53bigbusinesses coercing53bigbusinesses,includingsamsung,contribute$69million
nationalpensionservice coercing53bigbusinesses,includingsamsung,contribute$69million
$1.3million $69million
Accuracy: 0.1
Mean Reciprocal Rank: 0.1


In [29]:
### ROBERTA

In [30]:
article_id = 17574
question = "Who is the vice chairman of Samsung?"

qa_system_roberta = QuestionAnsweringSystem(model_name= "deepset/roberta-base-squad2")
answer = qa_system_roberta.answer_question(question, article_id)

print("Question:", question)
print("Answer:", answer)

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Question: Who is the vice chairman of Samsung?
Answer: 


In [31]:
true_labels = [item['answer'] for item in test_questions]
print(true_labels)

['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']


In [32]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system_roberta)

print(true_labels)
test_utility.evaluate_performance(predicted_labels)# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system_roberta)

print(true_labels)
test_utility.evaluate_performance(predicted_labels)

Execution time: 23.97845196723938 seconds
['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']
jayy.lee 
jayy.lee mr.lee
corruptionscandal corruption
jayy.lee 
[sep] 
ms.park 
lee mr.leesaid
coercing53bigbusinesses 
nationalpensionservice 
$1.3million $1.3million
Accuracy: 0.2
Mean Reciprocal Rank: 0.125
Execution time: 23.856518745422363 seconds
['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']
jayy.lee 
jayy.lee mr.lee
corruptionscandal corruption
jayy.lee 
[sep] 
ms.park 
lee mr.leesaid
coercing53bigbusinesses 
nationalpensionservice 
$1.3million $1.3million
Accuracy: 0.2
Mean Reciprocal Rank: 0.125


In [37]:
pip install transformers==4.30.2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [42]:
from transformers import BertForQuestionAnswering, BertTokenizer

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', resume_download=True)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', resume_download=True)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c6f9c39b-a78e-4bcf-8679-7640fd944565)')' thrown while requesting HEAD https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json


Downloading model.safetensors:  52%|#####2    | 703M/1.34G [00:00<?, ?B/s]

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [None]:
import torch

class QuestionAnsweringModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def tokenize(self, question, answer_text):
        '''
        Tokenizes the input question and answer_text and sets the segment IDs.
        '''
        # Apply the tokenizer to the encode text, treating them as a question, answer_text pair.
        input_ids = self.tokenizer.encode(question, answer_text, max_length=512, truncation=True, truncation_strategy='only_second')

        # Report how long the input sequence is.
        # print('Query has {:,} tokens.\n'.format(len(input_ids)))

        # Search the input_ids for the first instance of the `[SEP]` token.
        sep_index = input_ids.index(self.tokenizer.sep_token_id)

        # The number of segment A tokens includes the [SEP] token itself.
        num_seg_a = sep_index + 1

        # The remainder are segment B.
        num_seg_b = len(input_ids) - num_seg_a

        # Construct the list of 0s and 1s.
        segment_ids = [0] * num_seg_a + [1] * num_seg_b

        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

        return input_ids, segment_ids

    def evaluate(self, input_ids, segment_ids):
        '''
        Evaluates the input question and answer_text using the model.
        '''
        # Run the question through the model.
        model_scores = self.model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids])) 
        start_scores = model_scores.start_logits
        end_scores = model_scores.end_logits

        return start_scores, end_scores

    def reconstruct_answer(self, input_ids, start_scores, end_scores):
        '''
        Reconstructs the answer from the model's output.
        '''
        # Find the tokens with the highest `start` and `end` scores.
        answer_start = torch.argmax(start_scores)
        answer_end = torch.argmax(end_scores)

        # Get the string versions of the input tokens.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

        # Start with the first token.
        answer = tokens[answer_start]

        # Select the remaining answer tokens and join them with whitespace.
        for i in range(answer_start + 1, answer_end + 1):
            # If it's a subword token, then recombine it with the previous token.
            if tokens[i][0:2] == '##':
                answer += tokens[i][2:]
            # Otherwise, add a space then the token.
            else:
                answer += ' ' + tokens[i]

        return answer

    def answer_question(self, question, article_id):
        '''
        Takes a `question` string and an `answer_text` string (which contains the
        answer), and identifies the words within the `answer_text` that are the
        answer. Prints them out.
        '''
        #get article
        answer_text = news_dataset.loc[news_dataset['id'] == article_id, 'processed_article'].iloc[0]

        # Tokenize
        input_ids, segment_ids = self.tokenize(question, answer_text)

        # Evaluate
        start_scores, end_scores = self.evaluate(input_ids, segment_ids)

        # Reconstruct Answer
        answer = self.reconstruct_answer(input_ids, start_scores, end_scores)

        return answer




In [None]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

qa_model = QuestionAnsweringModel(model, tokenizer)

# Define your question and article ID
question = "Who is the vice chairman of Samsung?"
article_id =17574 

# Call the answer_question method to get the answer
answer = qa_model.answer_question(question, article_id)
print("Question:", question)
print("Answer:", answer)

In [None]:
def predicted_labels(test_questions):
    start_time = time.time()
    answers = []
    article_id = 17574
    for index, question in enumerate(test_questions):
        query = qa_model.answer_question(question['query'], article_id)
        predicted_labels = query.replace(" ", "")
        answers.append(predicted_labels)
    return answers
predicted_labels = predicted_labels(test_questions)
print(predicted_labels)

In [None]:
test_questions

In [None]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_model)

test_utility.evaluate_performance(predicted_labels)

## B. References

## C. Appendix