# Assignment 2
### Kusal Bista

In [1]:
# Libraries for data reading
import glob
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
# import neuralcoref
# import stanza
# Libraries for pre-processing
import re
import nltk
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

# Libraries for question answering
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import torch
import time
from transformers import RobertaTokenizer, RobertaModel

nlp = spacy.load('en_core_web_sm')



In [2]:
# !pip install tabulate
# !pip install matplotlib
# !pip install transformers

In [3]:
# !python -m spacy download en_core_web_sm

In [4]:

# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### 1 Reading dataset and pre-processing

In [11]:
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [12]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [13]:
# selecting 100 sample
test_article = news_dataset[news_dataset['id'] == 17574]
sample_size = 100
if news_dataset.shape[0] >= sample_size:
    news_dataset = news_dataset.sample(n=sample_size, random_state=42)  # Adjusting random_state for reproducibility
    news_dataset.reset_index(drop=True, inplace=True)
    print("Sampled dataset shape:", news_dataset.shape)
else:
    print("Dataset size is less than the sample size. Cannot perform sampling.")

news_dataset = pd.concat([news_dataset, test_article], ignore_index=True)

Sampled dataset shape: (100, 7)


In [14]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       101 non-null    int64 
 1   author   100 non-null    object
 2   date     101 non-null    object
 3   year     101 non-null    object
 4   month    101 non-null    object
 5   topic    101 non-null    object
 6   article  101 non-null    object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB


### 1.2 Handling missing value

In [15]:
print("Missing value")
print(news_dataset.isnull().sum())

Missing value
id         0
author     1
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [16]:
# Handling missing value
news_dataset['author'] = news_dataset['author'].fillna('No author')
# checking missing value after handling missing value

In [17]:
print("After handling missing value")
print(news_dataset.isnull().sum())

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [18]:
news_dataset = news_dataset.drop_duplicates(subset=['article'], keep='first').reset_index(drop=True)
news_dataset.head(12)

Unnamed: 0,id,author,date,year,month,topic,article
0,17904,Brooks Barnes,23/01/2017,2017,1,entertainment,Nominations for the 89th Academy Awards will b...
1,18166,Matt Flegenheimer,2/02/2017,2017,2,business,"WASHINGTON ? President Trump, seeming to re..."
2,18169,Somini Sengupta,2/02/2017,2017,2,business,UNITED NATIONS ? The new secretary general ...
3,18055,Emily Palmer,26/01/2017,2017,1,lifestyle,"On a chilly October morning, Talea Childs, 4, ..."
4,17776,Liam Stack,20/01/2017,2017,1,entertainment,celebrities may be staying away from Donald J....
5,18079,Dave Philipps,5/02/2017,2017,2,science,"RICHLAND, Wash. ? When Tim Snider arrived o..."
6,18019,Justin Wolfers,30/01/2017,2017,1,politics,Even if President Trump?s fails over the ...
7,17895,Adam Liptak,24/01/2017,2017,1,politics,WASHINGTON ? The Supreme Court rejected on ...
8,18308,Ken Belson,8/02/2017,2017,2,sports,"HOUSTON ? There was the game on the field, ..."
9,17452,The Associated Press,8/01/2017,2017,1,sports,HOUSTON ? Follow our live N. F. L. playoffs...


### 1.3 Data pre-processing

In [19]:
nlp = spacy.load('en_core_web_sm')
# neuralcoref.add_to_pipe(nlp)


def pre_process(data):
    stop_words = set(stopwords.words('english'))
    stop_words.update(["This", "The", "the"])
    s = " \[(?=.*\d).*?\]"
    # Lemmatization and removal of stopwords
    lemmatizer = WordNetLemmatizer()

#     print(data)
    result = []
    for text in data:
        doc = nlp(text)
        # text = doc._.coref_resolved
        # Clean text
        # Remove non-ASCII characters
        text = ''.join([char for char in text if ord(char) < 128])
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        # Remove question mark problems
        text = re.sub(r'(\s\?)',' ',text)
        text = re.sub(r"\b\?\b", "\'", text)
        text = re.sub(r"(,\?)",",", text)
        text = re.sub(r"\?+", "?", text)
        text = text.strip()
        # Lemmatization and removal of stopwords
        processed_text = " ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", text).split() if word.lower() not in stop_words])
        result.append(processed_text)
    return result

In [20]:
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [21]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17904,Brooks Barnes,23/01/2017,2017,1,entertainment,Nominations for the 89th Academy Awards will b...,Nominations 89th Academy Awards announced Tues...
1,18166,Matt Flegenheimer,2/02/2017,2017,2,business,"WASHINGTON ? President Trump, seeming to re...","WASHINGTON President Trump, seeming relish fig..."
2,18169,Somini Sengupta,2/02/2017,2017,2,business,UNITED NATIONS ? The new secretary general ...,UNITED NATIONS new secretary general United Na...
3,18055,Emily Palmer,26/01/2017,2017,1,lifestyle,"On a chilly October morning, Talea Childs, 4, ...","chilly October morning, Talea Childs, 4, still..."
4,17776,Liam Stack,20/01/2017,2017,1,entertainment,celebrities may be staying away from Donald J....,celebrity may staying away Donald J. Trump's i...


In [24]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### 2 Classical retrivel

In [25]:
nlp_md = spacy.load("en_core_web_md")

class TextMatchingUtility:
    def __init__(self, dataset):
        self.data = dataset  # Dataset
        self.nlp = spacy.load("en_core_web_sm")

    def preprocess_query(self, query):
        # Regular expression to match text patterns
        s = " \[(?=.*\d).*?\]"
        # Removing stopwords and Lemmatization
        stop_words = stopwords.words('english')
        stop_words.extend(["This", "The", "the"])
        lemmatizer = WordNetLemmatizer()
        result = [" ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", query).split() if word not in stop_words])]
        return result


    def tf_idf_score(self, query, articles):
        vectorizer = TfidfVectorizer()
        # Convert to word vector
        articles_wv = vectorizer.fit_transform(articles)
        # Convert to word vector
        query_wv = vectorizer.transform([query])
        # Calculate similarity
        similarities = cosine_similarity(query_wv, articles_wv)[0]
        return similarities

    def spacy_score(self, query, articles):
        # Convert to word vector
        query_nlp = nlp_md(str(query))
        # Convert to word vector
        articles_nlp = [nlp_md(article) for article in articles]
        # Calculate similarity
        similarities = [query_nlp.similarity(article_nlp) for article_nlp in articles_nlp]
        return similarities

    def get_best_sentences(self, query, article_id, word_vector, top_n=3):
        article = self.data.loc[self.data['id'] == article_id, 'processed_article'].iloc[0]
        # Convert text into sentences
        sentences_clean = tokenize.sent_tokenize(article)
        # Calculate similarity
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, sentences_clean)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, sentences_clean)
        # Get the indices of top N scores
        top_indices = np.argsort(similarities)[::-1][:top_n]
        # Get the top N sentences and their scores
        top_sentences = [(sentences_clean[i], similarities[i]) for i in top_indices]
        return top_sentences

    def get_best_sentence(self, query, article_id, word_vector):
        article = self.data.loc[self.data['id'] == article_id, 'processed_article'].iloc[0]
        # Convert text into sentences
        sentences_clean = tokenize.sent_tokenize(article)
        # Calculate similarity
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, sentences_clean)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, sentences_clean)

        # Get the maximum score index position
        best_idx = np.array(similarities).argmax()
        # Get the best score
        best_score = max(similarities)
        # Get original data
        for j in range(len(self.data['id'])):
            if self.data['id'][j] == article_id:
                topic = self.data['article'][j]
        sentences_topic = tokenize.sent_tokenize(topic)
        answer = sentences_topic[best_idx]
        print("Article ID ", article_id)
        print("Question:", query)
        if best_score < 0.3:
            print("No answer found")
        else:
            print("Answer:", answer)
            print("Score", best_score,"\n")

### 2.1 tf-idf

In [26]:
tm_utility = TextMatchingUtility(news_dataset)

# Sample question
question = "Who is the vice president of Samsung?"
article_id = 17574
word_vector = 'tf-idf'

top_results = tm_utility.get_best_sentences(question, article_id, word_vector, top_n=3)

print("Question:", question)
for i, (answer, score) in enumerate(top_results, 1):
    print(f"Answer {i}: {answer}")
    print("Score:", score)
    print()

Question: Who is the vice president of Samsung?
Answer 1: special prosecutor's office said evidence Mr. Lee received request bribery president ordered Samsung subsidiary send bribe destination designated president.
Score: 0.26497616747841085

Answer 2: de facto leader, Jay Y. Lee, vice chairman Samsung, questioned Thursday, according special prosecutor's office, recommended also investigated suspicion perjury.
Score: 0.2389633025423427

Answer 3: SEOUL, South Korea special prosecutor investigating corruption scandal led President Park impeachment summoned de facto head Samsung questioning Wednesday, calling bribery suspect.
Score: 0.14073930825195885



### 2.2 tf-idf

In [27]:
tm_utility = TextMatchingUtility(news_dataset)

# Sample question
question = "Who is the vice chairman of Samsung?"
article_id = 17574
word_vector = 'spaCy'

top_results = tm_utility.get_best_sentences(question, article_id, word_vector, top_n=3)

print("Question:", question)
for i, (answer, score) in enumerate(top_results, 1):
    print(f"Answer {i}: {answer}")
    print("Score:", score)
    print()

Question: Who is the vice chairman of Samsung?
Answer 1: Investigators special prosecutor's office questioned senior Samsung executive suspect bribery accusations.
Score: 0.6452350002474918

Answer 2: national pension fund's support crucial merger, analyst said helped Mr. Lee inherit control Samsung father.
Score: 0.6324615702928353

Answer 3: email contained information financial support provided Samsung, prosecutor's office said.
Score: 0.6281309632782177



### Sample Questions

In [28]:
 # Test questions in one passage
test_questions = [
    {'query': 'Who is the vice chairman of Samsung?', 'answer': 'Jay Y. Lee'},
    {'query': 'Who is the de facto head of Samsung being questioned for bribery?', 'answer': 'Jay Y. Lee'},
    {'query': 'What scandal led to President Park\'s impeachment?', 'answer': 'corruption scandal'},
    {'query': 'What is the name of Samsung\'s vice chairman?', 'answer': 'Jay Y. Lee'},
    {'query': 'What is the name of the special prosecutor investigating the corruption scandal?', 'answer': '[SEP]'},
    {'query': 'Who is on trial at the Constitutional Court?', 'answer': 'Ms. Park'},
    {'query': 'What is the name of the special prosecutor\'s office spokesman?', 'answer': 'Lee'},
    {'query': 'What charges were filed against Ms. Choi by state prosecutors?', 'answer': 'coercing 53 big businesses'},
    {'query': 'What organization\'s support was crucial for the merger of two Samsung affiliates?', 'answer': 'National Pension Service'},
    {'query': 'What amount did Samsung contribute to Ms. Choi\'s winter sports program?', 'answer': '$1.3 million'}
    ]

In [103]:
test_questions_all = [
    {'query': 'What movie is seen as a leading contender for Best Picture at the 89th Academy Awards?', 'answer': 'Moonlight', 'passage_id': 17904},  # article = 17904
    {'query': 'Who is President Trump encouraging to invoke the nuclear option regarding the confirmation of his nominee to the Supreme Court?', 'answer': 'Mitch McConnell', 'passage_id': 18166},  # article 18166
    {'query': 'What tactic did President Trump suggest Senate Majority Leader Mitch McConnell use to confirm Neil Gorsuch to the Supreme Court?', 'answer': 'Nuclear option', 'passage_id': 18166},  # article 18166
    {'query': "What is the name of Talea's mother?", 'answer': 'Trenicia', 'passage_id': 18055},  # 18055
    {'query': 'Who is the Roman Catholic Archbishop of New York?', 'answer': 'Cardinal Dolan', 'passage_id': 17776},  # article 17776
    {'query': "How many religious leaders are scheduled to participate in Donald J. Trump's inauguration ceremony?", 'answer': 'Six', 'passage_id': 17776},  # 17776
    {'query': 'What is the name of the atoll where Tim Snider and other veterans were tasked with cleaning up nuclear fallout?', 'answer': 'Enewetak Atoll', 'passage_id': 18079},  # 18079
    {'query': "What is the biggest problem identified by Energy Department reports regarding the cleanup of Enewetak Atoll?", 'answer': 'Runit Island', 'passage_id': 17895},  # 17895
    {'query': 'According to Bonsor, what can travelers do to avoid feeling overwhelmed in large cities?', 'answer': 'Choose smaller cities', 'passage_id': 18253},  # 18253
    {'query': 'What amount did Samsung contribute to Ms. Choi\'s winter sports program?', 'answer': '$1.3 million', 'passage_id': 17574}  # 17574
]


In [29]:
class TestUtility:
    def __init__(self, text_matching_utility, test_questions):
        self.text_matching_utility = text_matching_utility
        self.test_questions = test_questions

    def evaluate_mrr(self, article_id, word_vector):
        reciprocal_ranks = []

        for question_data in self.test_questions:
            query = question_data['query']
            true_answer = question_data['answer']

            # Get the best sentence from the text matching utility
            best_sentence = self.text_matching_utility.get_best_sentence(query, article_id, word_vector)

            # If no answer found, skip this question
            if not best_sentence:
                continue

            # Check if the true answer is in the best sentence
            if true_answer in best_sentence:
                rank = best_sentence.index(true_answer) + 1  # Rank of the true answer
                reciprocal_ranks.append(1 / rank)

        # Calculate MRR
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0
        return mrr

In [30]:
# Create an instance of TextMatchingUtility
tm_utility = TextMatchingUtility(news_dataset)

# Create an instance of TestUtility
test_utility = TestUtility(tm_utility, test_questions)

# Test with spaCy word vector and article id
article_id = 17574  # Replace with appropriate article ID
mrr = test_utility.evaluate_mrr(article_id, word_vector='tf-idf')
print("Mean Reciprocal Rank (MRR) with spaCy word vector:", mrr)


Article ID  17574
Question: Who is the vice chairman of Samsung?
Answer: A special prosecutor investigating the corruption scandal that led to President Park  ?s impeachment summoned the de facto head of Samsung for questioning on Wednesday, calling him a bribery suspect.
Score 0.36449756195127636 

Article ID  17574
Question: Who is the de facto head of Samsung being questioned for bribery?
Answer: SEOUL, South Korea  ?
Score 0.4067497820978088 

Article ID  17574
Question: What scandal led to President Park's impeachment?
Answer: SEOUL, South Korea  ?
Score 0.4681912278963329 

Article ID  17574
Question: What is the name of Samsung's vice chairman?
Answer: A special prosecutor investigating the corruption scandal that led to President Park  ?s impeachment summoned the de facto head of Samsung for questioning on Wednesday, calling him a bribery suspect.
Score 0.36449756195127636 

Article ID  17574
Question: What is the name of the special prosecutor investigating the corruption scan

In [31]:
# Create an instance of TextMatchingUtility
tm_utility = TextMatchingUtility(news_dataset)

# Create an instance of TestUtility
test_utility = TestUtility(tm_utility, test_questions)

# Test with spaCy word vector and article id
article_id = 17574  # Replace with appropriate article ID
mrr = test_utility.evaluate_mrr(article_id, word_vector='spaCy')
print("Mean Reciprocal Rank (MRR) with spaCy word vector:", mrr)


Article ID  17574
Question: Who is the vice chairman of Samsung?
Answer: He is expected to be asked whether   donations that Samsung made to two foundations controlled by Choi   a longtime friend of the president, amounted to bribes, and what role, if any, he played in the decision to give the money.
Score 0.6452350002474918 

Article ID  17574
Question: Who is the de facto head of Samsung being questioned for bribery?
Answer: SEOUL, South Korea  ?
Score 0.737722794448654 

Article ID  17574
Question: What scandal led to President Park's impeachment?
Answer: Neither Samsung nor Mr. Lee responded immediately to the announcement on Wednesday.
Score 0.7442539702783805 

Article ID  17574
Question: What is the name of Samsung's vice chairman?
Answer: The emails contained information about the financial support provided by Samsung, the prosecutor?s office said.
Score 0.6877933896276821 

Article ID  17574
Question: What is the name of the special prosecutor investigating the corruption scan

In [32]:
class QuestionAnsweringSystem:
    def __init__(self, model_name='deepset/bert-base-cased-squad2'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def answer_question(self, question, article_id):
        passage = news_dataset.loc[news_dataset['id'] == article_id, 'processed_article'].iloc[0]
        inputs = self.tokenizer.encode_plus(question, passage, return_tensors='pt', max_length=512, truncation=True, truncation_strategy='longest_first')

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        outputs = self.model(input_ids, attention_mask=attention_mask, return_dict=True)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits) + 1

        input_tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
        answer_tokens = input_ids[0][start_index:end_index]

        for i, token in enumerate(answer_tokens):
            if token == self.tokenizer.cls_token_id:
                start_index += 1
            elif token == self.tokenizer.sep_token_id:
                end_index -= 1
        answer_tokens = input_ids[0][start_index:end_index]

        answer = self.tokenizer.decode(answer_tokens)
        return answer

In [108]:
class TestUtility:
    def __init__(self, test_questions):
        self.test_questions = test_questions

    @staticmethod
    def accuracy(test_questions, predicted_labels):
        if len(test_questions) != len(predicted_labels):
            raise ValueError("Length of test_questions and predicted_labels must be the same.")

        correct = 0
        total = len(test_questions)
        for i in range(total):
            correct_answer = test_questions[i]['answer'].lower()
            predicted_label = predicted_labels[i].lower()
            correct_answer = correct_answer.replace(" ", "")
            print(correct_answer,predicted_label)
            if correct_answer in predicted_label:
                correct += 1
        if total == 0:
            return 0  # Return 0 if there are no test questions
        return correct / total

    @staticmethod
    def mean_reciprocal_rank(test_questions, predicted_labels):
        rr_sum = 0
        total = len(test_questions)
        for i in range(total):
            correct_answer = test_questions[i]['answer'].lower()
            predicted_labels_lower = predicted_labels[i].lower()
            correct_answer = correct_answer.replace(" ", "")
            if correct_answer in predicted_labels_lower:
                rr_sum += 1 / (predicted_labels_lower.index(correct_answer) + 1)
        return rr_sum / total if total != 0 else 0

    def evaluate_performance(self, predicted_labels):
        acc = self.accuracy(self.test_questions, predicted_labels)
        mrr = self.mean_reciprocal_rank(self.test_questions, predicted_labels)

        print("Accuracy:", acc)
        print("Mean Reciprocal Rank:", mrr)

    def get_predicted_labels(self, article_id, qa_model):
        start_time = time.time()
        answers = []
        print_query = []
        for index, question in enumerate(self.test_questions):
            query = qa_model.answer_question(question['query'], article_id)
            predicted_labels = query.replace(" ", "")
            answers.append(predicted_labels)
            print_query.append(query)
        end_time = time.time()
        execution_time = end_time - start_time
        print("Execution time:", execution_time, "seconds")
        return answers



    def get_predicted_labels_all(self, qa_model):
        start_time = time.time()
        answers = []
        print_query = []
        for index, question in enumerate(self.test_questions):
            query = qa_model.answer_question(question['query'], question['passage_id'])
            predicted_labels = query.replace(" ", "")
            answers.append(predicted_labels)
            print_query.append(query)
        end_time = time.time()
        execution_time = end_time - start_time
        print("Execution time:", execution_time, "seconds")
        return answers


In [34]:
article_id = 17574
question = "Who is the vice chairman of Samsung?"

qa_system = QuestionAnsweringSystem()
answer = qa_system.answer_question(question, article_id)

print("Question:", question)
print("Answer:", answer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: Who is the vice chairman of Samsung?
Answer: Jay Y. Lee


In [35]:
test_questions

[{'query': 'Who is the vice chairman of Samsung?', 'answer': 'Jay Y. Lee'},
 {'query': 'Who is the de facto head of Samsung being questioned for bribery?',
  'answer': 'Jay Y. Lee'},
 {'query': "What scandal led to President Park's impeachment?",
  'answer': 'corruption scandal'},
 {'query': "What is the name of Samsung's vice chairman?",
  'answer': 'Jay Y. Lee'},
 {'query': 'What is the name of the special prosecutor investigating the corruption scandal?',
  'answer': '[SEP]'},
 {'query': 'Who is on trial at the Constitutional Court?',
  'answer': 'Ms. Park'},
 {'query': "What is the name of the special prosecutor's office spokesman?",
  'answer': 'Lee'},
 {'query': 'What charges were filed against Ms. Choi by state prosecutors?',
  'answer': 'coercing 53 big businesses'},
 {'query': "What organization's support was crucial for the merger of two Samsung affiliates?",
  'answer': 'National Pension Service'},
 {'query': "What amount did Samsung contribute to Ms. Choi's winter sports pr

In [36]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system)

# Now you can pass these predicted labels to evaluate_performance method to evaluate performance
test_utility.evaluate_performance(predicted_labels)

Execution time: 1.341726303100586 seconds
jayy.lee jayy.lee
jayy.lee jayy.lee
corruptionscandal jayy.lee,vicechairmansamsung,questionedthursday,accordingspecialprosecutor'soffice,recommendedalsoinvestigatedsuspicionperjury.mr.leeeffectivelyrunsamsung,southkorea'slargestconglomeratesonchairman,leeincapacitatedhealthproblems.expectedaskedwhetherdonationsamsungmadetwofoundationcontrolledchoilongtimefriendpresident,amountedbribes,role,any,playeddecisiongivemoney.investigatorsspecialprosecutor'sofficequestionedseniorsamsungexecutivesuspectbriberyaccusations.neithersamsungmr.leerespondedimmediatelyannouncementwednesday.allegationsms.parkhelpedms.choiextortmillionbribesamsungcompanyheartcorruptionscandallednationalassembly'svoteimpeachlastmonth.sincethen,ms.park'spowersuspended,trialconstitutionalcourt,ultimatelydecidewhetherendpresidency.lastmonth,mr.leetestifiednationalassemblyhearinginvolveddecisionsamsungmakedonations.alsosaiddonationvoluntary,suggestingcompanyvictimextortion,participantb

In [37]:
### ROBERTA

In [38]:
article_id = 17574
question = "Who is the vice chairman of Samsung?"

qa_system_roberta = QuestionAnsweringSystem(model_name= "deepset/roberta-base-squad2")
answer = qa_system_roberta.answer_question(question, article_id)

print("Question:", question)
print("Answer:", answer)

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Question: Who is the vice chairman of Samsung?
Answer:  Jay Y. Lee


In [39]:
true_labels = [item['answer'] for item in test_questions]
print(true_labels)

['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']


In [40]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system_roberta)

print(true_labels)
test_utility.evaluate_performance(predicted_labels)# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_system_roberta)

print(true_labels)
test_utility.evaluate_performance(predicted_labels)

Execution time: 1.4707188606262207 seconds
['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']
jayy.lee jayy.lee
jayy.lee jayy.lee
corruptionscandal corruption
jayy.lee jayy.lee
[sep] 
ms.park 
lee 
coercing53bigbusinesses coercing53bigbusinesses
nationalpensionservice nationalpensionservice
$1.3million $1.3million
Accuracy: 0.6
Mean Reciprocal Rank: 0.6
Execution time: 1.471409797668457 seconds
['Jay Y. Lee', 'Jay Y. Lee', 'corruption scandal', 'Jay Y. Lee', '[SEP]', 'Ms. Park', 'Lee', 'coercing 53 big businesses', 'National Pension Service', '$1.3 million']
jayy.lee jayy.lee
jayy.lee jayy.lee
corruptionscandal corruption
jayy.lee jayy.lee
[sep] 
ms.park 
lee 
coercing53bigbusinesses coercing53bigbusinesses
nationalpensionservice nationalpensionservice
$1.3million $1.3million
Accuracy: 0.6
Mean Reciprocal Rank: 0.6


In [42]:
from transformers import BertForQuestionAnswering, BertTokenizer

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', resume_download=True)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', resume_download=True)

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [87]:
import heapq

class QuestionAnsweringModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def tokenize(self, question, answer_text):
        '''
        Tokenizes the input question and answer_text and sets the segment IDs.
        '''
        # Apply the tokenizer to the encode text, treating them as a question, answer_text pair.
        input_ids = self.tokenizer.encode(question, answer_text, max_length=512, truncation=True, truncation_strategy='only_second')

        # Report how long the input sequence is.
        # print('Query has {:,} tokens.\n'.format(len(input_ids)))

        # Search the input_ids for the first instance of the `[SEP]` token.
        sep_index = input_ids.index(self.tokenizer.sep_token_id)

        # The number of segment A tokens includes the [SEP] token itself.
        num_seg_a = sep_index + 1

        # The remainder are segment B.
        num_seg_b = len(input_ids) - num_seg_a

        # Construct the list of 0s and 1s.
        segment_ids = [0] * num_seg_a + [1] * num_seg_b

        # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

        return input_ids, segment_ids

    def evaluate(self, input_ids, segment_ids):
        '''
        Evaluates the input question and answer_text using the model.
        '''
        # Run the question through the model.
        model_scores = self.model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
        start_scores = model_scores.start_logits
        end_scores = model_scores.end_logits

        return start_scores, end_scores

    def reconstruct_answer(self, input_ids, start_scores, end_scores):
        '''
        Reconstructs the answer from the model's output.
        '''
        # Find the tokens with the highest `start` and `end` scores.
        answer_start = torch.argmax(start_scores)
        answer_end = torch.argmax(end_scores)

        # Get the string versions of the input tokens.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

        # Start with the first token.
        answer = tokens[answer_start]

        # Select the remaining answer tokens and join them with whitespace.
        for i in range(answer_start + 1, answer_end + 1):
            # If it's a subword token, then recombine it with the previous token.
            if tokens[i][0:2] == '##':
                answer += tokens[i][2:]
            # Otherwise, add a space then the token.
            else:
                answer += ' ' + tokens[i]

        return answer

    def answer_question(self, question, article_id):
        '''
        Takes a `question` string and an `answer_text` string (which contains the
        answer), and identifies the words within the `answer_text` that are the
        answer. Prints them out.
        '''
        answer_text = news_dataset.loc[news_dataset['id'] == article_id, 'processed_article'].iloc[0]
        input_ids, segment_ids = self.tokenize(question, answer_text)
        start_scores, end_scores = self.evaluate(input_ids, segment_ids)
        answer = self.reconstruct_answer(input_ids, start_scores, end_scores)
        return answer

    def answer_question_top3(self, question, article_id):
        answer_text = news_dataset.loc[news_dataset['id'] == article_id, 'processed_article'].iloc[0]
        input_ids, segment_ids = self.tokenize(question, answer_text)
        start_scores, end_scores = self.evaluate(input_ids, segment_ids)
        answers = []
        max_answers = 3
        # Find the top answer candidates
        for _ in range(max_answers):
            answer_start = torch.argmax(start_scores)
            answer_end = torch.argmax(end_scores)
            # Get the string versions of the input tokens.
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
            answer = tokens[answer_start]
            for i in range(answer_start + 1, answer_end + 1):
                if tokens[i][0:2] == '##':
                    answer += tokens[i][2:]
                else:
                    answer += ' ' + tokens[i]
            confidence_score = start_scores[0][answer_start] + end_scores[0][answer_end]
            answers.append((answer, confidence_score.item()))
            # Mask the used tokens
            start_scores[0][answer_start] = end_scores[0][answer_end] = float('-inf')
        # Sort answers by confidence scores
        top_answers = heapq.nlargest(max_answers, answers, key=lambda x: x[1])
        return [ans[0] for ans in top_answers]


In [86]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

qa_model = QuestionAnsweringModel(model, tokenizer)

# Define your question and article ID
question = "Who is the vice chairman of Samsung?"
article_id =17574

# Call the answer_question method to get the answer
answer = qa_model.answer_question_top3(question, article_id)
print("Question:", question)
print("Answer:", answer)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Question: Who is the vice chairman of Samsung?
Answer: ['jay y . lee', 'lee ,', '[SEP]']


In [88]:
def predicted_labels(test_questions):
    start_time = time.time()
    answers = []
    article_id = 17574
    for index, question in enumerate(test_questions):
        query = qa_model.answer_question(question['query'], article_id)
        predicted_labels = query.replace(" ", "")
        answers.append(predicted_labels)
    return answers
predicted_labels = predicted_labels(test_questions)
print(predicted_labels)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

['jayy.lee', 'jayy.lee', 'corruption', 'jayy.lee', '[SEP]', 'ms.park', 'lee', 'coercing53bigbusinesses', "[CLS]whatorganization'ssupportwascrucialforthemergeroftwosamsungaffiliates?[SEP]", '$1.3million']


In [89]:
test_questions

[{'query': 'Who is the vice chairman of Samsung?', 'answer': 'Jay Y. Lee'},
 {'query': 'Who is the de facto head of Samsung being questioned for bribery?',
  'answer': 'Jay Y. Lee'},
 {'query': "What scandal led to President Park's impeachment?",
  'answer': 'corruption scandal'},
 {'query': "What is the name of Samsung's vice chairman?",
  'answer': 'Jay Y. Lee'},
 {'query': 'What is the name of the special prosecutor investigating the corruption scandal?',
  'answer': '[SEP]'},
 {'query': 'Who is on trial at the Constitutional Court?',
  'answer': 'Ms. Park'},
 {'query': "What is the name of the special prosecutor's office spokesman?",
  'answer': 'Lee'},
 {'query': 'What charges were filed against Ms. Choi by state prosecutors?',
  'answer': 'coercing 53 big businesses'},
 {'query': "What organization's support was crucial for the merger of two Samsung affiliates?",
  'answer': 'National Pension Service'},
 {'query': "What amount did Samsung contribute to Ms. Choi's winter sports pr

In [47]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions)

# Call get_predicted_labels method to obtain predicted labels
article_id = 17574

predicted_labels = test_utility.get_predicted_labels(article_id, qa_model)

test_utility.evaluate_performance(predicted_labels)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Execution time: 3.814912796020508 seconds
jayy.lee jayy.lee
jayy.lee jayy.lee
corruptionscandal corruption
jayy.lee jayy.lee
[sep] [sep]
ms.park ms.park
lee lee
coercing53bigbusinesses coercing53bigbusinesses
nationalpensionservice [cls]whatorganization'ssupportwascrucialforthemergeroftwosamsungaffiliates?[sep]
$1.3million $1.3million
Accuracy: 0.8
Mean Reciprocal Rank: 0.8


In [109]:
for i, question in enumerate(test_questions_all):
    answer = qa_model.answer_question(question['query'], question['passage_id'])
    # Print the question, passage, and answer
    print("Article no.{}:{}\n".format(i+1, question['passage_id']))
    print("Question {}: {}\n".format(i+1, question['query']))
    print("Predicted Answer {}: {}\n".format(i+1, answer))
    print("Actual Answer {}: {}\n".format(i+1, question['answer']))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.1:17904

Question 1: What movie is seen as a leading contender for Best Picture at the 89th Academy Awards?

Predicted Answer 1: young black man miami

Actual Answer 1: Moonlight



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.2:18166

Question 2: Who is President Trump encouraging to invoke the nuclear option regarding the confirmation of his nominee to the Supreme Court?

Predicted Answer 2: mitch mcconnell

Actual Answer 2: Mitch McConnell



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.3:18166

Question 3: What tactic did President Trump suggest Senate Majority Leader Mitch McConnell use to confirm Neil Gorsuch to the Supreme Court?

Predicted Answer 3: nuclear option

Actual Answer 3: Nuclear option



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.4:18055

Question 4: What is the name of Talea's mother?

Predicted Answer 4: trenicia childs

Actual Answer 4: Trenicia



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.5:17776

Question 5: Who is the Roman Catholic Archbishop of New York?

Predicted Answer 5: cardinal dolan

Actual Answer 5: Cardinal Dolan



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.6:17776

Question 6: How many religious leaders are scheduled to participate in Donald J. Trump's inauguration ceremony?

Predicted Answer 6: six

Actual Answer 6: Six

Article no.7:18079

Question 7: What is the name of the atoll where Tim Snider and other veterans were tasked with cleaning up nuclear fallout?

Predicted Answer 7: enewetak

Actual Answer 7: Enewetak Atoll

Article no.8:17895

Question 8: What is the biggest problem identified by Energy Department reports regarding the cleanup of Enewetak Atoll?

Predicted Answer 8: [CLS] what is the biggest problem identified by energy department reports regarding the cleanup of enewetak atoll ? [SEP]

Actual Answer 8: Runit Island



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Article no.9:18253

Question 9: According to Bonsor, what can travelers do to avoid feeling overwhelmed in large cities?

Predicted Answer 9: [CLS] according to bonsor , what can travelers do to avoid feeling overwhelmed in large cities ? [SEP] beach spa vacation typically associated relaxation urban getaway tend go , go , go , always case , according michael bonsor , hotel manager rosewood london . absolutely head city relaxing vacation , said . here , mr . bonsor share tip trip busy city setting : set mood beforehand use journey destination opportunity get vacation mode : flying , use pair headphone order glass champagne another drink enjoy get board . watch movie listen music ipad airline ' s entertainment system . driving , plan hit road rush hour , spend hour sitting traffic playlist hand favorite song carry indulgent snack bar good chocolate . idea , mr . bonsor said , start unwinding vacation starts . pick manageable city large city like new york city , paris tokyo usually lend 

In [111]:
# Instantiate TestUtility object with test questions
test_utility = TestUtility(test_questions_all)
predicted_labels = test_utility.get_predicted_labels_all(qa_model)
test_utility.evaluate_performance(predicted_labels)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Execution time: 3.726841688156128 seconds
moonlight youngblackmanmiami
mitchmcconnell mitchmcconnell
nuclearoption nuclearoption
trenicia treniciachilds
cardinaldolan cardinaldolan
six six
enewetakatoll enewetak
runitisland [cls]whatisthebiggestproblemidentifiedbyenergydepartmentreportsregardingthecleanupofenewetakatoll?[sep]
choosesmallercities [cls]accordingtobonsor,whatcantravelersdotoavoidfeelingoverwhelmedinlargecities?[sep]beachspavacationtypicallyassociatedrelaxationurbangetawaytendgo,go,go,alwayscase,accordingmichaelbonsor,hotelmanagerrosewoodlondon.absolutelyheadcityrelaxingvacation,said.here,mr.bonsorsharetiptripbusycitysetting:setmoodbeforehandusejourneydestinationopportunitygetvacationmode:flying,usepairheadphoneorderglasschampagneanotherdrinkenjoygetboard.watchmovielistenmusicipadairline'sentertainmentsystem.driving,planhitroadrushhour,spendhoursittingtrafficplaylisthandfavoritesongcarryindulgentsnackbargoodchocolate.idea,mr.bonsorsaid,startunwindingvacationstarts.pickmana

In [116]:
def ask_question():
    article_id = input("Enter the article ID (type 'exit' to quit): ")
    if article_id.lower() == 'exit':
        print("Exiting the program...")
        return
    try:
        article_id = int(article_id)
        if article_id not in news_dataset['id'].tolist():
            print("Invalid article number.")
            ask_question()
            return
        question = input("Ask your question: ")
        answer = qa_model.answer_question(question, article_id)
        print("Answer:", answer)
    except ValueError:
        print("Article ID should be an integer. Please try again.")
    ask_question()

In [117]:
ask_question()

Enter the article ID (type 'exit' to quit): 17574
Ask your question: Who is the vice chairman of Samsung?


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: jay y . lee
Enter the article ID (type 'exit' to quit): 17776
Ask your question: Who is the Roman Catholic Archbishop of New York?


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: cardinal dolan
Enter the article ID (type 'exit' to quit): What is the name of the atoll where Tim Snider and other veterans were tasked with cleaning up nuclear fallout?
Article ID should be an integer. Please try again.
Enter the article ID (type 'exit' to quit): 17776
Ask your question:  How many religious leaders are scheduled to participate in Donald J. Trump's inauguration ceremony?


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: six
Enter the article ID (type 'exit' to quit): 113315
Invalid article number.
Enter the article ID (type 'exit' to quit): 17574
Ask your question: What amount did Samsung contribute to Ms. Choi's winter sports program?


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer: $ 1 . 3 million
Enter the article ID (type 'exit' to quit): exit
Exiting the program...


## B. References

## C. Appendix