# Assignment 2 
### Kusal Bista

In [25]:
# Libraries for reading data
import random
import numpy as np
import pandas as pd 
import glob
import json
from tqdm import tqdm

# Libraries for pre-processing
import re
import nltk

from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

In [26]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [27]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### 1 Reading dataset and pre-processing

In [28]:
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [29]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [30]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


### 1.2 Handling missing value

In [31]:
print("Missing value")
print(news_dataset.isnull().sum())

Missing value
id         0
author     6
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [32]:
# Handling missing value
news_dataset['author'] = news_dataset['author'].fillna('No author')
# checking missing value after handling missing value

In [33]:
print("After handling missing value")
print(news_dataset.isnull().sum())

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [34]:
news_dataset = news_dataset.drop_duplicates(subset=['article'], keep='first').reset_index(drop=True)
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [35]:
def pre_process(data):
    # Define stop words
    stop_words = set(stopwords.words('english'))
    stop_words.update(["This", "The", "the"])

    s = " \[(?=.*\d).*?\]" 

    # Lemmatization and removal of stopwords
    lemmatizer = WordNetLemmatizer()

    result = []
    for text in data:
        # Clean text
        # Remove non-ASCII characters
        text = ''.join([char for char in text if ord(char) < 128])

        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove question mark problems
        text = re.sub(r'(\s\?)',' ',text)
        text = re.sub(r"\b\?\b", "\'", text)
        text = re.sub(r"(,\?)",",", text)
        text = re.sub(r"\?+", "?", text)
        text = text.strip()

        # Lemmatization and removal of stopwords
        processed_text = " ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", text).split() if word.lower() not in stop_words])

        result.append(processed_text)

    return result

In [36]:
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [37]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...,PARIS Islamic State driven ancient city Palmyr...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...,Angels everywhere Mu'iz family's apartment Bro...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,Finally. Second Avenue subway opened New York ...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...,WASHINGTON time Republicans. tumultuous decade...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...","Megyn Kelly, shift Fox News NBC host daily day..."


In [38]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):
    # Initialize a list to store chunked text
    chunked_texts = []
    # Iterate through each text in the input data
    for idx, text in enumerate(data_text):
        words = text.split()
        # Chunk the text data
        for i in range(0, len(words), chunk_size - chunk_overlap):
            chunk = ' '.join(words[i:i+chunk_size])
            chunked_texts.append((str(data_index[idx]) + str(i), chunk))

    # Convert the list of chunked text into a DataFrame
    chunk_df = pd.DataFrame(chunked_texts, columns=['id', 'processed_article'])
    return chunk_df

In [45]:
# Create training dataset
train_chunk_size = 100
train_overlap_size = 50
data_chunk_train = chunk_text(news_dataset[:5]['id'], news_dataset[:5]['processed_article'], train_chunk_size, train_overlap_size)

# Create testing dataset
test_chunk_size = 500
test_overlap_size = 50
data_chuck_test = news_dataset[5:].reset_index(drop=True)
data_chuck_test = chunk_text(data_chuck_test['id'], data_chuck_test['processed_article'], test_chunk_size, test_overlap_size)

In [40]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')


In [47]:
# Vectorize data
def vectorize_data(text_list):
    encoded_data = model.encode(text_list)
    return np.asarray(encoded_data.astype('float32'))

encoded_data_train = vectorize_data(data_chunk_train['processed_article'].tolist())
encoded_data_test = vectorize_data(data_chuck_test['processed_article'].tolist())

In [61]:
import pickle

encoded_data = np.random.rand(100, 768)  # Replace this with your actual encoded data
data_chunk = data_chuck_test  # Replace this with your actual data chunk

# Save vector database
index = {'index': encoded_data, 'ids': np.array(range(0, len(data_chunk)))}

# Save index using pickle
with open('data_article_index.pkl', 'wb') as f:
    pickle.dump(index, f)

In [50]:
questions = ['Who is the author of the memoir "Nicotine"?',
             'What skills are inmates learning in the innovative program at Lecce Penitentiary?',
             'Who surrendered to the authorities for changing the Hollywood sign to "Hollyweed"?',
             'What is the name of the journalist and archivist who advocated for the online publication of the C.I.A. files?',
             'Which leader of the fringe movement embracing white nationalism was punched in the face during the protests in Washington?',
             'Who narrated the documentary "I Am Not Your Negro"?',
             'Who is the billionaire restaurant owner nominated to head the Labor Department?',
             "Who is the federal judge that ordered President Trump's golf resort to pay $5.7 million for refusing to refund deposits to members?",
             'What business relationship between Donald Trump and Elon Musk surprised many people?',
             "Who vowed to take executive action on a nearly daily basis to unravel his predecessor's legacy and begin enacting his own agenda?",
             "What was Hubert Edward Spires's discharge status changed to by the Air Force Board for the Correction of Military Records?",
             'What advice was given to Mr. LaCasse regarding restructuring his student loans to aid his retirement savings?',
             "What word did The New York Times ultimately choose to describe President Trump's assertion about illegal voting in the headline?",
             "What is the title of Vladimir Nabokov's controversial novel published in 1955?",
             "What adversity have the Mu'iz family faced?",
             "Why did Jennifer Holliday decide to withdraw from performing at the inauguration concert for Donald Trump?",
             "What did Donald Trump concede for the first time during his news conference at Trump Tower?",
             "Who founded Airline Ambassadors International?",
             "Where was the suspect in the Istanbul nightclub attack arrested?",
             "What significant event involving Emmett Till occurred on August 28, 1955?",
             "Who recently sold his personal papers to the Beinecke Rare Book Manuscript Library at Yale University?",
             "Who was Don Ciccone, and what significant roles did he play in the music industry?",
             "Who is celebrating their sapphire jubilee, marking 65 years on the British throne?",
             "Who performed a soulful rendition of 'America the Beautiful' at the Lincoln Memorial concert?",
             "Who challenged Germany's national atonement for the Holocaust and Nazi crimes during a speech in Dresden?",
             "What trend did Jerry Silverman, the president and chief executive of the Jewish Federations of North America, describe as disturbing?",
             "What significant event in Tom Casperson's political career was influenced by the DeVos family's opposition?",
             "Who has been involved in financial mismanagement issues according to the article?",
             "Who won the Australian Open for the fifth time, becoming the oldest man to win a Grand Slam singles title in 45 years?",
             "What significant event occurred in Greenwood, Miss., in 1963 that was documented by Claude Sitton?",
             "Who announced their resignation from Sony's entertainment division to focus on Snapchat?",
             "What was the incident involving a Palestinian driver and Israeli soldiers that occurred in Jerusalem?",
             "How many people did Governor Peter Shumlin of Vermont pardon for misdemeanor marijuana convictions?",
             "Who attended the Women's March on Washington?",
             "What was the outcome of the 1998 Australian Open match between Venus and Serena Williams?",
             "What role did Steve Bannon receive in the National Security Council?",
             "What did President Trump emphasize in his Inaugural Address?"]

In [51]:
doc_id = [17552, 17382, 17547, 17778, 17841, 18228, 18443, 18170, 17980, 17838, 17544,
          17434, 17972, 18163, 17292, 17645, 17556, 18382, 17701, 18084, 17372, 18054,
          18352, 17787, 17765, 17502, 17500, 18174, 18130, 18282, 17629, 17507, 17406,
          17863, 17990, 18122, 17837]

In [52]:
import pandas as pd
test_data = []
for i in range(len(questions)):
    test_data.append([doc_id[i], questions[i]])
test_data = pd.DataFrame(test_data, columns=['doc_id', 'question'])

In [53]:
test_data

Unnamed: 0,doc_id,question
0,17552,"Who is the author of the memoir ""Nicotine""?"
1,17382,What skills are inmates learning in the innova...
2,17547,Who surrendered to the authorities for changin...
3,17778,What is the name of the journalist and archivi...
4,17841,Which leader of the fringe movement embracing ...
5,18228,"Who narrated the documentary ""I Am Not Your Ne..."
6,18443,Who is the billionaire restaurant owner nomina...
7,18170,Who is the federal judge that ordered Presiden...
8,17980,What business relationship between Donald Trum...
9,17838,Who vowed to take executive action on a nearly...


In [54]:
encoded_data_test

array([[ 0.1513082 , -0.13743426, -0.7632094 , ..., -0.04565372,
        -0.36991662,  0.4084309 ],
       [-0.07812551, -0.04014127, -0.63176763, ...,  0.03896321,
        -0.82238185,  0.16641286],
       [-0.20078646,  0.32540423, -0.89036834, ...,  0.5825768 ,
        -0.34565404,  0.16913635],
       ...,
       [ 0.28563666,  0.00128658, -0.7693261 , ...,  0.4912801 ,
        -0.30372304,  0.48328856],
       [-0.1693944 ,  0.06060646, -0.5516199 , ..., -0.03478811,
        -0.01499029,  0.3165173 ],
       [ 0.1363754 , -0.1558264 , -0.6475471 , ...,  0.18430465,
        -0.20248297,  0.06620441]], dtype=float32)

In [62]:
import pandas as pd
import time

from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')
# cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

with open('data_article_index.pkl', 'rb') as f:
    loaded_index = pickle.load(f)

# Access loaded index
loaded_encoded_data = loaded_index['index']
index = loaded_index['ids']
data_chunk = encoded_data_test

In [63]:
index

array([   0,    1,    2, ..., 1907, 1908, 1909])

In [64]:
def fetch_data_info(dataframe_idx, score):

    '''Data should be data_chunk'''
    info = data_chunk.iloc[dataframe_idx]
    meta_dict = {}
    meta_dict['id'] = info['id']
    meta_dict['article'] = info['article']
    meta_dict['score'] = score

    return meta_dict

In [65]:
# def search(query, top_k, index, model):

#     query_vector = model.encode([query])
#     top_k = index.search(query_vector, top_k)

#     top_k_ids = list(top_k[1].tolist()[0])
#     score = list(top_k[0].tolist()[0])

#     results =  [fetch_data_info(idx, score) for idx, score in zip(top_k_ids, score)]

#     return results

In [73]:
def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join([char for char in text if ord(char) < 128])

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove question mark problems
    text = re.sub(r'(\s\?)',' ',text)
    text = re.sub(r"\b\?\b", "\'", text)
    text = re.sub(r"(,\?)",",", text)
    text = re.sub(r"\?+", "?", text)
    text = text.strip()

    return text

In [78]:
def query_answer(query, query_id, document, cross_model):
    query = clean_text(query)
    document = clean_text(document)

    # Prepare model input
    model_input = [[query, document]]

    # Predict score using cross_model
    score = cross_model.predict(model_input)[0]

    result_dataset = [{
        'question_id': query_id,
        'rank': 1,
        'id': query_id // 10,  # Assuming query_id represents document ID
        'score': score
    }]

    return result_dataset


In [67]:
def mrr_score(answers, queries):
    '''answers is a list of list of ids'''
    score = []
    for i, answer in enumerate(answers):
        for j, index in enumerate(answer):
            if index == queries[i]:
                score.append(1 / (j + 1))
                break
        if len(score) < (i + 1):
            score.append(0)
    return sum(score) / len(score) if len(score) > 0 else 0

In [68]:
def accuracy_score(answers, queries):
    '''answers is a list of list of ids'''
    score = []
    for i, answer in enumerate(answers):
        for index in answer:
            if index == queries[i]:
                score.append(1)
                break
        if len(score) != i + 1:
            score.append(0)
    return sum(score) / len(score) if len(score) > 0 else 0

In [69]:
cross_models = ['cross-encoder/ms-marco-MiniLM-L-12-v2',
                'cross-encoder/ms-marco-MiniLM-L-6-v2',
                'cross-encoder/ms-marco-MiniLM-L-4-v2',
                'cross-encoder/ms-marco-MiniLM-L-2-v2',
                'cross-encoder/ms-marco-TinyBERT-L-6',
                'cross-encoder/ms-marco-TinyBERT-L-2-v2']

In [70]:
question_list = pd.read_csv('question_test_data_2.csv')
question_list

Unnamed: 0,doc_id,question
0,17552,"Who is the author of the memoir ""Nicotine""?"
1,17382,What skills are inmates learning in the innova...
2,17547,Who surrendered to the authorities for changin...
3,17778,What is the name of the journalist and archivi...
4,17841,Which leader of the fringe movement embracing ...
5,18228,"Who narrated the documentary ""I Am Not Your Ne..."
6,18443,Who is the billionaire restaurant owner nomina...
7,18170,Who is the federal judge that ordered Presiden...
8,17980,What business relationship between Donald Trum...
9,17838,Who vowed to take executive action on a nearly...


In [81]:
def test_model(question_list, cross_model):
    start_time = time.time()

    answers = []
    for id, question in enumerate(question_list['question']):
        answer = query_answer(question, id, cross_model)
        answers.append(answer)

    answers = [answer for sublist in answers for answer in sublist]
    answers = pd.DataFrame(answers)
    
    question_article_ids = {}
    for i, question_id in enumerate(answers['question_id']):
        if question_id not in question_article_ids:
            question_article_ids[question_id] = [answers['id'][i]]
        else:
            question_article_ids[question_id].append(answers['id'][i])

    reranked_result = [question_article_ids[x] for x in question_article_ids]

    accuracy = accuracy_score(reranked_result, question_list['doc_id'])
    mrr = mrr_score(reranked_result, question_list['doc_id'])
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    result = {'accuracy_score': accuracy, 
              'mrr_score': mrr,
              'time': elapsed_time}

    return result

In [82]:
test_result = []

for cross_model_name in cross_models:
    cross_model = CrossEncoder(cross_model_name) 

    result = test_model(question_list, cross_model)

    result['cross_model'] = cross_model_name
    
    test_result.append(result)

test_result = pd.DataFrame(test_result)

TypeError: query_answer() missing 1 required positional argument: 'cross_model'

In [83]:
test_result.sort_values(by='mrr_score', ascending=False)

AttributeError: 'list' object has no attribute 'sort_values'

### 1.3 Data pre-processing

In [13]:
import json
from pprint import pprint
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import _pickle as pkl

In [15]:
import torch

In [16]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

In [60]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [115]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text, truncation=True, max_length=512)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model.
    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True) 

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Answer: "' + answer + '"')

In [116]:
import textwrap

# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=80) 

answer_text = news_dataset['processed_article'].iloc[0]


print(wrapper.fill(answer_text))
print(len(answer_text))

PARIS Islamic State driven ancient city Palmyra March, Yves Ubelmann got call
Syria's director antiquity come hurry. architect training, Mr. Ubelmann, 36,
worked Syria country engulfed war. special urgency kind work youthful team
architects, mathematician designer cramped office Paris: producing digital copy
threatened historical sites. Palmyra, part already destroyed Islamists deemed
monument idolatrous, still rigged explosives. Houmam Saad, Syrian colleague,
spent four day flying drone robot camera crumbled arch temples. Drones four six
rotor hover really close register structural details, every crack hole, take
precise measurements, said Mr. Ubelmann, founded company Iconem. stuff architect
archaeologist need. need new push virtual preservation scientists, archaeologist
others, like Mr. Ubelmann, compiling large scale. record could used create
computer model would show monument endangered historical site might one day
restored, repaired reconstructed. special interest today ancient 

In [117]:
question = "What motivated Yves Ubelmann and his team to urgently document threatened historical sites like Palmyra, and what methods did they employ?"
answer_question(question, answer_text)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Query has 512 tokens.

Answer: "team"
