## Loading the Dataset

In [1]:
from datasets import load_dataset

# Download SQuAD and immediately drop the "id" and "title" columns, which the
# rest of the notebook never reads.
raw_datasets = load_dataset("squad").remove_columns(["id", "title"])

def prepare_data(example):
    """Flatten the first reference answer of an example into character offsets.

    Reads the first entry of ``example["answers"]["text"]`` and
    ``example["answers"]["answer_start"]`` and stores two new keys on the
    example: ``answer_start`` (copied as-is) and ``answer_end``
    (``answer_start`` plus the length of the answer text, i.e. an exclusive
    end offset). The example is modified in place and returned.
    """
    answers = example["answers"]
    first_text = answers["text"][0]
    first_start = answers["answer_start"][0]

    example["answer_start"] = first_start
    example["answer_end"] = first_start + len(first_text)
    return example

# Apply prepare_data to every example of each split, then drop the nested
# "answers" column that the new answer_start / answer_end columns replace.
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
# Bare last expression so the notebook displays the train split summary.
raw_datasets["train"]

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/87599 [00:00<?, ?ex/s]

  0%|          | 0/10570 [00:00<?, ?ex/s]

Dataset({
    features: ['context', 'question', 'answer_start', 'answer_end'],
    num_rows: 87599
})

In [2]:
## 1. printing data instance

# Fetch the first training example once; every `raw_datasets['train'][0]`
# expression is a separate row lookup on the dataset.
first_example = raw_datasets['train'][0]
start = first_example['answer_start']
end = first_example['answer_end']

print(f"Context: {first_example['context']}")
print(f"Question: {first_example['question']}")
print(f"Answer start: {start}")
print(f"Answer end: {end}")
# Slicing the context with the stored offsets should reproduce the answer text.
print(f"\nAnswer: {first_example['context'][start:end]}")


Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer start: 515
Answer end: 541

Answer: Saint Bernadette Soubirous


In [3]:
# 2. Converting to pandas DataFrame

import pandas as pd
# Dataset.to_pandas() converts the split directly; handing the Dataset to
# pd.DataFrame() instead makes pandas iterate over the examples in Python.
df = raw_datasets['train'].to_pandas()
print(df.head())

                                             context  \
0  Architecturally, the school has a Catholic cha...   
1  Architecturally, the school has a Catholic cha...   
2  Architecturally, the school has a Catholic cha...   
3  Architecturally, the school has a Catholic cha...   
4  Architecturally, the school has a Catholic cha...   

                                            question  answer_start  answer_end  
0  To whom did the Virgin Mary allegedly appear i...           515         541  
1  What is in front of the Notre Dame Main Building?           188         213  
2  The Basilica of the Sacred heart at Notre Dame...           279         296  
3                  What is the Grotto at Notre Dame?           381         420  
4  What sits on top of the Main Building at Notre...            92         126  


In [4]:
## 3. Adding the answer_sent_index and context_sentences columns to the dataframe

def _locate_answer_sentence(context, answer_start, answer_end, separator=". "):
    """Split `context` into sentences and find the one holding the answer span.

    Sentence offsets are accumulated while walking the split result instead of
    being recovered with `context.find(sentence)`. `find` returns the FIRST
    occurrence of the sentence text, which is the wrong position whenever a
    sentence is repeated or also appears inside an earlier sentence.

    args:
        context (str) : the full paragraph
        answer_start (int) : character offset where the answer begins
        answer_end (int) : exclusive character offset where the answer ends
        separator (str) : the string sentences are split on

    returns:
        tuple(list[str], int) : the sentences, and the index of the first
            sentence that fully contains [answer_start, answer_end), or -1
            when no single sentence contains it (e.g. the answer spans a
            sentence boundary).
    """
    sentences = context.split(separator)
    sentence_start = 0
    for j, sentence in enumerate(sentences):
        sentence_end = sentence_start + len(sentence)
        if sentence_start <= answer_start and answer_end <= sentence_end:
            return sentences, j
        # The next sentence begins right after the separator that split() removed.
        sentence_start = sentence_end + len(separator)
    return sentences, -1


answer_sent_index = []
context_sentences = []

# Iterate over the three needed columns directly; this avoids building a
# Series per row the way iterrows() does.
for context, answer_start, answer_end in zip(df['context'], df['answer_start'], df['answer_end']):
    sentences, sentence_index = _locate_answer_sentence(context, answer_start, answer_end)
    context_sentences.append(sentences)
    answer_sent_index.append(sentence_index)

df['answer_sent_index'] = answer_sent_index
df['context_sentences'] = context_sentences

In [5]:
# Confirm that answer_sent_index and context_sentences now sit next to the
# four original columns.
df.columns

Index(['context', 'question', 'answer_start', 'answer_end',
       'answer_sent_index', 'context_sentences'],
      dtype='object')

# Unsupervised Learning

## Jaccard Similarity

In [20]:
import nltk
import numpy as np

def get_jaccard_prediction(df_squad):
    """
    Identify the answer sentence as one that has the largest Jaccard overlap with the input question.

    The question and every candidate sentence are tokenized with
    nltk.tokenize.word_tokenize and compared as sets of tokens:
    |question ∩ sentence| / |question ∪ sentence|. Tokens are compared exactly
    as tokenized (no lower-casing or stop-word removal).

    args:
        df_squad (pd.DataFrame) : a copy of the SQuAD dataset with "question" (str) and
            "context_sentences" (list of str) columns; it is modified in place

    returns:
        pd.DataFrame : the input dataframe with one additional column, "jaccard_prediction",
            holding the index of the best-scoring sentence for each row
    """
    y_hat = []
    for question, sentences in zip(df_squad['question'], df_squad['context_sentences']):
        question_tokens = set(nltk.tokenize.word_tokenize(question))
        jaccard_list = []
        for sent in sentences:
            sentence_tokens = set(nltk.tokenize.word_tokenize(sent))
            union = len(question_tokens | sentence_tokens)
            if union:
                jaccard = len(question_tokens & sentence_tokens) / union
            else:
                # Both token sets are empty: score them as identical rather
                # than dividing by zero.
                jaccard = 1
            jaccard_list.append(jaccard)
        # np.argmax returns the first index when several sentences tie.
        y_hat.append(int(np.argmax(jaccard_list)))

    df_squad['jaccard_prediction'] = y_hat
    return df_squad

In [23]:
# Evaluate on a copy of the first 1000 rows so the prediction column is not
# written into the shared `df`.
jaccard_df = df.head(1000).copy()
jaccard_df = get_jaccard_prediction(jaccard_df)
# Fraction of rows whose predicted sentence index equals the labelled one.
jaccard_accuracy = (jaccard_df['jaccard_prediction'] == jaccard_df["answer_sent_index"]).values.mean()
# Fixed precision keeps binary floating-point noise out of the printed percentage.
print(f"Accuracy of Jaccard Prediction on sentence indices: {jaccard_accuracy * 100:.1f}%")

Accuracy of Jaccard Prediction on sentence indices: 63.2%


## TF-IDF Vectors

In [54]:
from nltk.corpus import stopwords
import nltk
import scipy as sp
import numpy as np

def get_tfidf_prediction(df_squad, tfidf_vectorizer):
    """
    Identify the answer sentence as one whose TF-IDF representation has minimal distance to that of the question.

    Distance is the Euclidean (L2) norm of the difference between the question
    vector and each sentence vector.

    args:
        df_squad (pd.DataFrame) : a copy of the SQuAD dataset with "question" (str) and
            "context_sentences" (list of str) columns; it is modified in place
        tfidf_vectorizer (sklearn.feature_extraction.text.TfidfVectorizer) :
            the (already fitted) TF-IDF model to transform questions and sentences

    returns:
        pd.DataFrame : the input dataframe with one additional column, "tfidf_prediction",
            holding the index of the closest sentence for each row
    """
    y_hat = []

    for question, sentences in zip(df_squad['question'], df_squad['context_sentences']):
        # Densify once per row: a (1, n_features) question vector and an
        # (n_sentences, n_features) matrix for the candidate sentences.
        question_vec = tfidf_vectorizer.transform([question])[0].toarray()
        sentence_vecs = tfidf_vectorizer.transform(sentences).toarray()

        # Broadcasting subtracts the question from every sentence row; the
        # norm along axis 1 gives one distance per sentence.
        distances = np.linalg.norm(sentence_vecs - question_vec, axis=1)

        # np.argmin returns the first index when several sentences tie.
        y_hat.append(int(np.argmin(distances)))

    df_squad['tfidf_prediction'] = y_hat
    return df_squad

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model shared by get_tfidf_prediction: nltk word tokenization, nltk's
# English stop-word list, and unigram + bigram features.
# NOTE(review): max_df / min_df are document-frequency cut-offs; confirm the
# chosen values against the sklearn TfidfVectorizer documentation.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer = nltk.word_tokenize,
    stop_words = stopwords.words('english'),
    ngram_range = (1,2),
    max_df = 1.0,
    min_df = 10
)
# Fit on the distinct contexts only: the same paragraph appears in `df` once
# per question asked about it.
tfidf_vectorizer.fit(df["context"].unique())

In [46]:
# Sanity check: the difference of two transformed rows is still a scipy sparse
# matrix. The operands are rebuilt here from the first example so the cell
# runs on a fresh kernel; `tfidf_question`, `tfidf_context` and `j` are
# otherwise only locals inside get_tfidf_prediction.
sample_row = df.iloc[0]
tfidf_question = tfidf_vectorizer.transform([sample_row['question']])
tfidf_context = tfidf_vectorizer.transform(sample_row['context_sentences'])
j = 0
abc = tfidf_question - tfidf_context[j]
print(type(abc))

<class 'scipy.sparse._csr.csr_matrix'>


In [55]:
# Evaluate on a copy of the first 1000 rows so the prediction column is not
# written into the shared `df`.
tfidf_df = df.head(1000).copy()
tfidf_df = get_tfidf_prediction(tfidf_df, tfidf_vectorizer)
# Fraction of rows whose predicted sentence index equals the labelled one.
tfidf_accuracy = (tfidf_df['tfidf_prediction'] == tfidf_df["answer_sent_index"]).values.mean()
# Fixed precision keeps binary floating-point noise out of the printed percentage.
print(f"Accuracy of tf-idf Prediction on sentence indices: {tfidf_accuracy * 100:.1f}%")

Accuracy of tf-idf Prediction on sentence indices: 58.8%


## Sent2vec Encoders

In [6]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.6.1


In [8]:
import torch
# Scratch check of what torch.max(..., dim=1) returns: a pair of
# (row-wise maximum values, their column indices). get_sent2vec_prediction
# relies on the second element to pick the best sentence.
a = torch.tensor([[10, 11, 12]])
p, q = torch.max(a, dim = 1)
print(p, q)

tensor([12]) tensor([2])


In [9]:
def get_sent2vec_prediction(df_squad, encoder):
    """
    Identify the answer sentence as one that has the max cosine similarity with the input question.

    args:
        df_squad (pd.DataFrame) : a copy of the SQuAD dataset with "question" (str) and
            "context_sentences" (list of str) columns; it is modified in place
        encoder : sentence encoder exposing `encode(list_of_str, convert_to_tensor=True)`
            and returning one embedding row per input string (e.g. a SentenceTransformer)

    returns:
        pd.DataFrame : the input dataframe with one additional column, "sent2vec_prediction",
            holding the index (plain int) of the most similar sentence for each row
    """
    y_hat = []

    for index, row in df_squad.iterrows():
        question_embedding = encoder.encode([row['question']], convert_to_tensor=True)
        context_embedding = encoder.encode(row['context_sentences'], convert_to_tensor=True)

        # One question against n sentences: the scores are reduced along dim 1
        # and the single question row is read back with [0].
        cosine_scores = util.cos_sim(question_embedding, context_embedding)
        _, max_ind = torch.max(cosine_scores, dim = 1)

        # .item() turns the 0-dim tensor into a Python int, so the column holds
        # ordinary integers instead of tensor objects (which may live on the
        # GPU and do not compare cleanly inside pandas).
        y_hat.append(max_ind[0].item())

        # Coarse progress indicator for long runs.
        if index % 1000 == 0: print(index)

    df_squad['sent2vec_prediction'] = y_hat
    return df_squad

In [None]:
## 1. Defining the encoder
# `util` is imported here too: get_sent2vec_prediction calls util.cos_sim, so
# this cell has to run before that function is invoked.

from sentence_transformers import SentenceTransformer, util
encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
## 2. Making the prediction

# Score a private copy of the first 1000 rows; the function adds its
# prediction column to whatever frame it is given and returns that frame.
sent2vec_df = get_sent2vec_prediction(df.head(1000).copy(), encoder)

In [14]:
## 3. printing the accuracy

# Fraction of rows whose predicted sentence index equals the labelled one.
sent2vec_accuracy = (sent2vec_df['sent2vec_prediction'] == sent2vec_df["answer_sent_index"]).values.mean()
# Fixed precision: the bare float printed as 66.60000000000001% because
# accuracy * 100 is not exactly representable in binary floating point.
print(f"Accuracy of sent2vec Prediction on sentence indices: {sent2vec_accuracy * 100:.1f}%")

Accuracy of sent2vec Prediction on sentence indices: 66.60000000000001%
