**Final Project**

Kevin Gardenhire (keving8)

Michael Foster ()

In [8]:
%%writefile requirements.txt

pandas==2.2.3
seaborn==0.13.2
nltk==3.9.1
iteration-utilities==0.13.0
scikit-learn==1.6.1
torch>=2.0.0
numpy>=2.2
transformers>=4.51

Overwriting requirements.txt


In [9]:
!pip install -r requirements.txt



In [10]:
#Download NLTK tokenizer
# Fetches the 'punkt' and 'punkt_tab' resources into the local nltk_data
# directory; presumably these back the word_tokenize / sent_tokenize calls
# used by the dataset-analysis cells below (confirm against the NLTK version).
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/Kevin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/Kevin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
import re
import numpy as np
from sklearn import metrics
import csv, json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.tokenize import word_tokenize, sent_tokenize
from iteration_utilities import duplicates, unique_everseen

In [17]:
#calculates EM and F1 scores.
# Characters that prep() turns into a single space. Raw string so the regex
# escapes (\[ \] \|) are not parsed as invalid Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is
# deleted by prep() (applied after lowercasing).
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Articles dropped by prep() before answers are compared.
STOPWORDS = {'a','an','the'}


def word_extraction(sentence):
    """Split a sentence into its runs of word characters.

    Args:
        sentence: text to split.

    Returns:
        List of maximal ``\\w+`` runs in order of appearance; every other
        character acts as a separator. An empty list for empty input.
    """
    # Raw pattern: "\w" in a plain string literal is an invalid escape sequence.
    return re.findall(r"\w+", sentence)


def tokenize(sentences):
    """Build the sorted vocabulary of distinct words found in ``sentences``.

    Args:
        sentences: iterable of strings.

    Returns:
        Sorted list of the unique words produced by ``word_extraction``.
    """
    vocabulary = set()
    for sentence in sentences:
        vocabulary.update(word_extraction(sentence))
    return sorted(vocabulary)


def prep(text):
    """Normalise an answer string before it is compared with another one.

    Newlines become spaces and the text is lowercased; characters matched by
    REPLACE_BY_SPACE_RE become spaces; characters matched by BAD_SYMBOLS_RE
    are removed; finally tokens listed in STOPWORDS are dropped.

    Args:
        text: raw answer string.

    Returns:
        The normalised string, tokens joined by single spaces.
    """
    lowered = text.replace('\n', ' ').lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    kept = [token for token in cleaned.split() if token not in STOPWORDS]
    return ' '.join(kept)


def calculate_f1(w1, w2):
    """Compare two strings through bag-of-words count vectors.

    A joint vocabulary is built from both strings, each string is turned into
    a vector of per-word occurrence counts over that vocabulary, and the two
    vectors are handed to ``sklearn.metrics.f1_score`` with
    ``average='macro'`` (so the counts play the role of class labels).

    Args:
        w1: reference ("true") string.
        w2: predicted string.

    Returns:
        The macro F1 value returned by scikit-learn.
    """
    vocab = tokenize([w1, w2])
    # One dictionary lookup per word instead of rescanning the whole
    # vocabulary for every word.
    position = {word: i for i, word in enumerate(vocab)}

    true_bag_vector = np.zeros(len(vocab))
    predicted_bag_vector = np.zeros(len(vocab))

    # Every extracted word is in `position` because vocab was built from the
    # same two strings with the same extractor.
    for word in word_extraction(w1):
        true_bag_vector[position[word]] += 1

    for word in word_extraction(w2):
        predicted_bag_vector[position[word]] += 1

    macro_f1 = metrics.f1_score(true_bag_vector, predicted_bag_vector, average = 'macro')

    return macro_f1

In [13]:
#Dataset analysis methods
#we are calculating:
#1. average number of words per paragraph, question, answer
#2. calculate entailment for answers and questions
#3. calculate similarities between questions and answers
#4. draw histograms/pie chart
def read_files(file_list):
    """Load question/answer pairs from tab-separated files.

    Column 0 of every row is taken as the question and column 1 as the
    answer; the answer has every ``["``, ``"]`` and remaining ``"`` removed.

    Args:
        file_list: iterable of paths to UTF-8 tab-delimited files.

    Returns:
        List of ``[question, answer]`` lists, in file order.
    """
    pairs = []

    for path in file_list:
        with open(path, "r", encoding = "utf-8") as handle:
            for row in csv.reader(handle, delimiter = "\t"):
                question = row[0]
                answer = row[1]
                for wrapper in ('["', '"]', '"'):
                    answer = answer.replace(wrapper, "")
                pairs.append([question, answer])

    return pairs


def avg_no_words(file_list, text_corpus):
    """Print length statistics for paragraphs, questions and answers.

    Questions and answers come from ``read_files(file_list)``; paragraphs are
    column 1 of the tab-delimited ``text_corpus`` file. All texts are split
    with ``word_tokenize``. The function prints the three average lengths and
    the share of questions starting with each first word, then calls
    ``draw_hist`` with the length lists and ``draw_pie`` with the
    second-word counts.

    Args:
        file_list: paths of the question/answer files.
        text_corpus: path of the tab-delimited paragraph corpus.

    Returns:
        None. Results are printed and plotted.
    """

    def _mean(counts):
        # nan instead of ZeroDivisionError when an input file is empty.
        return sum(counts) / len(counts) if counts else float("nan")

    no_words_p, no_words_q, no_words_a = [], [], []
    first_word, second_word = defaultdict(int), defaultdict(int)

    for q, a in read_files(file_list):

        words_q = word_tokenize(q)
        words_a = word_tokenize(a)

        no_words_q.append(len(words_q))
        no_words_a.append(len(words_a))

        # Empty or one-token questions have no first/second word to count;
        # indexing them unconditionally raised IndexError.
        if words_q:
            first_word[words_q[0]] += 1
        if len(words_q) > 1:
            second_word[words_q[1]] += 1

    with open(text_corpus, "r", encoding = "utf-8") as fin:
        for line in csv.reader(fin, delimiter = "\t"):
            no_words_p.append(len(word_tokenize(line[1])))

    print("Average number of words in paragraphs: {:.1f}".format(_mean(no_words_p)))
    print("Average number of words in questions: {:.1f}".format(_mean(no_words_q)))
    print("Average number of words in answers: {:.1f}".format(_mean(no_words_a)))

    # first_word is only non-empty when at least one question was read, so
    # the denominator below cannot be zero.
    for key, count in first_word.items():
        print("{}: {:.1f}%".format(key, count*100/len(no_words_q)))

    # draw histograms of par/q/a lengths
    draw_hist(no_words_p, no_words_q, no_words_a)
    draw_pie(second_word)
    
        
def draw_hist(no_words_p, no_words_q, no_words_a):
    """Save two histograms of text lengths as PDF files.

    ``no-words-qa.pdf`` overlays the question and answer length
    distributions; ``no-words-par.pdf`` shows the paragraph lengths. Both
    figures are closed after being written.

    Args:
        no_words_p: word counts of the paragraphs.
        no_words_q: word counts of the questions.
        no_words_a: word counts of the answers.
    """
    label_size, tick_size = 18, 15
    qa_bins = np.arange(25)

    # Questions vs. answers, drawn semi-transparently on shared axes.
    plt.figure(figsize=(7, 7))
    for counts, series_name in ((no_words_q, "questions"), (no_words_a, "answers")):
        plt.hist(counts, alpha = 0.5, bins = qa_bins, edgecolor = 'black', label = series_name)
    plt.xlabel('Number of words', fontsize = label_size)
    plt.ylabel('Number of questions/answers', fontsize = label_size)
    plt.xticks(fontsize = tick_size)
    plt.yticks(fontsize = tick_size)
    plt.xlim(xmin = 0, xmax = 25)
    plt.ylim(ymin = 0, ymax = 750)
    plt.legend(loc='upper right')
    plt.savefig("no-words-qa.pdf", transparent = True, bbox_inches = 'tight')
    plt.close()

    # Paragraph lengths on their own figure.
    plt.figure(figsize=(7, 7))
    plt.hist(no_words_p, bins = 25, alpha = 0.5, edgecolor = 'black', color='green')
    plt.xlabel('Number of words', fontsize = label_size)
    plt.ylabel('Number of paragraphs', fontsize = label_size)
    plt.xticks(fontsize = tick_size)
    plt.yticks(fontsize = tick_size)
    plt.xlim(xmin = 50, xmax = 220)
    plt.ylim(ymin = 0, ymax = 2100)
    plt.savefig("no-words-par.pdf", transparent = True, bbox_inches = 'tight')
    plt.close()


def draw_pie(second_word):
    """Save a pie chart of the most frequent second words of the questions.

    The five words with the highest counts get their own slice; the counts of
    all remaining words are summed into a final "others" slice. The chart is
    written to ``second-word-qns.pdf`` and the figure is closed.

    Args:
        second_word: mapping from word to number of occurrences.
    """
    ranked = sorted(second_word.items(), reverse = True, key = lambda item: item[1])
    leading, remainder = ranked[:5], ranked[5:]

    labels = [word for word, _ in leading]
    values = [count for _, count in leading]

    # Everything beyond the top five is folded into a single slice.
    labels.append("others")
    values.append(sum(count for _, count in remainder))

    colors = sns.color_palette("husl", 6)
    plt.figure(figsize=(9, 9))
    plt.pie(values, labels = labels, colors = colors, autopct = '%.0f%%', textprops = {'fontsize': 18})
    plt.savefig("second-word-qns.pdf", transparent = True, bbox_inches = 'tight')
    plt.close()
    

def calculate_entailment(file_list):
    """Print how many questions/answers are tied to a repeated counterpart.

    For every answer string that occurs more than once in the data, the
    distinct questions paired with it are collected; likewise, for every
    question string that occurs more than once, the distinct answers paired
    with it are collected. The printed numbers are the total sizes of those
    collections.

    Args:
        file_list: paths of the question/answer files read by ``read_files``.

    Returns:
        None. The two totals are printed.
    """
    ent_a, ent_q = defaultdict(set), defaultdict(set)

    q_a = read_files(file_list)
    # Sets give constant-time membership tests in the loop below; the values
    # are only used for membership, so de-duplication order is irrelevant.
    dup_q = set(duplicates([item[0] for item in q_a]))
    dup_a = set(duplicates([item[1] for item in q_a]))

    for q, a in q_a:
        if a in dup_a:
            ent_q[a].add(q)
        if q in dup_q:
            ent_a[q].add(a)

    print("Number of question entailment: {}".format(sum(len(group) for group in ent_q.values())))
    print("Number of answer entailment: {}".format(sum(len(group) for group in ent_a.values())))
   

def calculate_qa_sim(test_json):
    """Print the mean F1 between questions and their answer-bearing sentences.

    For each JSON object, the first answer and the text of the first positive
    context are taken. Only passages that contain the answer exactly once are
    used; every sentence of such a passage that contains the answer is scored
    against the question with ``calculate_f1``.

    Args:
        test_json: path to a JSON file holding a list of objects with
            ``question``, ``answers`` and ``positive_ctxs`` entries.

    Returns:
        None. The mean score is printed (``nan`` when nothing was scored).
    """
    counter = 0
    f1 = []

    with open(test_json, "r", encoding = "utf-8") as fin:
        objs = json.load(fin)

    for obj in objs:
        q = obj['question']
        a = obj['answers'][0]

        # Best effort: skip objects whose positive context is missing or
        # malformed, without also hiding unrelated errors (or Ctrl-C) the way
        # a bare `except:` would.
        try:
            p = obj['positive_ctxs'][0]['text']
            occurrences = p.count(a)
        except (KeyError, IndexError, TypeError, AttributeError):
            continue

        if occurrences != 1:
            continue

        counter += 1

        # NOTE(review): the loop stops when the 1000th matching passage is
        # reached, before scoring it, so at most 999 passages contribute.
        # Kept as-is to preserve the reported number; confirm it is intended.
        if counter == 1000:
            break

        for sentence in sent_tokenize(p):
            if a in sentence:
                f1.append(calculate_f1(q, sentence))

    mean_f1 = sum(f1)/len(f1) if f1 else float("nan")
    print("Similarities between questions and answers (f1): {:.2f}".format(mean_f1))

In [18]:
#calculate inter-annotator agreement for label collection process and for extrinsic evaluation.
def labels_agreement(agreement_file):
    """Print exact-match and F1 agreement between annotators.

    The CSV rows are ``item number, answer, is_main``. For every item with a
    main answer (``is_main == "TRUE"``), that answer is compared through
    ``calculate_f1`` with each of the item's other answers
    (``is_main == "FALSE"``), after normalising both with ``prep``. The best
    score per item is kept; an item counts as an exact match when that best
    score equals 1.

    Items that do not have exactly five answers are printed as a warning.
    Every available other answer is compared, so an item with fewer than five
    answers no longer raises IndexError; an item with no other answer at all
    is skipped.

    Args:
        agreement_file: path to the comma-separated agreement file.

    Returns:
        None. Mean EM and F1 are printed (``nan`` when no item was scored).
    """
    f1, em = [], []

    main_answers = defaultdict(list)
    other_answers = defaultdict(list)

    # Single pass over the file; mains and others are merged afterwards so
    # the main answer always comes first, wherever it appears in the file.
    with open(agreement_file, "r", encoding = "utf-8") as fin:
        for line in csv.reader(fin):
            # no#answer#is_main
            if line[2] == "TRUE":
                main_answers[int(line[0])].append(prep(line[1]))
            elif line[2] == "FALSE":
                other_answers[int(line[0])].append(prep(line[1]))

    for key, mains in main_answers.items():

        answers = mains + other_answers.get(key, [])

        if len(answers) != 5:
            print(answers)

        reference, candidates = answers[0], answers[1:]
        if not candidates:
            # Nothing to compare the main answer with.
            continue

        max_f1 = max(calculate_f1(reference, candidate) for candidate in candidates)

        f1.append(max_f1)
        em.append(1 if max_f1 == 1 else 0)

    mean_em = sum(em)/len(em) if em else float("nan")
    mean_f1 = sum(f1)/len(f1) if f1 else float("nan")
    print("em: {:.2f}".format(mean_em))
    print("f1: {:.2f}".format(mean_f1))
    


In [15]:
# Driver cell for the dataset analysis. It relies on labels_agreement,
# avg_no_words, calculate_entailment and calculate_qa_sim having been defined
# by earlier cells in the current kernel session.

# comma-separated rows: item number, answer, is_main (read by labels_agreement)
agreement_file = "data/agreement/labels_agreement.csv"
    
# tab-separated question/answer files (column 0 = question, column 1 = answer)
file_list = ["data/training/sleep-train.csv", 
                "data/training/sleep-dev.csv", 
                "data/training/sleep-test.csv"]
    
# tab-separated corpus; avg_no_words reads the paragraph text from column 1
text_corpus = "data/training/sleep-corpus.tsv"
    
# JSON list of objects with 'question', 'answers' and 'positive_ctxs' entries
test_json = "data/training/sleep-train.json"
  
# calculate em/f1 for inter-annotators agreement on labels
labels_agreement(agreement_file)
    
# calculate avg. number of words in paragraphs/questions/answers
avg_no_words(file_list, text_corpus)
    
# calculate question/answer entailment
calculate_entailment(file_list)
    
# calculate similarities between questions and answers
calculate_qa_sim(test_json)

em: 0.85
f1: 0.91
Average number of words in paragraphs: 120.6
Average number of words in questions: 9.9
Average number of words in answers: 10.3
what: 68.4%
how: 16.8%
why: 5.9%
who: 2.9%
when: 4.9%
where: 1.0%
Number of question entailment: 222
Number of answer entailment: 149
Similarities between questions and answers (f1): 0.17


In [4]:
import sys 
sys.path.append("../utils")
sys.path.append("../DPR-main/")

#from f1_score import calculate_f1

import csv, pickle, torch, time
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer, 
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer, 
    DPRReader, DPRReaderTokenizer
    )

from faiss_indexers import DenseFlatIndexer


def iterate_encoded_files(file):
    """Yield the records stored in a pickled collection, one list per record.

    Args:
        file: path to a pickle file containing an iterable of records.

    Yields:
        Each record converted to a ``list``.
    """
    print("Reading file {}".format(file))
    with open(file, "rb") as reader:
        # NOTE(review): pickle.load can execute arbitrary code while
        # deserialising; only point this at files produced by this pipeline.
        for record in pickle.load(reader):
            yield list(record)


class LocalFaissRetriever():
    """Thin wrapper around a ``DenseFlatIndexer`` for one-shot retrieval.

    The index is released by ``get_top_docs``, so an instance can serve a
    single search; build a new retriever for each further search.
    """

    def __init__(self):
        """Create the underlying index, initialised with size 768."""
        # presumably 768 is the width of the encoder embeddings (TODO confirm)
        self.index = DenseFlatIndexer()
        self.index.init_index(768)

    def index_encoded_data(self, file, buffer_size):
        """Feed the records of a pickled encodings file into the index.

        Args:
            file: path handed to ``iterate_encoded_files``.
            buffer_size: number of records per ``index_data`` call; a value
                of 0 or less sends everything in one final call.
        """
        pending = []
        for record in iterate_encoded_files(file):
            pending.append(record)
            if buffer_size > 0 and len(pending) == buffer_size:
                self.index.index_data(pending)
                pending = []
        # Whatever is left over (possibly nothing) goes in as the last chunk.
        self.index.index_data(pending)
        print("Data indexing completed.")

    def get_top_docs(self, query_vectors, top_docs):
        """Search the index, report the elapsed time, then drop the index.

        Args:
            query_vectors: queries passed to ``search_knn``.
            top_docs: number of neighbours requested per query.

        Returns:
            Whatever ``search_knn`` returns.
        """
        started = time.time()
        hits = self.index.search_knn(query_vectors, top_docs)
        elapsed = time.time() - started
        print("index search time: {} sec.".format(elapsed))
        # The index is discarded here, so this method works only once.
        self.index = None
        return hits


def validate_retriever(questions, retrieved_passages):
    """Count retrieved passages that contain the gold answer and print it.

    Args:
        questions: tab-delimited file; column 0 is the question, column 1 the
            gold answer (leading/trailing quote and bracket characters are
            stripped).
        retrieved_passages: tab-delimited file; column 0 is the question and
            column 2 the retrieved passage text.

    Returns:
        None. The number of hits is printed.
    """
    # gold answers keyed by question
    with open(questions, "r", encoding = "utf-8") as fin:
        qa_dic = {}
        for row in csv.reader(fin, delimiter = "\t"):
            question, raw_answer = row[0], row[1]
            qa_dic[question] = raw_answer.strip('"["').strip('"]"')

    # a hit is a retrieved passage that contains the gold answer verbatim
    hits = 0
    with open(retrieved_passages, "r", encoding = "utf-8") as fin:
        for row in csv.reader(fin, delimiter = "\t"):
            question, text = row[0], row[2]
            if qa_dic[question] in text:
                hits += 1

    print("Top k documents hits: {}".format(hits))


def validate_reader(questions, span_answers):
    """Score extracted answer spans against the gold answers and print EM/F1.

    Args:
        questions: tab-delimited file; column 0 is the question, column 1 the
            gold answer (quote/bracket characters stripped from both ends,
            full stops removed).
        span_answers: tab-delimited file; column 0 is the question, column 1
            the predicted span (stripped, full stops removed, ``" %"``
            collapsed to ``"%"``).

    Returns:
        None. Mean exact match and mean F1 are printed.
    """
    qa_dic = {}
    with open(questions, "r", encoding = "utf-8") as fin:
        for row in csv.reader(fin, delimiter = "\t"):
            question = row[0]
            gold = row[1].strip('"["').strip('"]"')
            qa_dic[question] = gold.replace(".", "")

    f1 = []
    with open(span_answers, "r", encoding = "utf-8") as fin:
        for row in csv.reader(fin, delimiter = "\t"):
            question = row[0]
            predicted = row[1].strip().replace(".", "").replace(" %", "%")
            f1.append(calculate_f1(qa_dic[question], predicted))

    # an answer is an exact match when its F1 is exactly 1
    em = [1 if score == 1 else 0 for score in f1]

    print("em: {:.2f}, f1: {:.2f}".format(sum(em)/len(em), sum(f1)/len(f1)))
    
    
############################################## CONTEXT #############################################################
# facebook/dpr-ctx_encoder-single-nq-base
def generate_dense_encodings(text_corpus, ctx_encoder, out_file):
    """Encode every corpus passage with a DPR context encoder and pickle it.

    Each row of the tab-delimited corpus provides an id (column 0), a passage
    (column 1) and a title (column 2). ``title + "[SEP]" + passage`` is
    tokenised to a fixed length, encoded, and the pooled embedding is stored
    together with the id.

    Args:
        text_corpus: path to the tab-delimited corpus file.
        ctx_encoder: name or path accepted by ``from_pretrained`` for both
            the context tokenizer and the context encoder.
        out_file: path of the pickle file to write; it receives a list of
            ``(sample_id, numpy_vector)`` tuples.

    Returns:
        None.
    """
    max_length = 256
    results = []

    tokenizer = DPRContextEncoderTokenizer.from_pretrained(ctx_encoder)
    encoder = DPRContextEncoder.from_pretrained(ctx_encoder)

    with open(text_corpus, encoding = "utf-8") as fin:
        reader = csv.reader(fin, delimiter="\t")
        for total, row in enumerate(reader, start = 1):
            sample_id = str(row[0])
            passage = row[1].strip('"')
            title = row[2]

            tokens = tokenizer(title + "[SEP]" + passage, return_tensors="pt", max_length = max_length,
                               padding='max_length', truncation = True)["input_ids"]

            # add 102 in the end of padding
            # (presumably the [SEP] token id -- confirm against tokenizer.sep_token_id)
            tokens[0][max_length - 1] = 102

            # Inference only: without no_grad an autograd graph is built and
            # kept for every passage.
            with torch.no_grad():
                embeddings = encoder(tokens).pooler_output
            results.append((sample_id, embeddings[0,:].detach().numpy()))

            if(total % 10 == 0):
                print("Encoded {} passages.".format(total))

    with open(out_file, mode = "wb") as f:
        pickle.dump(results, f)

    print("Total passages processed {}. Written to {}".format(len(results), out_file))


############################################## QUESTION #############################################################
# facebook/dpr-question_encoder-single-nq-base
def dense_retriever(questions, question_encoder, text_corpus, corpus_embedded, retrieved_passages):
    """Retrieve the top passage for every question and write the results.

    Questions (column 0 of the tab-delimited ``questions`` file) are encoded
    with a DPR question encoder and searched in batches against the pickled
    corpus encodings. For each question one line
    ``question<TAB>title<TAB>passage`` is written to ``retrieved_passages``.

    Questions left over after the last full batch are searched as well;
    previously they were silently dropped whenever the number of questions
    was not a multiple of the batch size.

    Args:
        questions: path to the tab-delimited questions file.
        question_encoder: name or path accepted by ``from_pretrained`` for
            the question tokenizer and encoder.
        text_corpus: tab-delimited corpus (id, passage, title) used to map
            retrieved ids back to text.
        corpus_embedded: pickle file with the corpus encodings.
        retrieved_passages: output path.

    Returns:
        None.
    """
    index_buffer_sz = 50000
    batch_size = 100
    max_length = 256
    par_dic = {}

    with open(text_corpus, "r", encoding = "utf-8") as fin:
        reader = csv.reader(fin, delimiter="\t")
        for row in reader:
            sample_id = str(row[0])
            passage = row[1].strip('"')
            title = row[2]

            par_dic[sample_id] = (title, passage)

    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_encoder)
    model = DPRQuestionEncoder.from_pretrained(question_encoder)

    def _retrieve_and_write(fout, batch_vectors, batch_questions):
        # Search one batch of question vectors and write its best passages.
        # A fresh retriever is built each time because get_top_docs releases
        # its index after a single search.
        retriever = LocalFaissRetriever()
        retriever.index_encoded_data(corpus_embedded, index_buffer_sz)

        stacked = torch.stack(batch_vectors)
        top_results_and_scores = retriever.get_top_docs(stacked.detach().numpy(), 1)

        for question, result in zip(batch_questions, top_results_and_scores):
            par_id = result[0][0]
            title, passage = par_dic[par_id]
            fout.write("{}\t{}\t{}\n".format(question, title, passage))

    questions_embedded = []
    questions_list = []

    with open(retrieved_passages, "w", encoding = "utf-8") as fout:
        with open(questions, encoding = "utf-8") as fin:
            reader = csv.reader(fin, delimiter="\t")
            for total, row in enumerate(reader, start = 1):
                question = row[0].strip('"')
                questions_list.append(question)

                tokens = tokenizer(question, return_tensors="pt", max_length = max_length,
                                   padding='max_length', truncation = True)["input_ids"]
                # add 102 in the end of padding
                # (presumably the [SEP] token id -- confirm against tokenizer.sep_token_id)
                tokens[0][max_length - 1] = 102

                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    embeddings = model(tokens).pooler_output

                questions_embedded.append(embeddings[0,:])

                if(total % 10 == 0):
                    print("Encoded {} questions.".format(total))

                if(total % batch_size == 0):
                    _retrieve_and_write(fout, questions_embedded, questions_list)
                    questions_embedded, questions_list = [], []

        # Flush the final, partially filled batch.
        if questions_embedded:
            _retrieve_and_write(fout, questions_embedded, questions_list)
                 

# ############################################### READER ##############################################################
# facebook/dpr-reader-single-nq-base
def extractive_reader(retrieved_passages, reader, span_answers):
    """Extract an answer span from every retrieved passage with a DPR reader.

    Each row of ``retrieved_passages`` (tab-delimited: question, title,
    passage text) is run through the reader model; the text of the best span
    is written to ``span_answers`` as ``question<TAB>span``.

    Args:
        retrieved_passages: path to the tab-delimited retrieval output.
        reader: name or path accepted by ``from_pretrained`` for the reader
            tokenizer and model.
        span_answers: output path.

    Returns:
        None.
    """
    tokenizer = DPRReaderTokenizer.from_pretrained(reader)
    model = DPRReader.from_pretrained(reader)

    with open(span_answers, "w", encoding = "utf-8") as fout, \
         open(retrieved_passages, "r", encoding = "utf-8") as fin:
        # Named `rows` so the `reader` parameter (the model path) is not shadowed.
        rows = csv.reader(fin, delimiter="\t")

        for total, row in enumerate(rows, start = 1):
            question = row[0].strip('"')
            title = row[1].strip('"')
            text = row[2].strip('"')
            encoded_inputs = tokenizer(question, title, text, return_tensors = "pt",
                                       max_length = 300, padding = 'max_length', truncation = True)

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                outputs = model(**encoded_inputs)
            predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs, max_answer_length = 20, num_spans = 1, num_spans_per_passage = 1)

            fout.write("{}\t{}\n".format(question, predicted_spans[0].text))

            if(total % 10 == 0):
                print("Extracted spans for {} questions.".format(total))
             

# Pipeline driver: the encoding, retrieval and retriever-validation steps are
# commented out, so this run reuses the files they produced earlier and only
# executes the reader and its evaluation.
if __name__ == "__main__":

    # inputs: paragraph corpus and the question/answer test split
    text_corpus = "data/training/sleep-corpus.tsv"
    questions = "data/training/sleep-test.csv"
    
    # local model directories passed to the from_pretrained calls
    ctx_encoder = "models/pytorch/ctx_encoder/"
    question_encoder = "models/pytorch/question_encoder/"
    reader = "models/pytorch/reader"
    
    # intermediate and final artefacts of the pipeline
    corpus_embedded = "models/processed/sleep-corpus_e29"
    retrieved_passages = "models/processed/sleep_test_e29.csv"
    span_answers = "models/processed/pipeline1_label_1.250.csv"
    
    #generate_dense_encodings(text_corpus, ctx_encoder, corpus_embedded)
    
    #dense_retriever(questions, question_encoder, text_corpus, corpus_embedded, retrieved_passages)

    #validate_retriever(questions, retrieved_passages)
 
    extractive_reader(retrieved_passages, reader, span_answers)
    
    # NOTE: validate_reader calls calculate_f1, which this cell neither defines
    # nor imports (the f1_score import above is commented out). The cell that
    # defines calculate_f1 must have been run in the same kernel session,
    # otherwise this line raises NameError.
    validate_reader(questions, span_answers)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRReaderTokenizer'.


Extracted spans for 10 questions.
Extracted spans for 20 questions.
Extracted spans for 30 questions.
Extracted spans for 40 questions.
Extracted spans for 50 questions.
Extracted spans for 60 questions.
Extracted spans for 70 questions.
Extracted spans for 80 questions.
Extracted spans for 90 questions.
Extracted spans for 100 questions.
Extracted spans for 110 questions.
Extracted spans for 120 questions.
Extracted spans for 130 questions.
Extracted spans for 140 questions.
Extracted spans for 150 questions.
Extracted spans for 160 questions.
Extracted spans for 170 questions.
Extracted spans for 180 questions.
Extracted spans for 190 questions.
Extracted spans for 200 questions.
Extracted spans for 210 questions.
Extracted spans for 220 questions.
Extracted spans for 230 questions.
Extracted spans for 240 questions.
Extracted spans for 250 questions.
Extracted spans for 260 questions.
Extracted spans for 270 questions.
Extracted spans for 280 questions.
Extracted spans for 290 quest

NameError: name 'calculate_f1' is not defined