This file is for evaluating the coverage rate of generated Q&As to paper. We hope 10 answers should roughly cover the majority of the content of the paper).

We calculate similarity score between each answer vs. each paper sentence. ious)

We use e5-mistral-7b-instruct (paper: https://arxiv.org/pdf/2401.00368.pdf) as embedding model. It utilizes the power of LLM to capture rich contextual information of natural language. Also, it offers a more contextual-based similarity measurement through task-defining by using pro.

```jsx
task = 'Given the Question & Answer pair, retrieve relevant sentences to this answer.'
queries = [get_detailed_instruct(task, f"{i['Q']}\n{i['A']}") for i in data['Q&A']]
documents = split_into_sentences(data['txt'])
```

By using ‘task’ as part of query, we strengthen that we want to measure the similarity between answer and paper sentence from the aspect of how much a paper sentence is related with this answer.

In [1]:
# read file for evaluation, the file is a list of dict with keys shown in following
# please enter your file path at 'file_path' variable
import json
def read(file_path):
    with open(file_path, 'r') as file:
        datas = json.load(file) 
    return datas

file_path = '/fl/data/acl/acl_eval_final_vicuna16k_all.json'
datas = read(file_path)
datas[0].keys(), len(datas)

(dict_keys(['source', 'doi', 'abstract', 'keywords', 'txt', 'title', 'num_Q&A', 'Q&A', 'raw_output']),
 105)

### Calculation

In [2]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


# get embedding value of the last layer
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'


tokenizer = AutoTokenizer.from_pretrained('/fl/model/mixtral/e5-mistral-7b-instruct')
model = AutoModel.from_pretrained('/fl/model/mixtral/e5-mistral-7b-instruct')
model.eval()

# max length should be the max token length of both queries and documents
max_length = 1028

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████| 2/2 [00:08<00:00,  4.38s/it]


In [6]:
import re
import gc
import time

# split paper to sentence
def split_into_sentences(text):
    # Regular expression for splitting sentences
    sentence_endings = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    sentences = re.split(sentence_endings, text)
    
    # Filter out any empty strings
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences


# sort paper sentences' similarity score from high to low of each Q&A, recode both paper sentences' index and similarity score
def sort_and_reformat_2d_list(list_2d):
    sorted_2d_list = []
    for sublist in list_2d:
        # Sorting each sublist with scores in descending order and keeping track of original indices
        sorted_sublist = sorted(
            [(index, score) for index, score in enumerate(sublist)], 
            key=lambda x: x[1], 
            reverse=True
        )
        sorted_2d_list.append(sorted_sublist)
    return sorted_2d_list


# main function to calculate similarity score and record results into json file
def para_div_mistral(data, index):
    fn = data['doi'].replace('/', '-')

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given the Question & Answer pair, retrieve relevant sentences to this answer.'
    queries = [get_detailed_instruct(task, f"{i['Q']}\n{i['A']}") for i in data['Q&A']]
    documents = split_into_sentences(data['txt'])
    
    m = len(queries)
    n = len(documents)
    
    input_texts = queries + documents
    
    # Tokenize the input texts
    batch_dict = tokenizer(input_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
    # append eos_token_id to every input_ids
    batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
    batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    
    # normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    scores = (embeddings[:m] @ embeddings[m:].T) * 100
    
    similarity_dict = sort_and_reformat_2d_list(scores.tolist())
    store = {idx:str(i) for idx, i in enumerate(similarity_dict)}
    with open(f'{pth}/q_paragraph_sim/{fn}.json', 'w') as json_file:
        json.dump(store, json_file, indent=4)

    del batch_dict
    del outputs
    del embeddings
    del scores
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [9]:
from pathlib import Path
# path to record results
pth = 'paragraph diversity'

Path(f'{pth}/q_paragraph_sim').mkdir(parents=True)

for index, i in enumerate(datas):
    print(index)
    para_div_mistral(i, index)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104


### Visualization

In [10]:
import json
import ast
import numpy as np
from scipy.stats import entropy

colors = ['#FF7F50', '#40E0D0', '#6495ED', '#D8BFD8', '#98FB98', '#F0E68C', '#87CEFA', '#DDA0DD', '#7FFFD4', '#DEB887', '#5F9EA0', '#FFDAB9', '#7B68EE', '#F4A460', '#F08080']
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    data = {int(i): ast.literal_eval(data[i]) for i in data}    
    return data


# transform data from [qa, sentence] to [sentence, qa]
def transform_data(file_path):
    data = load_data(file_path)
    # Find the number of paragraphs and Q&As
    paragraph_count = len(data[0])
    qa_count = len(data)

    # Initialize a 2D array for storing similarity scores
    similarity_matrix = np.zeros((paragraph_count, qa_count))

    # Populate the similarity matrix
    for qa_index, pairs in data.items():
        for pair in pairs:
            paragraph_index, sim_score = pair
            similarity_matrix[int(paragraph_index), int(qa_index)] = sim_score

    return similarity_matrix


# std evaluation
def calculate_metrics(similarity_matrix, save_pth):
    std_per_paragraph = np.std(similarity_matrix, axis=1)
    # entropy_per_paragraph = [entropy(similarity_matrix[i, :]) for i in range(len(std_per_paragraph))]

    with open(save_pth, 'w') as file:
        for s in std_per_paragraph:
            file.write(str(s) + '\n')
        
    return std_per_paragraph

In [11]:
# scatter plot: show the top 10 sentences’ similarity score for each answer
def scatter_vis(similarity_dict, save_pth):
    idx = []
    for i in similarity_dict:
        idx.append([j[0] for j in similarity_dict[i][:10]])

    # plt.figure(figsize=(7, 4))
    for i, indices in enumerate(idx):
        y_values = [i+1] * len(indices)  # Same y-value for each list
        plt.scatter(indices, y_values, color=colors[i], label=f'QA {i+1}')

    plt.xlabel('Sentence Index')
    plt.xlim(0, len(similarity_dict[0]))
    plt.ylabel('QA Index')
    plt.yticks(range(1, i+2, 1))
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    # plt.show()
    plt.savefig(save_pth)
    plt.close()


In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

# heatmap plot: show all sentences’ similarity score for each answer 
def heatmap_vis(similarity_matrix, save_pth):
    plt.figure(figsize=(20, 3))
    sns.heatmap(similarity_matrix.T, annot=False, cmap="coolwarm")
    plt.title("Heatmap of Paragraph-Q&A Similarity Scores")
    plt.xlabel("Sentence Index")
    plt.ylabel("Q&A Index")
    # plt.show()
    plt.tight_layout()
    plt.savefig(save_pth)
    plt.close()

In [13]:
from tqdm.notebook import tqdm

from pathlib import Path
Path(f'{pth}/fig').mkdir(parents=True)
Path(f'{pth}/std').mkdir(parents=True)

# main function to generate plots & std scores and record them into files
for i in range(len(datas)):
    data = datas[i]
    fn = data['doi'].replace('/', '-')
    path = f'{pth}/q_paragraph_sim/{fn}.json'

    scatter_pth = f'{pth}/fig/{fn}_scatter.png'
    similarity_dict = load_data(path)
    scatter_vis(similarity_dict, scatter_pth)

    heatmap_pth = f'{pth}/fig/{fn}_heatmap.png'
    similarity_matrix = transform_data(path)
    heatmap_vis(similarity_matrix, heatmap_pth)

    std_pth = f'{pth}/std/{fn}.txt'
    std = calculate_metrics(similarity_matrix, std_pth)
    # print(std)

In [59]:
def chunk_rate(sents, topk):
    num_chunks = 10
    sents_per_chunk = sents / num_chunks
    chunks_hit = set()
    for index in set(topk):
        chunk_index = int(index // sents_per_chunk)
        chunks_hit.add(chunk_index)

    num_chunks_hit = len(chunks_hit)
    return num_chunks_hit/num_chunks

In [60]:
coverage_list = []
for i in range(len(datas[:100])):
    data = datas[i]
    fn = data['doi'].replace('/', '-')
    path = f'{pth}/q_paragraph_sim/{fn}.json'
    sents = len(similarity_dict[0])  # This line is corrected to reflect the question's context
    k = int(sents*0.15)

    scores = [item for i in similarity_dict for item in similarity_dict[i]]
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    sent_idx = [d[0] for d in sorted_scores]

    top15 = set(sent_idx[:k])
    idx = k
    while len(top15)<k:
        top15.add(sent_idx[idx])
        idx += 1
    num_chunks_hit = chunk_rate(sents, top15)
    
    # intersection
    top15 = []
    individual_chunk_rate = []
    similarity_dict = load_data(path)
    for a in similarity_dict:
        sent_idx = [c[0] for c in similarity_dict[a][:k]]
        top15 += sent_idx
        individual_chunk_rate.append(chunk_rate(sents, sent_idx))
    
    coverage_list.append({'doi': data['doi'], 'overall chunk rate': num_chunks_hit,
                         'individual chunk rate': sum(individual_chunk_rate)/len(individual_chunk_rate), 'individual chunk rate list': individual_chunk_rate})

In [63]:
with open(f'{pth}coverage_list.json', 'w') as json_file:
    json.dump(coverage_list, json_file, indent=4)

In [62]:
import numpy as np
avg_chunk_rate = np.mean([i['overall chunk rate'] for i in coverage_list])
avg_idv_chunk_rate = np.mean([i['individual chunk rate'] for i in coverage_list])
avg_chunk_rate, avg_idv_chunk_rate

(0.6799999999999998, 0.6423000000000001)

In [64]:
pth

'paragraph diversity'