## 04 - Section Relevance Score
In this notebook, we will assess the similarity of each section with the lay summary.
Then, we will calculate relevance score (average) for each named section (Abstract, Conclusion, etc)

In [80]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
import random
from collections import defaultdict
import torch

In [2]:
file_path = "../data/biolaysumm2024_data/"
file_name = "eLife_train.jsonl"

df = pd.read_json(file_path + file_name,
                 orient="records",
                 lines=True)
df

Unnamed: 0,lay_summary,article,headings,keywords,id
0,"In the USA , more deaths happen in the winter ...","In temperate climates , winter deaths exceed s...","[Abstract, Introduction, Results, Discussion, ...",[epidemiology and global health],elife-35500-v1
1,Most people have likely experienced the discom...,Whether complement dysregulation directly cont...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, immunolo...",elife-48378-v2
2,The immune system protects an individual from ...,Variation in the presentation of hereditary im...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, immunolo...",elife-04494-v1
3,The brain adapts to control our behavior in di...,Rapid and flexible interpretation of conflicti...,"[Abstract, Introduction, Results, Discussion, ...",[neuroscience],elife-12352-v2
4,Cells use motor proteins that to move organell...,Myosin 5a is a dual-headed molecular motor tha...,"[Abstract, Introduction, Results, Discussion, ...",[structural biology and molecular biophysics],elife-05413-v2
...,...,...,...,...,...
4341,To defend itself against bacteria and viruses ...,Antibodies are critical components of adaptive...,"[Abstract, Introduction, Results, Discussion, ...","[structural biology and molecular biophysics, ...",elife-61393-v2
4342,DNA is tightly packaged in a material called c...,RNA polymerase II ( PolII ) transcribes RNA wi...,"[Abstract, Introduction, Results, Discussion, ...",[chromosomes and gene expression],elife-02042-v1
4343,Associative learning is a simple learning abil...,Gagliano et al . ( Learning by association in ...,"[Abstract, Introduction, Results, Discussion, ...","[plant biology, short report]",elife-57614-v1
4344,"In 1848 , a railroad worker named Phineas Gage...",Activity in prefrontal cortex ( PFC ) has been...,"[Abstract, Introduction, Results, Discussion, ...",[neuroscience],elife-11945-v2


In [3]:
k = 120 # a random row
item = df.iloc[k]
item

lay_summary    Genome editing allows scientists to change an ...
article        The CRISPR-Cas9 targeted nuclease technology a...
headings       [Abstract, Introduction, Results and discussio...
keywords         [chromosomes and gene expression, short report]
id                                                elife-33761-v5
Name: 120, dtype: object

In [4]:
# divide by paragraphs
paras = item.article.split("\n")
len(paras)

4

In [6]:
# check with `headings`
print(len(item.headings))
print(item.headings)

4
['Abstract', 'Introduction', 'Results and discussion', 'Materials\xa0and\xa0methods']


In [7]:
len(paras) == len(item.headings)

True

In [12]:
# test sentence embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
s1 = "This is a paper about t-cell"
s2 = "A new research paper suggesting a new role for t-cell in our body"
v1 = model.encode(s1)
v2 = model.encode(s2)
v1.shape

(384,)

In [15]:
util.cos_sim(v1, v2).item() # should be close to 1

0.7994392514228821

In [16]:
v_lay_summ = model.encode(item.lay_summary)
v_lay_summ.shape

(384,)

In [21]:
for i, heading in enumerate(item.headings):
    print(i, heading)
    v_para = model.encode(paras[i])
    score = util.cos_sim(v_para, v_lay_summ).item()
    print(f"Sim score with lay summary = {score:.2f}")

0 Abstract
Sim score with lay summary = 0.77
1 Introduction
Sim score with lay summary = 0.80
2 Results and discussion
Sim score with lay summary = 0.64
3 Materials and methods
Sim score with lay summary = 0.36


In [147]:
def get_chunks(text, chunk_size=1000, overlap=100):
    """
        split a long text into chunks
    """
    if chunk_size <= overlap:
        return None
        
    i = 0
    result = []
    while i < len(text):
        result.append(text[i:i+chunk_size])
        i += (chunk_size - overlap)
        # print("New i =", i)
        
    return result

get_chunks("This is a very very long long long text", 15, 4)

['This is a very ', 'ery very long l', 'ng long long te', 'g text']

In [148]:
def get_para_embedding(text):
    """
        return embedding of a paragraph
        TODO: for long text, will get average of n chunks
    """
    result = None
    if len(text) <= 1000:
        result = model.encode(text)
    else: # long text -> split into chunks and average
        chunks = get_chunks(text)
        v_chunks = [model.encode(chk)
                    for chk in chunks
                   ]
        # print(len(v_chunks), v_chunks[0].shape)

        result = np.average(v_chunks, axis=0)
            
        # print(result.shape)
        
    return result

In [154]:
# put into a function
def get_section_score(row_id = 0):
    """
        print section score for row i in the dataset
    """
    item = df.iloc[row_id]
    item_paras = item.article.split("\n")
    v_lay_summ = model.encode(item.lay_summary)

    result = dict()
    for i, heading in enumerate(item.headings):
        # print(i, heading)
        # v_para = model.encode(item_paras[i])
        v_para = get_para_embedding(item_paras[i])
        # print(v_para.shape)
        score = util.cos_sim(v_para, v_lay_summ).item()
        # print(f"Sim score with lay summary = {score:.2f}")
        result[heading.lower()] = [score]

    return result
    

In [155]:
get_section_score(123)

{'abstract': [0.7081866264343262],
 'introduction': [0.7312494516372681],
 'results': [0.6945871710777283],
 'discussion': [0.7358657121658325],
 'materials and methods': [0.5820637345314026]}

In [156]:
# test dictionary update
a = {"intro": [1],
     "background": [2]
    }
b = {"intro": [3],
     "background": [4]
    }
a.update(b)
a

{'intro': [3], 'background': [4]}

In [168]:
random.seed(42)

n = 1000 # testing for 10 random rows
scores = defaultdict(list)
for i in range(n):
    k = random.randint(0, len(df))
    print(f"i = {i}, k = {k}")
    score = get_section_score(k)
    # print(score)
    # update results
    for k, v in score.items():
        # results.update(score)
        scores[k].extend(v)

# scores

i = 0, k = 912
i = 1, k = 204
i = 2, k = 2253
i = 3, k = 2006
i = 4, k = 1828
i = 5, k = 1143
i = 6, k = 839
i = 7, k = 712
i = 8, k = 3456
i = 9, k = 260
i = 10, k = 244
i = 11, k = 767
i = 12, k = 1791
i = 13, k = 1905
i = 14, k = 4139
i = 15, k = 217
i = 16, k = 1628
i = 17, k = 3436
i = 18, k = 1805
i = 19, k = 3679
i = 20, k = 2278
i = 21, k = 53
i = 22, k = 1307
i = 23, k = 3462
i = 24, k = 2787
i = 25, k = 2276
i = 26, k = 1273
i = 27, k = 1763
i = 28, k = 2757
i = 29, k = 837
i = 30, k = 759
i = 31, k = 3112
i = 32, k = 792
i = 33, k = 2940
i = 34, k = 2817
i = 35, k = 2166
i = 36, k = 355
i = 37, k = 3763
i = 38, k = 1022
i = 39, k = 3100
i = 40, k = 645
i = 41, k = 2401
i = 42, k = 2962
i = 43, k = 1575
i = 44, k = 569
i = 45, k = 375
i = 46, k = 1866
i = 47, k = 2370
i = 48, k = 653
i = 49, k = 1907
i = 50, k = 827
i = 51, k = 3113
i = 52, k = 2277
i = 53, k = 3714
i = 54, k = 2988
i = 55, k = 1332
i = 56, k = 3032
i = 57, k = 2910
i = 58, k = 1716
i = 59, k = 2187
i = 60, k

In [169]:
# calculate average and sd
for section in scores.keys():
    print("Section =", section)
    score_avg = np.average(scores[section])
    score_std = np.std(scores[section])
    print(f"Count = {len(scores[section])}, Average = {score_avg:.2f}, std = {score_std:.2f}")
    print("-----------")

Section = abstract
Count = 1000, Average = 0.68, std = 0.09
-----------
Section = introduction
Count = 993, Average = 0.73, std = 0.08
-----------
Section = results
Count = 922, Average = 0.62, std = 0.10
-----------
Section = discussion
Count = 927, Average = 0.69, std = 0.09
-----------
Section = materials and methods
Count = 887, Average = 0.49, std = 0.10
-----------
Section = materials and methods
Count = 64, Average = 0.49, std = 0.09
-----------
Section = material and methods
Count = 22, Average = 0.46, std = 0.08
-----------
Section = methods
Count = 5, Average = 0.58, std = 0.07
-----------
Section = results and discussion
Count = 55, Average = 0.66, std = 0.08
-----------
Section = acknowledgments
Count = 1, Average = 0.11, std = 0.00
-----------
Section = materials and methods
Count = 3, Average = 0.49, std = 0.07
-----------
Section = main text
Count = 3, Average = 0.74, std = 0.03
-----------
Section = materials
Count = 3, Average = 0.49, std = 0.05
-----------
Section = s