In [None]:
!pip install vllm==0.3.3

Collecting vllm==0.3.3
  Downloading vllm-0.3.3-cp310-cp310-manylinux1_x86_64.whl (44.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from vllm==0.3.3)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting ray>=2.9 (from vllm==0.3.3)
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.1.2 (from vllm==0.3.3)
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting xformers==0.0.23.post1 (from vllm==0.3.3)
  Downloa

In [61]:
from vllm import LLM
from vllm import SamplingParams
import pandas as pd

In [62]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:
import pandas as pd

train_filepath = "/content/drive/MyDrive/biolaysumm2024_data/"
train_plos_filename = "PLOS_train.jsonl"
train_elife_filename = "eLife_train.jsonl"

def read_jsonl(filepath, filename):
    df = pd.read_json(filepath + filename, orient="records", lines=True)
    return df

In [64]:
train_plos_df = read_jsonl(train_filepath, train_plos_filename)
train_elife_df = read_jsonl(train_filepath, train_elife_filename)

mini_plos_df = read_jsonl(mini_dev_filepath, mini_dev_plos_filename)
mini_elife_df = read_jsonl(mini_dev_filepath, mini_dev_elife_filename)

# full_plos_df = read_jsonl(full_dev_filepath, full_dev_plos_filename)
# full_elife_df = read_jsonl(full_dev_filepath, full_dev_elife_filename)

test_plos_df = read_jsonl(test_filepath, test_plos_filename)
test_elife_df = read_jsonl(test_filepath, test_elife_filename)

In [65]:
import pandas as pd
import random

random.seed(42)
torch.manual_seed(42)

plos_sample = train_plos_df.sample(5)
elife_sample = train_elife_df.sample(5)

combined_samples = pd.concat([plos_sample, elife_sample]).sample(frac=1).reset_index(drop=True)

print(combined_samples)

                                         lay_summary  \
0  Embryonic stem cells and induced pluripotent s...   
1  During an animal’s lifetime , many of its cell...   
2  Parasitic helminths are inducers of chronic di...   
3  Genes encode instructions to make proteins and...   
4  Rough-skinned newts produce tetrodotoxin or TT...   
5  DNA carries the genetic information that is es...   
6  Interferon Regulatory Factor 3 ( IRF3 ) is an ...   
7  Inside cells , proteins are produced by comple...   
8  Plasmacytoid dendritic cells ( pDC ) are innat...   
9  Gene action determines how mutations affect ph...   

                                             article  \
0  Embryonic stem cells and induced pluripotent s...   
1  Mesenchymal ( lamellipodial ) migration is het...   
2  Alveolar echinococcosis , caused by Echinococc...   
3  Errors during transcription may play an import...   
4  Rough-skinned newts ( Taricha granulosa ) use ...   
5  The Dna2 nuclease-helicase maintains genomic

In [66]:
def build_text_with_headings(item):
    """
    return the article text with heading embedded
    """

    result = ""
    paras = item["article"].split("\n")
    keywords = ', '.join(item["keywords"])
    keywords = f"## Keywords: {keywords}"
    result += keywords + "\n"

    headings = item["headings"]
    if len(paras) != len(headings):
        print("Error, not matching length")
        return item["article"]
    else:
        for (heading, paragraph) in zip(headings, paras):
            result += f"## {heading}\n{paragraph}\n\n"

    return result

In [67]:
import pandas as pd

def get_unique_headings(df, headings_column):
    all_headings = [heading for sublist in df[headings_column] for heading in sublist]
    unique_headings = set(all_headings)
    return unique_headings

unique_headings = get_unique_headings(combined_samples, 'headings')

print("Unique Headings:", unique_headings)

Unique Headings: {'Materials and methods', 'Results and discussion', 'Introduction', 'Conclusions', 'Materials and Methods', 'Abstract', 'Results', 'Results and Discussion', 'Discussion'}


In [68]:
combined_samples['processed_text'] = combined_samples.apply(lambda row: build_text_with_headings(row), axis=1)

In [69]:
def split_into_sections(text, unique_headings):
    unique_headings = {heading.replace('\xa0', ' ') for heading in unique_headings}

    section_dict = {heading: "" for heading in unique_headings}

    sections = text.split("\n\n")
    current_heading = None

    for section in sections:
        section_clean = section.strip().replace('\xa0', ' ')
        for heading in unique_headings:
            heading_marker = f"## {heading}"
            if section_clean.startswith(heading_marker):
                current_heading = heading
                section_text = section_clean[len(heading_marker):].strip()
                section_dict[heading] += section_text + "\n\n"
                break
        else:
            if current_heading:
                section_dict[current_heading] += section + "\n\n"

    return section_dict

In [70]:
def split_text_by_headings(text):
    lines = text.split("\n")
    split_sections = {}
    current_heading = None
    section_content = []

    for line in lines:
        if line.startswith("##"):
            if current_heading is not None:
                split_sections[current_heading] = "\n".join(section_content).strip()
                section_content = []

            current_heading = line[2:].strip()
        else:
            section_content.append(line)

    if current_heading is not None and section_content:
        split_sections[current_heading] = "\n".join(section_content).strip()

    return split_sections


In [71]:
combined_samples['split_sections'] = combined_samples['processed_text'].apply(split_text_by_headings)

unique_headings = {'Abstract', 'Introduction', 'Results', 'Discussion', 'Methods', 'Materials and Methods', 'Results and Discussion', 'Materials and methods'}

for heading in unique_headings:
    combined_samples[heading] = combined_samples['split_sections'].apply(lambda sections: sections.get(heading, ""))

In [72]:
combined_samples

Unnamed: 0,lay_summary,article,headings,keywords,id,processed_text,split_sections,Materials and methods,Methods,Introduction,Materials and Methods,Abstract,Results,Results and Discussion,Discussion
0,Embryonic stem cells and induced pluripotent s...,Embryonic stem cells and induced pluripotent s...,"[Abstract, Introduction, Results, Discussion, ...","[mechanisms of signal transduction, cell diffe...",journal.pgen.1003112,## Keywords: mechanisms of signal transduction...,"{'Keywords: mechanisms of signal transduction,...",,,Embryonic stem cells and induced pluripotent s...,ES cells were generally maintained in NDiff N2...,Embryonic stem cells and induced pluripotent s...,To identify the programme of genes involved in...,,The derivation of pluripotent iPS cells and th...
1,"During an animal’s lifetime , many of its cell...",Mesenchymal ( lamellipodial ) migration is het...,"[Abstract, Introduction, Results, Discussion, ...","[computational and systems biology, cell biolo...",elife-11384-v1,## Keywords: computational and systems biology...,"{'Keywords: computational and systems biology,...",We have integrated a comprehensive and unique ...,,Cell migration is a profoundly heterogeneous p...,,Mesenchymal ( lamellipodial ) migration is het...,To enable the detection of discrete mesenchyma...,,We here present an integrated analytical appro...
2,Parasitic helminths are inducers of chronic di...,"Alveolar echinococcosis , caused by Echinococc...","[Abstract, Introduction, Materials and Methods...","[medicine, immune cells, clinical immunology, ...",journal.pntd.0001516,"## Keywords: medicine, immune cells, clinical ...","{'Keywords: medicine, immune cells, clinical i...",,,The metacestode larval stage of the fox-tapewo...,All experiments were carried out in accordance...,"Alveolar echinococcosis , caused by Echinococc...",The morphology of the three different E . mult...,,As typical in the case of helminth infections ...
3,Genes encode instructions to make proteins and...,Errors during transcription may play an import...,"[Abstract, Materials and methods]","[short report, computational and systems biology]",elife-09945-v2,"## Keywords: short report, computational and s...","{'Keywords: short report, computational and sy...",Much existing RNA-seq data is available as bam...,,,,Errors during transcription may play an import...,,,
4,Rough-skinned newts produce tetrodotoxin or TT...,Rough-skinned newts ( Taricha granulosa ) use ...,"[Abstract, Introduction, Results, Discussion, ...",[evolutionary biology],elife-53898-v1,## Keywords: evolutionary biology\n## Abstract...,"{'Keywords: evolutionary biology': '', 'Abstra...",Adult male rough-skinned newts ( Taricha granu...,,Coevolutionary interactions among species are ...,,Rough-skinned newts ( Taricha granulosa ) use ...,To investigate whether bacterial symbionts pro...,,"In this study , we found that bacterial isolat..."
5,DNA carries the genetic information that is es...,The Dna2 nuclease-helicase maintains genomic i...,"[Abstract, Introduction, Results and discussio...","[chromosomes and gene expression, structural b...",elife-09832-v3,"## Keywords: chromosomes and gene expression, ...","{'Keywords: chromosomes and gene expression, s...",Full-length mouse Dna2 was cloned into a pFast...,,Dna2 has nuclease and helicase activities and ...,,The Dna2 nuclease-helicase maintains genomic i...,,,
6,Interferon Regulatory Factor 3 ( IRF3 ) is an ...,Innate immunity is the first line of defense a...,"[Abstract, Introduction, Results, Discussion, ...",[],journal.ppat.1004779,## Keywords: \n## Abstract\nInnate immunity is...,"{'Keywords:': '', 'Abstract': 'Innate immunity...",,,"Toxoplasma gondii , an obligate intracellular ...","H196 , H1048 , HME , and HEK293 cell lines wer...",Innate immunity is the first line of defense a...,To investigate whether the type I IFN system r...,,"We have uncovered a new signaling pathway , PI..."
7,"Inside cells , proteins are produced by comple...","Using cryo-electron microscopy ( cryo-EM ) , w...","[Abstract, Introduction, Results, Discussion, ...",[structural biology and molecular biophysics],elife-60482-v2,## Keywords: structural biology and molecular ...,{'Keywords: structural biology and molecular b...,E . coli 70S ribosome purification ( Travin et...,,The ribosome performs the crucial task of tran...,,"Using cryo-electron microscopy ( cryo-EM ) , w...",We determined the structure of the E . coli 70...,,High-resolution cryo-EM maps are now on the cu...
8,Plasmacytoid dendritic cells ( pDC ) are innat...,Plasmacytoid dendritic cells ( pDC ) are innat...,"[Abstract, Introduction, Results, Discussion, ...","[medicine and health sciences, lysosomes, vesi...",journal.ppat.1005553,"## Keywords: medicine and health sciences, lys...","{'Keywords: medicine and health sciences, lyso...",,,Type I interferon ( IFN ) plays a dichotomous ...,PBMCs were separated on Ficoll-Hypaque ( Amers...,Plasmacytoid dendritic cells ( pDC ) are innat...,We hypothesized that HIV envelope protein inte...,,Although many receptors and signaling pathways...
9,Gene action determines how mutations affect ph...,The genetic component of complex disease risk ...,"[Abstract, Introduction, Results and Discussio...","[genome-wide association studies, deletion mut...",journal.pgen.1006573,"## Keywords: genome-wide association studies, ...","{'Keywords: genome-wide association studies, d...",,,"Risk for complex diseases in humans , such as ...",Using the fwdpp template library v0 . 2 . 8 [8...,The genetic component of complex disease risk ...,,"As in [36] , we simulate a 100 kilobase region...",
