In [1]:
import os
import time
import json
from openai import OpenAI
from dotenv import load_dotenv

import pandas as pd

In [2]:
# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# Build the API client from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
# Folder holding the JSONL data files, relative to this notebook.
# The trailing slash matters: file names are appended to it directly.
DATA = "../../data/"

# eLife test split, read as JSON Lines (one record per line).
filename = "eLife_test.jsonl"
df = pd.read_json(f"{DATA}{filename}", lines=True, orient="records")
df.head()

Unnamed: 0,article,headings,keywords,id
0,Acylation of diverse carbohydrates occurs acro...,"[Abstract, Introduction, Results and discussio...","[biochemistry and chemical biology, computatio...",elife-81547-v1
1,Honey bee ecology demands they make both rapid...,"[Abstract, Introduction, Results, Discussion, ...",[computational and systems biology],elife-86176-v2
2,"Biguanides , including the world’s most prescr...","[Abstract, Introduction, Results, Discussion, ...",[genetics and genomics],elife-82210-v1
3,Ecological relationships between bacteria medi...,"[Abstract, Introduction, Results, Discussion, ...","[microbiology and infectious disease, ecology]",elife-83152-v2
4,Gamma oscillations are believed to underlie co...,"[Abstract, Introduction, Results, Discussion, ...",[neuroscience],elife-83044-v2


In [4]:
# Show the (rows, columns) dimensions of the loaded eLife frame.
df.shape

(142, 4)

In [7]:
SLEEP_TIME = 10  # seconds to pause after every request

def sumarize_article(content, min_words=250, max_words=500, client=client):
    """
        Summarize an article with the OpenAI chat completions API.

        input:
            content (str): article text to summarize
            min_words (int): lower word bound requested in the prompt
            max_words (int): upper word bound requested in the prompt
            client: OpenAI client used to send the request
        output:
            summary text (str); "" when content is empty/not a string,
            when the request fails, or when the reply carries no text

        The word bounds are only requested in the prompt, not enforced here.
        Every request (successful or not) is followed by a pause of
        SLEEP_TIME seconds.
    """
    # Nothing to summarize: skip the API call (and the pause) entirely.
    if not isinstance(content, str) or not content.strip():
        return ""

    prompt = f'Simplify and summarize in minimum {min_words} to maximum {max_words} words, combine answer into 1 paragraph, keep important factual details:  "{content}"'
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                },
            ],
            max_tokens=2048
        )

        # message.content may be None; normalize to "" so callers always get a str.
        summary = response.choices[0].message.content or ""

    except Exception as err:
        # Deliberately best-effort: one failed request must not abort a long
        # batch run, so report the error and fall back to an empty summary.
        print("Skipping, error : ", err)
        summary = ""

    # pause to avoid hitting bandwidth limit (~ 14K token / minute)
    print(f"Completed. Pausing for {SLEEP_TIME} secs...", end="")
    time.sleep(SLEEP_TIME)
    print("OK")

    return summary

In [8]:
# Smoke test: summarize the first eLife article, truncated to its first 20,000 characters.
sumarize_article(df.loc[0, 'article'][:20000])

Completed. Pausing for 10 secs...OK


'Acylation of diverse carbohydrates with proteins containing a membrane-bound acyltransferase-3 (AT3) domain is crucial for essential processes in bacteria, such as symbiosis, viral and antimicrobial resistance, and antibiotic biosynthesis. Evolutionary co-variance analysis was employed to construct a computational model of the structure of a bacterial O-antigen modifying acetyltransferase, OafB, revealing a unique fold for the AT3 domain with 10 transmembrane helices facilitating acetyl group transfer. AT3 domains are found in all life domains, including bacteria, where they are involved in acylating various extracytoplasmic and surface polysaccharides with significant implications in bacterial physiology and pathogenesis. While mainly standalone proteins, AT3 domains can also be fused with other domains such as SGNH, highlighting their versatile functions. Key acylation processes mediated by AT3 proteins in bacteria involve O-antigen and peptidoglycan modifications that influence bac

In [10]:
# apply to all rows in test set
# NOTE: sumarize_article pauses SLEEP_TIME seconds after each request, so this cell is slow.
text_cap = 20_000  # limit to 20k characters; set to None for full text (-1 would drop the last character)

print("Summarization process started...")
# Truncate each article to text_cap characters before sending it to the summarizer.
df["gpt_summary"] = df["article"].apply(lambda text: sumarize_article(text[:text_cap]))
print("Completed")

Summarization process started...
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs.

In [11]:
# PLOS test split, read as JSON Lines (one record per line).
filename = "PLOS_test.jsonl"
df_plos = pd.read_json(f"{DATA}{filename}", lines=True, orient="records")
df_plos.head()

Unnamed: 0,article,headings,keywords,id
0,Lung-resident ( LR ) mesenchymal stem and stro...,"[Abstract, Introduction, Results, Discussion, ...","[immune system, medical conditions, molecular ...",journal.ppat.1009789
1,Visceral leishmaniasis ( VL ) is endemic in So...,"[Abstract, Introduction, Methods, Results, Dis...","[neonates, clinical laboratory sciences, trans...",journal.pntd.0007992
2,A high burden of Salmonella enterica subspecie...,"[Abstract, Introduction, Methods, Results, Dis...","[pathogens, medical conditions, taxonomy, bact...",journal.pntd.0010704
3,Severe Acute Respiratory Syndrome Coronavirus-...,"[Abstract, Introduction, Results, Discussion, ...","[pathogens, amniotes, medical conditions, bind...",journal.ppat.1010691
4,Many fungal species utilize hydroxyderivatives...,"[Abstract, Introduction, Results and discussio...","[taxonomy, proteins, chemistry, genetics, enzy...",journal.pgen.1009815


In [12]:
# Show the (rows, columns) dimensions of the loaded PLOS frame.
df_plos.shape

(142, 4)

In [13]:
print("Summarization process started...")
# Summarize every PLOS article, truncated to text_cap characters, in row order.
df_plos["gpt_summary"] = [
    sumarize_article(article[:text_cap]) for article in df_plos["article"]
]
print("Completed")

Summarization process started...
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs...OK
Completed. Pausing for 10 secs.

In [19]:
# Persist both frames (including the gpt_summary column) as CSV, without the index column.
df.to_csv(path_or_buf="elife.csv", index=False)
df_plos.to_csv(path_or_buf="plos.csv", index=False)

In [21]:
# CSV files produced by the export cell, paired by position with the
# plain-text files to write (one summary per line).
filenames = ["./elife.csv",
             "./plos.csv"
            ]

output_filenames = [ "./output/elife.txt",
                     "./output/plos.txt"
                   ]

for fname, output_fname in zip(filenames, output_filenames):
    print("Processing file =", fname)
    # Loop-local name so the notebook-level `df` (eLife frame) is not overwritten.
    summaries_df = pd.read_csv(fname)
    print("Len =", len(summaries_df))

    print("Writing to file =", output_fname)
    # does some simple data cleaning
    # Empty summaries (failed requests) are read back from CSV as NaN; turn them
    # into "" so they become blank lines and row i still maps to line i.
    summaries = (
        summaries_df["gpt_summary"]
        .fillna("")
        .astype(str)
        .str.replace("\n", "", regex=False)
    )

    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(output_fname) or ".", exist_ok=True)

    # Write plain text directly: a CSV writer would quote and double any
    # '"' characters inside a summary, altering the text.
    with open(output_fname, "w", encoding="utf-8") as out_file:
        out_file.writelines(line + "\n" for line in summaries)
    print("Completed")
    print("------------------")

print("=================")
print("All completed")

Processing file = ./elife.csv
Len = 142
Writing to file = ./output/elife.txt
Completed
------------------
Processing file = ./plos.csv
Len = 142
Writing to file = ./output/plos.txt
Completed
------------------
All completed
