## Abstract Summarizer from Research Paper
#### Summarizes research paper abstracts; currently focused on biological science papers

1. Import packages

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\HP Elitebook
[nltk_data]     X360 i5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\HP Elitebook X360
[nltk_data]     i5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HP Elitebook X360
[nltk_data]     i5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\HP Elitebook X360
[nltk_data]     i5\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

2. Scrape training data from PubMed

In [2]:
# Function for scraping pubmed using the pubmed API
def get_pubmed_abstracts(query, results=5):
    """Search PubMed through the NCBI E-utilities API and collect abstracts.

    Parameters
    ----------
    query : str
        PubMed search term. It is sent as a request parameter, so spaces and
        special characters are URL-encoded instead of being pasted raw into
        the URL.
    results : int, default 5
        Maximum number of article IDs requested from ``esearch`` (``retmax``).

    Returns
    -------
    list of dict
        One ``{"PMID": <id>, "Abstract": <text>}`` entry per article in which
        at least one ``AbstractText`` element was found. Articles without an
        abstract are skipped, so the list can be shorter than ``results``.

    Raises
    ------
    requests.HTTPError
        If ``esearch`` or ``efetch`` answers with an HTTP error status.
    requests.Timeout
        If a request does not complete within the timeout.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    timeout_seconds = 30  # avoid hanging the notebook forever on a stalled connection

    # Search for the paper IDs
    search_response = requests.get(
        f"{base_url}esearch.fcgi",
        params={"db": "pubmed", "term": query, "retmax": results, "retmode": "json"},
        timeout=timeout_seconds,
    )
    search_response.raise_for_status()
    ids = search_response.json().get("esearchresult", {}).get("idlist", [])

    # Extract abstracts using the IDs (one efetch request per article)
    abstracts = []
    for pmid in ids:
        fetch_response = requests.get(
            f"{base_url}efetch.fcgi",
            params={"db": "pubmed", "id": pmid, "retmode": "xml"},
            timeout=timeout_seconds,
        )
        fetch_response.raise_for_status()
        soup = BeautifulSoup(fetch_response.text, "xml")

        # Prefer the first <Abstract> element so AbstractText nodes that may
        # live under other containers are not mixed in; fall back to the whole
        # document when there is no <Abstract> element.
        container = soup.find("Abstract")
        if container is None:
            container = soup

        # An abstract can be split over several AbstractText elements; keeping
        # only the first one would truncate it, so all parts are joined.
        sections = [node.text for node in container.find_all("AbstractText")]
        if sections:
            abstracts.append({"PMID": pmid, "Abstract": " ".join(sections)})
    return abstracts

In [33]:
# Run one PubMed search, echo every hit, and persist the raw results
query = "virus" # Changeable
abstracts = get_pubmed_abstracts(query, results=20)

# Human-readable listing, numbered from 1
for number, record in enumerate(abstracts, start=1):
    print(f"\nAbstract {number} (PMID {record['PMID']}):\n{record['Abstract']}\n")

# Persist the scraped records as pretty-printed JSON
with open("abstracts.json", "w") as json_file:
    json_file.write(json.dumps(abstracts, indent=4))


Abstract 1 (PMID 39921843):
Influenza A virus (IAV) remains a significant public health concern due to its annual epidemics and potential for global pandemics. Despite the availability of countermeasures such as vaccines and antiviral treatments, their effectiveness is often questioned due to the emergence of novel strains with antiviral resistance and the variable efficacy of influenza vaccines compared to other vaccines. Traditionally, influenza vaccination strategies have focused on matrix, neuraminidase, and nucleoproteins. In this study, considering the crucial roles of HA and RdRp (PA, PB1, and PB2) of Influenza A, a reverse vaccinology approach is put forth in designing a possible promising antigenic protein toward the development of vaccines against H1N1 viruses. With the development of immunoinformatics approach, one can design/construct potential candidates for vaccine formulation against IAV with the epitope segments identified based on B- and T-cell recognition linked via 

In [5]:
# (EXECUTE THIS CELL A SINGLE TIME)
# Seed the accumulating DataFrame with the first batch of scraped records
df_abstracts = pd.DataFrame(data=abstracts)

In [34]:
# (RE-RUN AFTER EACH ADDITIONAL SCRAPE, AS OFTEN AS NEEDED)
# Wrap the newest batch of records in its own frame
new_abstracts = pd.DataFrame(data=abstracts)

# Stack the new batch under the rows collected so far, renumbering the index
df_abstracts = pd.concat(objs=[df_abstracts, new_abstracts], ignore_index=True)

In [35]:
# Quick inspection: first rows, schema summary, and missing values per column
print(df_abstracts.head())
print(df_abstracts.info())
print(df_abstracts.isna().sum())
# Count rows that are exact copies of an earlier row
duplicated_data = df_abstracts.duplicated(keep="first").sum()
print("duplicated data: ", duplicated_data)

       PMID                                           Abstract
0  39921842  Aberrant activation of the Wnt/β-catenin signa...
1  39921807  Cancer cells can modulate the expression of ma...
2  39921789  Colorectal cancer (CRC) is a molecularly heter...
3  39921761  Ingenol mebutate (IM), a diterpene ester deriv...
4  39921753  Metastasis and chemoresistance are often major...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   PMID      155 non-null    object
 1   Abstract  155 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB
None
PMID        0
Abstract    0
dtype: int64
duplicated data:  26


In [36]:
# Delete duplicates: keep the first row seen for each PMID and renumber the index
df_abstracts = (
    df_abstracts
    .drop_duplicates(subset=["PMID"], keep="first")
    .reset_index(drop=True)
)
# Check duplicates on the same key that was used for the removal, so the
# verification actually confirms that every PMID is now unique
duplicated_data = df_abstracts.duplicated(subset=["PMID"]).sum()
print("duplicated data: ",duplicated_data)

duplicated data:  0


In [37]:
# Write the collected abstracts to disk as CSV, leaving out the row index
df_abstracts.to_csv(path_or_buf="abstracts.csv", index=False)

3. Preprocessing

In [3]:
# Read the scraped abstracts back from the CSV file
df_data = pd.read_csv(filepath_or_buffer="abstracts.csv")

In [12]:
# Initialize stopwords and lemmatizer
# The corpus reader is imported under an alias because the name `stopwords` is
# rebound to a plain set below; reading from the alias keeps this cell safe to
# re-run (otherwise the second run would call `.words` on the set and fail).
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [13]:
# Function for preprocess
def preprocess(text):
    """Clean one abstract and return it as a space-separated token string.

    Steps: lowercase, strip disallowed characters, tokenize, drop English
    stopwords, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw abstract text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV cell, or None) is treated as an
        empty abstract.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for non-string
        input.
    """
    # Missing values arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ""

    text = text.lower() # convert to lowercase
    # Keep only letters, digits, whitespace and the symbols % < > / = + - ^ . ,
    # Every other character is removed, including non-ASCII letters.
    text = re.sub(r'[^a-zA-Z0-9%<>/=+\-\^.,\s]', '', text)
    tokens = word_tokenize(text) # tokenize text
    # remove stopwords and apply lemmatization
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return " ".join(processed_tokens)

In [15]:
# Clean every abstract and store the result in a new column
df_data["Processed_abstract"] = df_data["Abstract"].map(preprocess)
# Show the raw and the cleaned text side by side for the first rows
print(df_data.head()[["Abstract", "Processed_abstract"]])
# Persist the frame, including the new column, without the row index
df_data.to_csv(path_or_buf="preprocessed_data.csv", index=False)

                                            Abstract  \
0  Aberrant activation of the Wnt/β-catenin signa...   
1  Cancer cells can modulate the expression of ma...   
2  Colorectal cancer (CRC) is a molecularly heter...   
3  Ingenol mebutate (IM), a diterpene ester deriv...   
4  Metastasis and chemoresistance are often major...   

                                  Processed_abstract  
0  aberrant activation wnt/-catenin signaling pat...  
1  cancer cell modulate expression many protein e...  
2  colorectal cancer crc molecularly heterogeneou...  
3  ingenol mebutate im , diterpene ester derived ...  
4  metastasis chemoresistance often major challen...  


4. Prepare training data

In [17]:
df_final = pd.read_csv("preprocessed_data.csv")
# An abstract whose processed text is empty is written to CSV as an empty
# field and comes back as NaN (a float). Replace those with empty strings so
# every item handed to the tokenizer is a str.
processed_texts = df_final["Processed_abstract"].fillna("").astype(str)
# Tokenizer setup
max_vocab_size = 15000
max_sequence_length = 500
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(processed_texts)
# Convert text to integer sequences, then pad/truncate at the end so every
# sequence has exactly max_sequence_length entries
sequences = tokenizer.texts_to_sequences(processed_texts)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding="post", truncating="post")

In [18]:
# Partition the padded sequences: 80% train, remaining 20% halved into validation and test
# Materialize the sequences as a NumPy array first
X = np.array(padded_sequences)
# First cut: training set versus held-out remainder
X_train, X_temp = train_test_split(X, random_state=42, test_size=0.2)
# Second cut: split the remainder evenly into validation and test
X_val, X_test = train_test_split(X_temp, random_state=42, test_size=0.5)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 103, Validation size: 13, Test size: 13


In [None]:
pip install transformers datasets torch sentencepiece accelerate
