In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [1]:
import pandas as pd
from Bio import Entrez
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Set your email for Entrez API
Entrez.email = "bathomansam2334@gmail.com"

In [3]:
# Fetch abstracts from PubMed based on keyword
def fetch_abstracts(keyword: str, max_results: int = 10):
    if not keyword:
        raise ValueError("Keyword must not be empty.")

    search_handle = Entrez.esearch(db="pubmed", term=keyword, retmax=max_results)
    search_results = Entrez.read(search_handle)
    search_handle.close()

    id_list = search_results["IdList"]
    fetch_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="xml")
    fetch_results = Entrez.read(fetch_handle)
    fetch_handle.close()

    abstracts = []
    for article in fetch_results['PubmedArticle']:
        try:
            abstract_text = article['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
            abstracts.append(abstract_text)
        except (KeyError, IndexError):
            abstracts.append("")

    return abstracts

In [4]:
# --- Define keywords for multi-class classification ---
keywords = ["cancer", "diabetes", "stroke", "asthma", "hypertension"]
all_data = []

for kw in keywords:
    abstracts = fetch_abstracts(kw, max_results=50)
    temp_df = pd.DataFrame(abstracts, columns=['abstract'])
    temp_df['target'] = kw
    all_data.append(temp_df)

# --- Combine and clean the dataset ---
df = pd.concat(all_data, ignore_index=True)
df = df[df['abstract'].str.strip() != ""]  # Remove empty abstracts
df.dropna(inplace=True)

df

Unnamed: 0,abstract,target
0,Quantitative chemical exchange saturation tran...,cancer
1,Cancer has become a leading cause of mortality...,cancer
2,Nano-catalytic therapy is an emerging tumor th...,cancer
3,Ovarian cancer is the eighth most common cause...,cancer
4,Missed and delayed cancer diagnoses worsen pat...,cancer
...,...,...
245,Hereditary hemorrhagic telangiectasia (HHT) is...,hypertension
246,"Dasatinib, a Tyrosine kinase inhibitor, functi...",hypertension
247,Invasive treatments for chronic thromboembolic...,hypertension
248,Thyroid dysfunction can cause several cardiova...,hypertension


In [5]:
# --- Train classifier ---
X = df['abstract']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
acc

0.6086956521739131