In [2]:
# Modules
from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
from Bio import Entrez
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List
import asyncio
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
# from typing import Literal
import statistics

In [5]:
load_dotenv('../../.env')

Entrez.email = os.getenv('ENTREZ_EMAIL')
Entrez.api_key = os.getenv('ENTREZ_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [6]:
# Test the OpenAI API key is working

client = OpenAI(
  api_key=openai_api_key,  # this is also the default, it can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "What is the capital city of Victoria? Answer with only the name of the city",
        }
    ],
    model="gpt-4o-mini",
)

result = chat_completion.choices[0].message.content
print(result)

Melbourne


In [7]:
# Prepare dummy variables for testing purposes

user_query = "Identify datasets and samples which are relevant to exploring immunotherapies for lung cancer"
num_queries = 3

In [8]:
# Prepare functions for term extraction from research query

# Prepare output structures
## For term extraction from user query
class ExtractedTerms(BaseModel):
    extracted_terms: List[str] = Field(description="List of terms extracted from the query")
    expanded_terms: List[str] = Field(description="List of related terms generated from the extracted terms")

## For determining dataset relevance
class Assessment(BaseModel):
    ID: str
    RelevanceScore: int = Field(description="Score from 0 to 10, indicating relevance")
    Justification: str = Field(description="A brief explanation for the score")

class Assessments(BaseModel):
    assessments: List[Assessment]


# "Patch" the client?
client = instructor.patch(OpenAI(
    api_key=openai_api_key))

# Define function for term extraction 
def extract_terms(user_query: str) -> List[str]:
    prompt = f"""

## IDENTITY AND PURPOSE
You are an expert in literature searches of biological ideas. Your task is to identify biological term(s) from a query, and generate related terms for the purposes of generating a search query. 

## STEPS

- First, extract the biological term(s) from the input query. These should be specific and fall into one of the following categories:
1. Genes - Examples: BRCA1, TP53
2. Treatments/Methods - Examples: chemotherapy, CRISPR
3. Tissues/Cells - Examples: lung, hepatocytes
4. Diseases - Examples: Alzheimer's disease, lung cancer.

Do not fabricate items if no relevant term exists. Avoid general terms such as "disease" or "variant."

- Second, for each extracted biological term, generate two related terms. Make a considered effort to keep these terms in the same category as the original term. These are examples of an identified term, and possible relevant terms:
1. Genes: BRCA1 - Examples: BRCA2, oncogene
2. Treatments: Chemotherapy - Examples: radiotherapy, monoclonal antibody
3. Tissues/Cells: Lung - Examples: respiratory, alveoli
4. Diseases: Alzheimer's disease - Examples: dementia, amyloid plaque

## OUTPUT

Provide two lists:
1. Extracted terms: The primary terms identified directly from the query.
2. Expanded terms: The related terms generated from the extracted terms.
Do not include categories or justifications.

## INPUT
User query: {user_query}"""

    extracted_terms = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=1,
        response_model=ExtractedTerms,
        messages=[
            {"role": "system", "content": "You are an assistant that helps to extract and expand terms for better search results."},
            {"role": "user", "content": prompt}
        ]
    )
    
    print(f"Raw extracted terms: {extracted_terms.extracted_terms}")
    print(f"Raw expanded terms: {extracted_terms.expanded_terms}")
    
    all_terms = extracted_terms.extracted_terms + extracted_terms.expanded_terms
    terms_with_filter = [term + ' AND "gse"[Filter]' for term in all_terms]
    return terms_with_filter

# Extension - define function to perform term extraction multiple times
async def extract_terms_multiple(user_query: str, num_queries: int = 3) -> List[str]:
    async def single_extract():
        return extract_terms(user_query)
    
    tasks = [single_extract() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Flatten the list of lists and remove duplicates
    all_terms = list(set([term for sublist in results for term in sublist]))
    return all_terms

# Define function for performing search
def perform_search(term):
    search_handle = Entrez.esearch(db="gds", term=term, retmode="xml", retmax = 50) # CHANGE
    search_results = Entrez.read(search_handle)
    search_handle.close()
    return search_results

# Define function for extracting information from above search results
def extract_geo_info_batch(geo_ids):
    """
    Retrieve GEO information for a batch of GEO IDs.
    """
    ids_str = ",".join(geo_ids)
    handle = Entrez.esummary(db="gds", id=ids_str, retmode="xml")
    output = Entrez.read(handle)
    handle.close()

    data = []
    for geo_id, geo_data in zip(geo_ids, output):
        if isinstance(geo_data, dict):
            data.append({
                'ID': geo_id,
                'Title': geo_data.get('title', 'No title available'),
                'Summary': geo_data.get('summary', 'No summary available'),
                'Accession': geo_data.get('Accession', 'No accession available'),
                'Species': geo_data.get('taxon', 'No taxon available'),
                'Date': geo_data.get('PDAT', 'Date made public unknown')
            })
        else:
            data.append({'ID': geo_id, 'Title': 'Error', 'Summary': 'Unable to fetch data', 'Accession': 'Error'})

    return data

def create_geo_dataframe(geo_ids, batch_size=10):
    """Create a DataFrame from GEO search results using batch processing."""
    data = []
    for i in tqdm(range(0, len(geo_ids), batch_size), desc="Processing GEO IDs in batches"):
        batch_ids = geo_ids[i:i + batch_size]
        data.extend(extract_geo_info_batch(batch_ids))
        time.sleep(0.2)  # Be nice to NCBI servers
    return pd.DataFrame(data)


# Define function for determining relevance of datasets
def assess_relevance_batch(df, query, batch_size=10):
    results = []
    total_batches = (len(df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(df), batch_size), desc="Determining dataset relevance", total=total_batches):
        batch = df.iloc[i:i+batch_size]
        prompt = f"""
## IDENTITY AND PURPOSE

You are a highly knowledgeable biologist tasked with identifying relevant datasets for a given research query. Your goal is to assess NCBI GEO datasets based on their titles and summaries, and determine their relevance to the research question at hand.

## STEPS

1. For each dataset, carefully analyze the provided title and summary.
2. Extract ALL biological concepts represented in the dataset, including but not limited to:
   - Genes and variants investigated (e.g., p53, BRCA1)
   - Species studied (e.g., Homo sapiens, Escherichia coli)
   - Sample sources (e.g., organoid cultures, human samples)
   - Diseases or phenotypes studied (e.g., Alzheimer's disease, lung cancer)
   - Cell types or tissues examined (e.g., lung tissue, neural progenitor cells)
   - Experimental techniques or methodologies used (e.g., RNA-seq, ChIP-seq)
3. Extract ALL biological concepts represented in the research query using the same categories.
4. Assign a relevance score from 0 to 10 in increments of 1, based solely on the provided information. 
   - Do not fabricate or assume information not explicitly stated about the dataset. 
   - If confirmed information about the dataset is POSSIBLY useful for the research question, view this favourably for determining dataset relevance. 
   - Note that the gene, disease, and cell type/tissue being studied are the most important in determining relevance. The other factors are considered minor aspects.
   - Use the following scoring guide:

   0: No relevance. All biological concepts (genes, species, samples, diseases, cell types, methods) are completely unrelated to the research query.
   1: Minimal relevance. One minor aspect is loosely related, but the overall focus is different.
   2: Low relevance. One major aspect aligns with the query, but other key elements differ significantly.
   3: Somewhat low relevance. Two aspects align, but critical elements are still mismatched.
   4: Moderate relevance. Multiple aspects align, but there are still significant differences in focus or approach.
   5: Moderately relevant. Most major aspects align, but there are some notable differences that may limit direct applicability.
   6: Relevant. All major aspects align, but there might be differences in specific genes, cell types, or methodologies that somewhat reduce direct applicability.
   7: Highly relevant. Very close alignment in all major aspects, with only minor differences that don't significantly impact applicability.
   8: Very highly relevant. Near-perfect alignment in all major aspects, with at most one or two minor differences.
   9: Extremely relevant. Perfect alignment in all major aspects, with at most one negligible difference.
   10: Perfectly relevant. The dataset appears to be an exact match for the research query in all aspects.

5. Provide a brief justification (3-4 sentences) for the assigned score, highlighting key similarities and differences.

## OUTPUT
For each dataset, provide a JSON object with the ID, relevance score, and justification. 
- The relevance score should be a number, with no other information.
- The justification should be a 4-5 sentence explanation for the relevance score. 

## HANDLING LIMITED INFORMATION
If the dataset title or summary lacks sufficient detail:
- Focus on the information that is available
- Do not make assumptions about missing information
- Assign a lower score if critical information is absent
- Note the lack of information in the justification

Remember, it's better to assign a lower score due to lack of information than to assume relevance without evidence.

Given the following datasets and query, determine if each dataset is relevant.
        Query: {query}
        Datasets:
        """
        for _, row in batch.iterrows():
            prompt += f"""
            ID: {row['ID']}
            Title: {row['Title']}
            Summary: {row['Summary']}
            Species: {row['Species']}
            """
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.3,
                response_model=Assessments,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that determines dataset relevance and responds in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=10000
            )
            results.extend([assessment.dict() for assessment in response.assessments])
        except Exception as e:
            results.extend([{"ID": row['ID'], "Relevance": "Error", "Justification": str(e)} for _, row in batch.iterrows()])
        time.sleep(1)  # Be nice to the API
    return results

# Extension - define function to assess relevance multiple times
async def assess_relevance_batch_multiple(df, query, num_queries: int = 3, batch_size=20):
    async def single_assess():
        return assess_relevance_batch(df, query, batch_size)
    
    tasks = [single_assess() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Collate results
    collated_results = {}
    for i, result_set in enumerate(results):
        for assessment in result_set:
            id = assessment['ID']
            if id not in collated_results:
                collated_results[id] = {'scores': [], 'justifications': []}
            collated_results[id]['scores'].append(assessment['RelevanceScore'])
            collated_results[id]['justifications'].append(assessment['Justification'])
    
    # Determine final relevance and format output
    final_results = []
    for id, data in collated_results.items():
        mean_score = statistics.mean(data['scores'])
        
        result = {
            'ID': id,
            'RelevanceScore': round(mean_score, 1),
        }
        
        # Add individual scores and justifications
        for i in range(num_queries):
            result[f'IndividualScore{i+1}'] = data['scores'][i] if i < len(data['scores']) else None
            result[f'Justification{i+1}'] = data['justifications'][i] if i < len(data['justifications']) else None
        
        final_results.append(result)
    
    return final_results

async def main(user_query):
    # Extract terms
    search_terms = await extract_terms_multiple(user_query)
    print("Extracted terms:", search_terms)

    # Perform Entrez search and remove duplicates
    geo_ids = set()  # Use a set to automatically remove duplicates
    for term in search_terms:
        search_results = perform_search(term)
        geo_ids.update(search_results.get('IdList', []))  # Update the set with new IDs
    if not geo_ids:
        return pd.DataFrame({'Error': ["No results found for the extracted terms"]})

    # Convert set back to list
    geo_ids = list(geo_ids)

    # Create DataFrame with GEO information
    df = create_geo_dataframe(geo_ids)

    # Assess relevance
    relevance_results = await assess_relevance_batch_multiple(df, user_query, num_queries=num_queries) # Currently have this at 3
    relevance_df = pd.DataFrame(relevance_results)

    # Merge results
    df['ID'] = df['ID'].astype(str)
    relevance_df['ID'] = relevance_df['ID'].astype(str)
    result_df = df.merge(relevance_df, on='ID', how='left')

    # Dynamically create the desired order of columns
    base_columns = ['ID', 'Title', 'Summary', 'Species', 'Accession', 'Date', 'RelevanceScore']
    score_columns = [f'IndividualScore{i+1}' for i in range(num_queries)]
    justification_columns = [f'Justification{i+1}' for i in range(num_queries)]
    desired_order = base_columns + score_columns + justification_columns

    # Reorder columns
    result_df = result_df[desired_order]

    # Reset index
    result_df = result_df.reset_index(drop=True)

    return result_df

In [9]:
# Test case - term extraction
search_terms = await (extract_terms_multiple(user_query))
print(search_terms)

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['targeted therapies', 'non-small cell lung cancer', 'immune checkpoint inhibitors', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'immunotherapy', 'targeted therapy']
Raw extracted terms: ['immunotherapy', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'NSCLC', 'pembrolizumab', 'small cell lung cancer']
['targeted therapies AND "gse"[Filter]', 'immune checkpoint inhibitors AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'small cell lung cancer AND "gse"[Filter]', 'immunotherapy AND "gse"[Filter]', 'NSCLC AND "gse"[Filter]', 'targeted therapy AND "gse"[Filter]', 'immunotherapies AND "gse"[Filter]', 'pembrolizumab AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]']


In [10]:
# Test case - performing search
geo_ids = []
for term in tqdm(search_terms, "Performing search for extracted terms"):
    search_results = perform_search(term)
    geo_ids.extend(search_results.get('IdList', []))


Performing search for extracted terms: 100%|██████████| 11/11 [00:09<00:00,  1.14it/s]


In [11]:
# Test case - create data frame out of search results
df = create_geo_dataframe(geo_ids)
df # Jupyter is quite nice, I don't need to specify some specific function. Just this gives good output.

Processing GEO IDs in batches: 100%|██████████| 55/55 [01:31<00:00,  1.66s/it]


Unnamed: 0,ID,Title,Summary,Accession,Species,Date
0,200278220,Dynamic behavior and lineage plasticity of the...,Repair of the pulmonary vascular bed and the o...,GSE278220,Mus musculus,2024/10/16
1,200278879,Single nucleus RNA sequencing analysis reveals...,Endometrial cancer (EC) is the sixth most prev...,GSE278879,Homo sapiens,2024/10/15
2,200271987,TARGETING KRAS SIGNALING AT THREE INDEPENDENT ...,Pancreatic ductal adenocarcinoma (PDAC) has on...,GSE271987,Mus musculus,2024/10/15
3,200271518,TARGETING KRAS SIGNALING AT THREE INDEPENDENT ...,Pancreatic ductal adenocarcinoma (PDAC) has on...,GSE271518,Mus musculus,2024/10/15
4,200262378,SUMOylation-Induced Membrane Localization of T...,Gastric cancer (GC) remains a significant heal...,GSE262378,Homo sapiens,2024/10/15
...,...,...,...,...,...,...
545,200259380,"TG6050, an oncolytic vaccinia virus encoding i...",By their selective infection or replication in...,GSE259380,Mus musculus,2024/07/30
546,200259379,"TG6050, an oncolytic vaccinia virus encoding i...",By their selective infection or replication in...,GSE259379,Mus musculus,2024/07/30
547,200273176,DNA Methylome Analysis of Human Cancer Cells w...,Genome-wide DNA methylation profiling of non-s...,GSE273176,Homo sapiens,2024/07/28
548,200207715,Extracellular vesicles miR-574-5p and miR-181a...,Plasma from 245 patients with advanced NSCLC w...,GSE207715,Homo sapiens,2024/07/28


In [None]:
# Test case - assessing dataset relevances

relevance_results = await assess_relevance_batch_multiple(df, user_query, num_queries=num_queries) # Currently have this at 3
relevance_df = pd.DataFrame(relevance_results)

# Merge results
df['ID'] = df['ID'].astype(str)
relevance_df['ID'] = relevance_df['ID'].astype(str)
result_df = df.merge(relevance_df, on='ID', how='left')

# Dynamically create the desired order of columns
base_columns = ['ID', 'Title', 'Summary', 'Species', 'Accession', 'Date', 'RelevanceScore']
score_columns = [f'IndividualScore{i+1}' for i in range(num_queries)]
justification_columns = [f'Justification{i+1}' for i in range(num_queries)]
desired_order = base_columns + score_columns + justification_columns

# Reorder columns
result_df = result_df[desired_order]

# Reset index
result_df = result_df.reset_index(drop=True)

result_df # Based on the output I'm assuming the token limit is too low

In [12]:
# Test case - full pipeline

result_df = await (main(user_query))
result_df

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['immune checkpoint inhibitors', 'non-small cell lung cancer', 'monoclonal antibodies', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'tumor microenvironment']
Raw extracted terms: ['immunotherapy', 'lung cancer']
Raw expanded terms: ['immune checkpoint inhibitors', 'NSCLC', 'radiotherapy', 'chemotherapy', 'respiratory', 'alveoli']
Extracted terms: ['radiotherapy AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]', 'chemotherapy AND "gse"[Filter]', 'alveoli AND "gse"[Filter]', 'immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]', 'immunotherapy AND "gse"[Filter]', 'respiratory AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'CAR T-cell therapy AND "gse"[Filter]', 'tumor microenvironment AND "gse"[Filter]', 'monoclonal antibodies AND "

Processing GEO IDs in batches: 100%|██████████| 57/57 [01:51<00:00,  1.95s/it]
Determining dataset relevance: 100%|██████████| 29/29 [08:20<00:00, 17.25s/it]
Determining dataset relevance: 100%|██████████| 29/29 [08:14<00:00, 17.06s/it]
Determining dataset relevance: 100%|██████████| 29/29 [07:46<00:00, 16.07s/it]


Unnamed: 0,ID,Title,Summary,Species,Accession,Date,RelevanceScore,IndividualScore1,IndividualScore2,IndividualScore3,Justification1,Justification2,Justification3
0,200236306,Chromatin accessibility profiling in wild-type...,To investigate the kinetics of chromatin acces...,Mus musculus,GSE236306,2024/06/03,2.0,2.0,2.0,2.0,This dataset focuses on chromatin accessibilit...,This dataset focuses on chromatin accessibilit...,This dataset focuses on chromatin accessibilit...
1,200242839,PD-1 is induced on tumor-associated macrophage...,Obesity is a leading risk factor for progressi...,Homo sapiens; Mus musculus,GSE242839,2024/07/19,7.3,8.0,8.0,6.0,This dataset investigates the role of PD-1 on ...,This dataset investigates the role of PD-1 on ...,This dataset investigates the role of PD-1 on ...
2,200244104,Targeting DHX9 triggers interferon response an...,Activating innate immunity in cancer cells thr...,Homo sapiens,GSE244104,2024/02/01,9.0,9.0,9.0,9.0,This dataset explores the role of DHX9 in smal...,This dataset explores the role of DHX9 in smal...,This dataset explores the role of DHX9 in smal...
3,200202543,Joint single-cell transcriptomics and epigenom...,"In this study, we revealed the molecular netwo...",Mus musculus,GSE202543,2023/10/09,5.0,5.0,5.0,5.0,This dataset examines CAR T cell differentiati...,This dataset examines CAR T cell differentiati...,This dataset examines CAR T cell differentiati...
4,200267560,Microenvironment Shapes Small Cell Lung Cancer...,Small-cell lung cancer (SCLC) is the most fata...,Homo sapiens,GSE267560,2024/05/16,8.0,7.0,9.0,8.0,This dataset maps the tumor microenvironment i...,This dataset provides insights into the tumor ...,This dataset provides insights into the tumor ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,200234976,Pan-cancer mapping of single CD8+ T cell profi...,Cytotoxic CD8+ T cells need to persist and fun...,Mus musculus,GSE234976,2024/07/02,7.3,6.0,8.0,8.0,This dataset investigates CD8+ T cell profiles...,This dataset examines CD8+ T cell profiles and...,This dataset investigates CD8+ T cell profiles...
558,200247055,A pair of NUCLEAR FACTOR Y transcription facto...,This SuperSeries is composed of the SubSeries ...,Arabidopsis thaliana,GSE247055,2024/07/15,0.3,0.0,0.0,1.0,This dataset focuses on plant biology and does...,This dataset focuses on transcription factors ...,This dataset focuses on plant biology and jasm...
559,200268155,Epigenomic signatures of sarcomatoid different...,Renal cell carcinoma with sarcomatoid differen...,Homo sapiens,GSE268155,2024/05/23,6.3,7.0,6.0,6.0,This dataset discusses renal cell carcinoma wi...,This dataset examines renal cell carcinoma wit...,This dataset discusses renal cell carcinoma wi...
560,200248348,A Th17 cell-intrinsic glutathione/mitochondria...,The intestinal tract generates significant rea...,Mus musculus,GSE248348,2024/07/01,1.0,1.0,1.0,1.0,This dataset focuses on Th17 cells and their r...,The dataset focuses on Th17 cells and their ro...,The dataset primarily focuses on Th17 cells an...


In [13]:
result_df.to_csv("../results/2024_07_25_DetermineDatasetRelevance/DatasetRelevance_MultipleScores.csv")