In [14]:
# Modules
from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
from Bio import Entrez
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List
import asyncio
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
from typing import Literal

In [2]:
# Load credentials from the .env file one directory up (path is relative to the working directory).
# This must run before the os.getenv() calls below so the variables are visible.
load_dotenv('../.env')

# NCBI Entrez settings; os.getenv() returns None when a variable is not set.
Entrez.email = os.getenv('ENTREZ_EMAIL')
Entrez.api_key = os.getenv('ENTREZ_API_KEY')
# Key passed to the OpenAI clients constructed in later cells.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Smoke test: send one trivial prompt to confirm that the OpenAI API key is accepted

# The key is passed explicitly here; per the original note it is also the client's default and could be omitted
client = OpenAI(api_key=openai_api_key)

chat_completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        dict(
            role="user",
            content="What is the capital city of Victoria? Answer with only the name of the city",
        ),
    ],
)

# Only the text of the first returned choice is of interest
result = chat_completion.choices[0].message.content
print(result)

Melbourne


In [4]:
# Example research question used as input by the test cells and the full pipeline run below

user_query = "Identify datasets and samples which are relevant to exploring immunotherapies for lung cancer"

In [17]:
# Define the pipeline: output schemas, term extraction, GEO search and summary retrieval, and relevance assessment

# Prepare output structures
## For term extraction from user query
# Response schema passed as response_model in extract_terms(); `terms` holds the strings the model extracted.
# (Documented with comments rather than a class docstring so the schema itself is left untouched.)
class ExtractedTerms(BaseModel):
    terms: List[str] = Field(description="List of extracted primary biological terms")

## For determining dataset relevance
# One verdict for one dataset. The capitalised field names are reused downstream as
# dictionary keys and DataFrame column names ('ID', 'Relevance', 'Justification').
class Assessment(BaseModel):
    ID: str  # dataset ID, as listed in the prompt built by assess_relevance_batch()
    Relevance: Literal['Relevant', 'Not Relevant'] = Field(description="Either 'Relevant' or 'Not Relevant'")
    Justification: str = Field(description="A brief 3-sentence explanation")

# Wrapper schema so that a single LLM response can carry the verdicts for a whole batch of datasets;
# passed as response_model in assess_relevance_batch().
class Assessments(BaseModel):
    assessments: List[Assessment]

# Wrap the OpenAI client with instructor. The functions below call
# client.chat.completions.create(..., response_model=...) and read attributes of the returned
# object (e.g. `.terms`, `.assessments`), which relies on this patched client.
# NOTE: this rebinds `client`, replacing the plain OpenAI client created in the smoke-test cell.
client = instructor.patch(OpenAI(
    api_key=openai_api_key))

# Define function for term extraction 
def extract_terms(user_query: str) -> List[str]:
    """Extract the primary biological terms from a research query as Entrez search strings.

    The query is embedded in a fixed instruction prompt and sent to the module-level
    `client`; the reply is parsed into an `ExtractedTerms` object.

    Args:
        user_query: Free-text research question.

    Returns:
        One search string per extracted term, each with ' AND "gse"[Filter]' appended
        (presumably to restrict hits to GEO series records; confirm against Entrez docs).
    """
    prompt = f"""Your task is to identify the primary biological term(s) in the query. These should be specific and fall into one of the following categories:
1. Genes - Examples: BRCA1, TP53
2. Treatments - Examples: chemotherapy, CRISPR
3. Tissues/Cells - Examples: lung, hepatocytes
4. Diseases - Examples: Alzheimer's disease, lung cancer.
Do not fabricate items if no relevant term exists. Avoid general terms such as "disease" or "variant."
User query: {user_query}"""

    system_message = {"role": "system", "content": "You are an assistant that helps to extract terms for better search results."}
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.8,
        response_model=ExtractedTerms,
        messages=[system_message, user_message],
    )

    # Every term gets the same filter clause appended
    gse_filter = ' AND "gse"[Filter]'
    search_strings = []
    for term in response.terms:
        search_strings.append(term + gse_filter)
    return search_strings

# Extension - define function to perform term extraction multiple times
async def extract_terms_multiple(user_query: str, num_queries: int = 3) -> List[str]:
    """Run term extraction several times concurrently and return the union of the terms.

    `extract_terms` is a synchronous function that blocks on an HTTP request. Wrapping it
    in a plain coroutine would make `asyncio.gather` execute the calls one after another
    and block the event loop, so each call is handed to the loop's default thread pool.

    Args:
        user_query: Free-text research question.
        num_queries: Number of independent extraction runs.

    Returns:
        De-duplicated search strings in first-seen order, so the result is reproducible
        for identical model replies.
    """
    loop = asyncio.get_running_loop()

    async def single_extract():
        # Run the blocking call in a worker thread so the runs genuinely overlap
        terms = await loop.run_in_executor(None, extract_terms, user_query)
        print(f"Extracted terms: {terms}")
        return terms

    tasks = [single_extract() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)

    # Flatten the list of lists; dict keys drop duplicates while keeping insertion order
    all_terms = list(dict.fromkeys(term for sublist in results for term in sublist))
    return all_terms

# Define function for performing search
def perform_search(term, retmax=15):
    """Search the Entrez "gds" database for one query string.

    Args:
        term: Entrez query string (e.g. one of the strings produced by extract_terms).
        retmax: Maximum number of record IDs to request. Defaults to 15, the value that
            was previously hard-coded.

    Returns:
        The parsed esearch result from Entrez.read; main() reads its 'IdList' entry.

    Raises:
        Whatever Entrez.esearch / Entrez.read raise; the handle is closed in either case.
    """
    search_handle = Entrez.esearch(db="gds", term=term, retmode="xml", retmax=retmax)
    try:
        return Entrez.read(search_handle)
    finally:
        # Close the handle even when parsing fails, so the connection is not leaked
        search_handle.close()

# Define function for extracting information from above search results
def extract_geo_info_batch(geo_ids):
    """
    Retrieve GEO information for a batch of GEO IDs.

    Sends one Entrez esummary request for all IDs and converts each returned record into
    a flat dictionary.

    Args:
        geo_ids: Iterable of record IDs; values are converted with str() for the request.

    Returns:
        A list of dicts with the keys 'ID', 'Title', 'Summary', 'Accession' and 'Date'.
        Records that are not dictionaries produce a placeholder row with the same keys,
        so every row has an identical schema. An empty input returns an empty list
        without contacting NCBI.
    """
    geo_ids = list(geo_ids)
    if not geo_ids:
        # An esummary request with an empty id list is not a valid query
        return []

    ids_str = ",".join(str(geo_id) for geo_id in geo_ids)
    handle = Entrez.esummary(db="gds", id=ids_str, retmode="xml")
    try:
        output = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails
        handle.close()

    data = []
    # NOTE(review): pairing by position assumes esummary returns one record per requested
    # ID, in request order; confirm against the Entrez documentation.
    for geo_id, geo_data in zip(geo_ids, output):
        if isinstance(geo_data, dict):
            data.append({
                'ID': geo_id,
                'Title': geo_data.get('title', 'No title available'),
                'Summary': geo_data.get('summary', 'No summary available'),
                'Accession': geo_data.get('Accession', 'No accession available'),
                'Date': geo_data.get('PDAT', 'Date made public unknown')
            })
        else:
            # Include 'Date' so placeholder rows have the same columns as normal rows
            data.append({
                'ID': geo_id,
                'Title': 'Error',
                'Summary': 'Unable to fetch data',
                'Accession': 'Error',
                'Date': 'Error'
            })

    return data

# Define function for constructing data frame out of extracted results
def create_geo_dataframe(geo_ids, batch_size=20):
    """
    Build a DataFrame of GEO summaries for all given IDs, fetched batch by batch.

    Args:
        geo_ids: Sliceable sequence of GEO record IDs.
        batch_size: Number of IDs sent to extract_geo_info_batch() per request.

    Returns:
        A DataFrame built from the row dictionaries returned for every batch.
    """
    rows = []
    batch_starts = range(0, len(geo_ids), batch_size)
    for start in tqdm(batch_starts, desc="Processing GEO IDs in batches"):
        stop = start + batch_size
        rows += extract_geo_info_batch(geo_ids[start:stop])
        # Short pause between requests to be nice to NCBI servers
        time.sleep(0.2)
    return pd.DataFrame(rows)

# Define function for determining relevance of datasets
def assess_relevance_batch(df, query, batch_size=20):
    """Ask the LLM, batch by batch, whether each dataset in `df` is relevant to `query`.

    Args:
        df: DataFrame with at least the columns 'ID', 'Title' and 'Summary'.
        query: The research question the datasets are judged against.
        batch_size: Number of datasets described in a single prompt.

    Returns:
        A list of dicts with the keys 'ID', 'Relevance' and 'Justification', holding
        exactly one entry per distinct dataset ID in each batch, with IDs as strings.
        The model's answer is reconciled against the batch: verdicts for IDs that were
        not asked about are dropped, only the first verdict per ID is kept, and IDs the
        model left out get a row with Relevance "Error". If the API call itself fails,
        every dataset in the batch gets an "Error" row carrying the exception message.
    """
    results = []
    total_batches = (len(df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(df), batch_size), desc="Determining dataset relevance", total=total_batches):
        batch = df.iloc[i:i+batch_size]
        # Distinct IDs of this batch, as strings, in row order
        batch_ids = list(dict.fromkeys(str(dataset_id) for dataset_id in batch['ID']))
        prompt = f"""Given the following datasets and query, determine if each dataset is relevant.
        Query: {query}
        Datasets:
        """
        for _, row in batch.iterrows():
            prompt += f"""
            ID: {row['ID']}
            Title: {row['Title']}
            Summary: {row['Summary']}
            """
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.8,
                response_model=Assessments,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that determines dataset relevance and responds in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=3000
            )
            # Keep the first verdict for each requested ID; ignore IDs that were never asked about.
            # The dict is built field by field so it does not depend on the pydantic version.
            verdicts = {}
            for assessment in response.assessments:
                dataset_id = str(assessment.ID).strip()
                if dataset_id in batch_ids and dataset_id not in verdicts:
                    verdicts[dataset_id] = {
                        "ID": dataset_id,
                        "Relevance": assessment.Relevance,
                        "Justification": assessment.Justification,
                    }
            for dataset_id in batch_ids:
                results.append(verdicts.get(dataset_id, {
                    "ID": dataset_id,
                    "Relevance": "Error",
                    "Justification": "No assessment returned by the model for this dataset",
                }))
        except Exception as e:
            # Best effort: record the failure for this batch and carry on with the next one
            results.extend([{"ID": dataset_id, "Relevance": "Error", "Justification": str(e)} for dataset_id in batch_ids])
        time.sleep(1)  # Be nice to the API
    return results

# Extension - define function to assess relevance multiple times
async def assess_relevance_batch_multiple(df, query, num_queries: int = 3, batch_size=20):
    """Assess relevance several times concurrently and combine the runs by majority vote.

    `assess_relevance_batch` is synchronous and blocks on HTTP requests and sleeps.
    Wrapping it in a plain coroutine would make `asyncio.gather` run the assessments
    sequentially and block the event loop, so each run is sent to the loop's default
    thread pool instead.

    Args:
        df: DataFrame of datasets, passed unchanged to assess_relevance_batch().
        query: The research question the datasets are judged against.
        num_queries: Number of independent assessment runs.
        batch_size: Datasets per prompt, forwarded to assess_relevance_batch().

    Returns:
        One dict per dataset ID with the keys 'ID', 'Relevance' (the most frequent
        verdict), 'RelevanceConfidence' ("votes for the winner/total votes") and
        'Justification' (all justifications joined with "; ").
    """
    loop = asyncio.get_running_loop()
    tasks = [
        loop.run_in_executor(None, assess_relevance_batch, df, query, batch_size)
        for _ in range(num_queries)
    ]
    results = await asyncio.gather(*tasks)

    # Collate results: tally the verdicts and collect the justifications per dataset ID
    collated_results = {}
    for result_set in results:
        for assessment in result_set:
            dataset_id = assessment['ID']
            entry = collated_results.setdefault(
                dataset_id, {'relevance_counts': Counter(), 'justifications': []}
            )
            entry['relevance_counts'][assessment['Relevance']] += 1
            entry['justifications'].append(assessment['Justification'])

    # Determine final relevance and format output
    final_results = []
    for dataset_id, data in collated_results.items():
        relevance_counts = data['relevance_counts']
        total_counts = sum(relevance_counts.values())
        # On a tie, Counter.most_common keeps the verdict that was seen first
        most_common_relevance = relevance_counts.most_common(1)[0][0]

        final_results.append({
            'ID': dataset_id,
            'Relevance': most_common_relevance,
            'RelevanceConfidence': f"{relevance_counts[most_common_relevance]}/{total_counts}",
            'Justification': "; ".join(data['justifications'])
        })

    return final_results

async def main(user_query):
    """Run the full pipeline: extract terms, search GEO, fetch summaries, assess relevance.

    Args:
        user_query: Free-text research question.

    Returns:
        A DataFrame with the columns 'ID', 'Title', 'Summary', 'Accession', 'Date',
        'Relevance', 'RelevanceConfidence' and 'Justification', one row per dataset in
        the order the IDs were first returned by the searches. If no search returns an
        ID, a one-row DataFrame with a single 'Error' column is returned instead.
    """
    # Extract terms
    search_terms = await extract_terms_multiple(user_query)
    print("Extracted terms:", search_terms)

    # Perform Entrez search and remove duplicates.
    # Dict keys are used instead of a set so that the first-seen order is kept and the
    # row order of the result is reproducible between runs.
    unique_ids = {}
    for term in search_terms:
        search_results = perform_search(term)
        unique_ids.update(dict.fromkeys(search_results.get('IdList', [])))
    if not unique_ids:
        return pd.DataFrame({'Error': ["No results found for the extracted terms"]})

    geo_ids = list(unique_ids)

    # Create DataFrame with GEO information
    df = create_geo_dataframe(geo_ids)

    # Assess relevance
    relevance_results = await assess_relevance_batch_multiple(df, user_query)
    relevance_df = pd.DataFrame(relevance_results)

    # Merge results on a string-typed ID; assign() leaves the input frames unmodified
    df = df.assign(ID=df['ID'].astype(str))
    relevance_df = relevance_df.assign(ID=relevance_df['ID'].astype(str))
    result_df = df.merge(relevance_df, on='ID', how='left')

    # Reorder columns; reindex yields an empty (NaN) column instead of raising KeyError
    # when an expected column is missing from the merged frame
    desired_order = ['ID', 'Title', 'Summary', 'Accession', 'Date', 'Relevance', 'RelevanceConfidence', 'Justification']
    result_df = result_df.reindex(columns=desired_order)

    # Reset index
    return result_df.reset_index(drop=True)


In [6]:
# Test case - term extraction
#search_terms = await (extract_terms_multiple(user_query))
#print(search_terms)

['immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


In [7]:
# Test case - performing search
# (Disabled like the other stage-by-stage test cells; the loop body is commented out together
# with its header, since indented statements without the `for` line raise an IndentationError.)
#geo_ids = []
#for term in tqdm(search_terms, "Performing search for extracted terms"):
#    search_results = perform_search(term)
#    geo_ids.extend(search_results.get('IdList', []))


Performing search for extracted terms: 100%|██████████| 2/2 [00:02<00:00,  1.37s/it]


In [11]:
# Test case - create data frame out of search results
#df = create_geo_dataframe(geo_ids)
#df # Jupyter is quite nice, I don't need to specify some specific function. Just this gives good output.

Processing GEO IDs in batches: 100%|██████████| 5/5 [00:10<00:00,  2.16s/it]


Unnamed: 0,ID,Title,Summary,Accession,Date
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24
1,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,2024/07/24
2,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24
3,200267322,Single-cell atlas of dura cells from non-tumor...,We performed scRNA-seq on the cells isolated f...,GSE267322,2024/07/24
4,200246173,Comparison of the response of Col-0 and ate1 a...,Arabidopsis mutants of the N-degron pathway ha...,GSE246173,2024/07/24
...,...,...,...,...,...
95,200244683,Digital transformation of herbal medicine: Con...,This study aimed to identify transcript expres...,GSE244683,2024/06/21
96,200244682,Digital transformation of herbal medicine: Con...,This study aimed to identify transcript expres...,GSE244682,2024/06/21
97,200236720,Synthetic essentiality of thymine DNA glycosyl...,Thymine DNA Glycosylase (TDG) is a versatile p...,GSE236720,2024/06/21
98,200236719,Synthetic essentiality of thymine DNA glycosyl...,Thymine DNA Glycosylase (TDG) is a versatile p...,GSE236719,2024/06/21


In [30]:
# Test case - assessing dataset relevances

#relevance_results = assess_relevance_batch(df, user_query)
#relevance_df = pd.DataFrame(relevance_results)
#df['ID'] = df['ID'].astype(str)
#relevance_df['ID'] = relevance_df['ID'].astype(str)

#result_df = df.merge(relevance_df, on='ID', how='left')
#result_df # Later we will sort the columns, and probably the rows too.

Processing batches: 100%|██████████| 4/4 [00:56<00:00, 14.10s/it]


Unnamed: 0,ID,Title,Summary,Accession,Date,Relevance,Justification
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24,Not Relevant,This dataset focuses on melanoma rather than l...
1,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,2024/07/24,Not Relevant,"Similar to dataset 200270741, this dataset con..."
2,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24,Relevant,This dataset analyzes macrophages in a model o...
3,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24,Not Relevant,This dataset focuses on macrophages' effect in...
4,200267322,Single-cell atlas of dura cells from non-tumor...,We performed scRNA-seq on the cells isolated f...,GSE267322,2024/07/24,Relevant,By performing single-cell RNA sequencing on ce...
...,...,...,...,...,...,...,...
79,200270541,Mechanisms of response and tolerance to active...,Resistance to inactive state-selective RASG12C...,GSE270541,2024/06/27,Relevant,Focuses on RAS inhibition mechanisms in KRAS-m...
80,200256236,Comparison of the expression profile between P...,Lung cancer is one of the most commonly diagno...,GSE256236,2024/06/26,Relevant,The research on PDLIM2 highlights its role in ...
81,200252587,Regulatory role of echinochrome A in cancer-as...,"Echinochrome A (Ech A), a marine biosubstance ...",GSE252587,2024/06/26,Relevant,The insights on how Echinochrome A impacts lun...
82,200242951,Contribution of crosstalk of mesothelial and t...,Tumor metastasis commonly affects pleura in ad...,GSE242951,2024/06/26,Relevant,This dataset's focus on cell crosstalk and ple...


In [18]:
# Test case - full pipeline: run every stage end to end for the example query

# main() is defined with `async def`, so its coroutine is awaited directly in the cell
result_df = await (main(user_query))
# Bare last expression so the notebook renders the resulting DataFrame
result_df

Extracted terms: ['immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']
Extracted terms: ['immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']
Extracted terms: ['immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']
Extracted terms: ['immunotherapies AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


Processing GEO IDs in batches: 100%|██████████| 2/2 [00:04<00:00,  2.12s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:29<00:00, 14.80s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:34<00:00, 17.20s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:26<00:00, 13.38s/it]


Unnamed: 0,ID,Title,Summary,Accession,Date,Relevance,RelevanceConfidence,Justification
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24,Not Relevant,3/3,The study focuses on melanoma and its resistan...
1,200245808,HMGA1 causes a global shift in chromatin archi...,This SuperSeries is composed of the SubSeries ...,GSE245808,2024/07/08,Not Relevant,3/3,This dataset discusses chromatin architecture ...
2,200268242,Anti-tumor Efficacy of HRS-4642 and Its Potent...,KRAS G12D is the most frequently mutated oncog...,GSE268242,2024/07/12,Not Relevant,3/3,The research is centered around KRAS G12D-muta...
3,200261624,Gene expression profile of all CD4 T cells fro...,The discovery of naïve T cell transcriptional ...,GSE261624,2024/07/25,Not Relevant,3/3,This dataset primarily deals with CD4 T cells ...
4,200270084,Genome-wide Methylation Patterns in Primary Uv...,Despite studies highlighting the prognostic ut...,GSE270084,2024/07/24,Not Relevant,3/3,The focus of this study is on uveal melanoma a...
5,200268049,The effect of exposure to neighborhood violenc...,"Despite lower rates and intensity of smoking, ...",GSE268049,2024/07/10,Relevant,3/3,It investigates lung tumors and the effects of...
6,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24,Not Relevant,3/3,This dataset focuses on macrophages in leptome...
7,200269782,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,GSE269782,2024/07/16,Relevant,3/3,This dataset analyzes the role of IL-6 glycosy...
8,200234818,Faecalibaterium prausnitzii strain EXL01 boost...,Gut microbiota impacts responses to immune che...,GSE234818,2024/07/10,Relevant,3/3,It discusses the impact of gut microbiota on i...
9,200268048,The effect of exposure to neighborhood violenc...,"Despite lower rates and intensity of smoking, ...",GSE268048,2024/07/10,Relevant,3/3,This dataset also examines the effects of neig...


In [19]:
# Save the pipeline results. The dated results folder is created first, because to_csv
# raises an OSError when the target directory does not exist.
output_path = "../results/2024_07_25_DetermineDatasetRelevance/DatasetRelevance.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_df.to_csv(output_path)