In [28]:
# Modules
from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
from Bio import Entrez
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List
import asyncio
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
# from typing import Literal
import statistics

In [2]:
load_dotenv('../.env')

Entrez.email = os.getenv('ENTREZ_EMAIL')
Entrez.api_key = os.getenv('ENTREZ_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Test the OpenAI API key is working

client = OpenAI(
  api_key=openai_api_key,  # this is also the default, it can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "What is the capital city of Victoria? Answer with only the name of the city",
        }
    ],
    model="gpt-4o-mini",
)

result = chat_completion.choices[0].message.content
print(result)

Melbourne


In [4]:
# Prepare dummy variables for testing purposes

user_query = "Identify datasets and samples which are relevant to exploring immunotherapies for lung cancer"

In [41]:
# Prepare functions for term extraction from research query

# Prepare output structures
## For term extraction from user query
class ExtractedTerms(BaseModel):
    extracted_terms: List[str] = Field(description="List of terms extracted from the query")
    expanded_terms: List[str] = Field(description="List of related terms generated from the extracted terms")

## For determining dataset relevance
class Assessment(BaseModel):
    ID: str
    RelevanceScore: int = Field(description="Score from 0 to 10, indicating relevance")
    Justification: str = Field(description="A brief explanation for the score")

class Assessments(BaseModel):
    assessments: List[Assessment]


# "Patch" the client?
client = instructor.patch(OpenAI(
    api_key=openai_api_key))

# Define function for term extraction 
def extract_terms(user_query: str) -> List[str]:
    prompt = f"""

## IDENTITY AND PURPOSE
You are an expert in literature searches of biological ideas. Your task is to identify biological term(s) from a query, and generate related terms for the purposes of generating a search query. 

## STEPS

- First, extract the biological term(s) from the input query. These should be specific and fall into one of the following categories:
1. Genes - Examples: BRCA1, TP53
2. Treatments/Methods - Examples: chemotherapy, CRISPR
3. Tissues/Cells - Examples: lung, hepatocytes
4. Diseases - Examples: Alzheimer's disease, lung cancer.

Do not fabricate items if no relevant term exists. Avoid general terms such as "disease" or "variant."

- Second, for each extracted biological term, generate two related terms. Make a considered effort to keep these terms in the same category as the original term. These are examples of an identified term, and possible relevant terms:
1. Genes: BRCA1 - Examples: BRCA2, oncogene
2. Treatments: Chemotherapy - Examples: radiotherapy, monoclonal antibody
3. Tissues/Cells: Lung - Examples: respiratory, alveoli
4. Diseases: Alzheimer's disease - Examples: dementia, amyloid plaque

## OUTPUT

Provide two lists:
1. Extracted terms: The primary terms identified directly from the query.
2. Expanded terms: The related terms generated from the extracted terms.
Do not include categories or justifications.

## INPUT
User query: {user_query}"""

    extracted_terms = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=1,
        response_model=ExtractedTerms,
        messages=[
            {"role": "system", "content": "You are an assistant that helps to extract and expand terms for better search results."},
            {"role": "user", "content": prompt}
        ]
    )
    
    print(f"Raw extracted terms: {extracted_terms.extracted_terms}")
    print(f"Raw expanded terms: {extracted_terms.expanded_terms}")
    
    all_terms = extracted_terms.extracted_terms + extracted_terms.expanded_terms
    terms_with_filter = [term + ' AND "gse"[Filter]' for term in all_terms]
    return terms_with_filter

# Extension - define function to perform term extraction multiple times
async def extract_terms_multiple(user_query: str, num_queries: int = 3) -> List[str]:
    async def single_extract():
        return extract_terms(user_query)
    
    tasks = [single_extract() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Flatten the list of lists and remove duplicates
    all_terms = list(set([term for sublist in results for term in sublist]))
    return all_terms

# Define function for performing search
def perform_search(term):
    search_handle = Entrez.esearch(db="gds", term=term, retmode="xml", retmax = 5) # CHANGE
    search_results = Entrez.read(search_handle)
    search_handle.close()
    return search_results

# Define function for extracting information from above search results
def extract_geo_info_batch(geo_ids):
    """
    Retrieve GEO information for a batch of GEO IDs.
    """
    ids_str = ",".join(geo_ids)
    handle = Entrez.esummary(db="gds", id=ids_str, retmode="xml")
    output = Entrez.read(handle)
    handle.close()

    data = []
    for geo_id, geo_data in zip(geo_ids, output):
        if isinstance(geo_data, dict):
            data.append({
                'ID': geo_id,
                'Title': geo_data.get('title', 'No title available'),
                'Summary': geo_data.get('summary', 'No summary available'),
                'Accession': geo_data.get('Accession', 'No accession available'),
                'Date': geo_data.get('PDAT', 'Date made public unknown')
            })
        else:
            data.append({'ID': geo_id, 'Title': 'Error', 'Summary': 'Unable to fetch data', 'Accession': 'Error'})

    return data

# Define function for constructing data frame out of extracted results
def create_geo_dataframe(geo_ids, batch_size=20):
    """
    Retrieve GEO information for all provided GEO IDs in batches.
    """
    data = []
    for i in tqdm(range(0, len(geo_ids), batch_size), desc="Processing GEO IDs in batches"):
        batch_ids = geo_ids[i:i + batch_size]
        data.extend(extract_geo_info_batch(batch_ids))
        time.sleep(0.2)  # Be nice to NCBI servers
    return pd.DataFrame(data)

# Define function for determining relevance of datasets
def assess_relevance_batch(df, query, batch_size=20):
    results = []
    total_batches = (len(df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(df), batch_size), desc="Determining dataset relevance", total=total_batches):
        batch = df.iloc[i:i+batch_size]
        prompt = f"""
## IDENTITY AND PURPOSE

You are a highly knowledgeable biologist tasked with identifying relevant datasets for a given research query. Your goal is to assess NCBI GEO datasets based on their titles and summaries, and determine their relevance to the research question at hand.

## STEPS

1. For each dataset, carefully analyze the provided title and summary.
2. Extract ALL biological concepts represented in the dataset, including but not limited to:
   - Genes and variants investigated (e.g., p53, BRCA1)
   - Species studied (e.g., Homo sapiens, Escherichia coli)
   - Sample sources (e.g., organoid cultures, human samples)
   - Diseases or phenotypes studied (e.g., Alzheimer's disease, lung cancer)
   - Cell types or tissues examined (e.g., lung tissue, neural progenitor cells)
   - Experimental techniques or methodologies used (e.g., RNA-seq, ChIP-seq)
3. Extract ALL biological concepts represented in the research query using the same categories.
4. Assign a relevance score from 0 to 10 in increments of 1, based solely on the provided information. 
   - Do not fabricate or assume information not explicitly stated about the dataset. 
   - If confirmed information about the dataset is POSSIBLY useful for the research question, view this favourably for determining dataset relevance. 
   - Note that the gene, disease, and cell type/tissue being studied are the most important in determining relevance.
   - Use the following scoring guide:

   0: No relevance. All biological concepts (genes, species, samples, diseases, cell types, methods) are completely unrelated to the research query.
   1: Minimal relevance. One minor aspect is loosely related, but the overall focus is different.
   2: Low relevance. One major aspect aligns with the query, but other key elements differ significantly.
   3: Somewhat low relevance. Two aspects align, but critical elements are still mismatched.
   4: Moderate relevance. Multiple aspects align, but there are still significant differences in focus or approach.
   5: Moderately relevant. Most major aspects align, but there are some notable differences that may limit direct applicability.
   6: Relevant. All major aspects align, but there might be differences in specific genes, cell types, or methodologies that somewhat reduce direct applicability.
   7: Highly relevant. Very close alignment in all major aspects, with only minor differences that don't significantly impact applicability.
   8: Very highly relevant. Near-perfect alignment in all major aspects, with at most one or two minor differences.
   9: Extremely relevant. Perfect alignment in all major aspects, with at most one negligible difference.
   10: Perfectly relevant. The dataset appears to be an exact match for the research query in all aspects.

5. Provide a brief justification (4-5 sentences) for the assigned score, highlighting key similarities and differences.

## OUTPUT
For each dataset, provide a JSON object with the ID, relevance score, and justification. Do not include any other information.

## HANDLING LIMITED INFORMATION
If the dataset title or summary lacks sufficient detail:
- Focus on the information that is available
- Do not make assumptions about missing information
- Assign a lower score if critical information is absent
- Note the lack of information in the justification

Remember, it's better to assign a lower score due to lack of information than to assume relevance without evidence.

Given the following datasets and query, determine if each dataset is relevant.
        Query: {query}
        Datasets:
        """
        for _, row in batch.iterrows():
            prompt += f"""
            ID: {row['ID']}
            Title: {row['Title']}
            Summary: {row['Summary']}
            """
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.3,
                response_model=Assessments,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that determines dataset relevance and responds in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=3000
            )
            results.extend([assessment.dict() for assessment in response.assessments])
        except Exception as e:
            results.extend([{"ID": row['ID'], "Relevance": "Error", "Justification": str(e)} for _, row in batch.iterrows()])
        time.sleep(1)  # Be nice to the API
    return results

# Extension - define function to assess relevance multiple times
async def assess_relevance_batch_multiple(df, query, num_queries: int = 3, batch_size=20):
    async def single_assess():
        return assess_relevance_batch(df, query, batch_size)
    
    tasks = [single_assess() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Collate results
    collated_results = {}
    for i, result_set in enumerate(results):
        for assessment in result_set:
            id = assessment['ID']
            if id not in collated_results:
                collated_results[id] = {'scores': [], 'justifications': []}
            collated_results[id]['scores'].append(assessment['RelevanceScore'])
            collated_results[id]['justifications'].append(f"Attempt {i+1}: Score {assessment['RelevanceScore']} - {assessment['Justification']}")
    
    # Determine final relevance and format output
    final_results = []
    for id, data in collated_results.items():
        mean_score = statistics.mean(data['scores'])
        
        result = {
            'ID': id,
            'RelevanceScore': round(mean_score, 1),
        }
        
        # Add individual scores and justifications
        for i in range(num_queries):
            result[f'IndividualScore{i+1}'] = data['scores'][i] if i < len(data['scores']) else None
            result[f'Justification{i+1}'] = data['justifications'][i] if i < len(data['justifications']) else None
        
        final_results.append(result)
    
    return final_results

async def main(user_query):
    # Extract terms
    search_terms = await extract_terms_multiple(user_query)
    print("Extracted terms:", search_terms)

    # Perform Entrez search and remove duplicates
    geo_ids = set()  # Use a set to automatically remove duplicates
    for term in search_terms:
        search_results = perform_search(term)
        geo_ids.update(search_results.get('IdList', []))  # Update the set with new IDs
    if not geo_ids:
        return pd.DataFrame({'Error': ["No results found for the extracted terms"]})

    # Convert set back to list
    geo_ids = list(geo_ids)

    # Create DataFrame with GEO information
    df = create_geo_dataframe(geo_ids)

    # Assess relevance
    relevance_results = await assess_relevance_batch_multiple(df, user_query, num_queries=num_queries)
    relevance_df = pd.DataFrame(relevance_results)

    # Merge results
    df['ID'] = df['ID'].astype(str)
    relevance_df['ID'] = relevance_df['ID'].astype(str)
    result_df = df.merge(relevance_df, on='ID', how='left')

    # Dynamically create the desired order of columns
    base_columns = ['ID', 'Title', 'Summary', 'Accession', 'Date', 'RelevanceScore']
    score_columns = [f'IndividualScore{i+1}' for i in range(num_queries)]
    justification_columns = [f'Justification{i+1}' for i in range(num_queries)]
    desired_order = base_columns + score_columns + justification_columns

    # Reorder columns
    result_df = result_df[desired_order]

    # Reset index
    result_df = result_df.reset_index(drop=True)

    return result_df

In [39]:
# Test case - term extraction
search_terms = await (extract_terms_multiple(user_query))
print(search_terms)

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'immune response', 'lung carcinoma']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'lung carcinoma']
['immunotherapies AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]', 'CAR T-cell therapy AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'small cell lung cancer AND "gse"[Filter]', 'lung carcinoma AND "gse"[Filter]', 'immune response AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


In [40]:
# Test case - performing search
geo_ids = []
for term in tqdm(search_terms, "Performing search for extracted terms"):
    search_results = perform_search(term)
    geo_ids.extend(search_results.get('IdList', []))


Performing search for extracted terms: 100%|██████████| 8/8 [00:12<00:00,  1.56s/it]


In [11]:
# Test case - create data frame out of search results
df = create_geo_dataframe(geo_ids)
df # Jupyter is quite nice, I don't need to specify some specific function. Just this gives good output.

Processing GEO IDs in batches: 100%|██████████| 5/5 [00:10<00:00,  2.16s/it]


Unnamed: 0,ID,Title,Summary,Accession,Date
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24
1,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,2024/07/24
2,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24
3,200267322,Single-cell atlas of dura cells from non-tumor...,We performed scRNA-seq on the cells isolated f...,GSE267322,2024/07/24
4,200246173,Comparison of the response of Col-0 and ate1 a...,Arabidopsis mutants of the N-degron pathway ha...,GSE246173,2024/07/24
...,...,...,...,...,...
95,200244683,Digital transformation of herbal medicine: Con...,This study aimed to identify transcript expres...,GSE244683,2024/06/21
96,200244682,Digital transformation of herbal medicine: Con...,This study aimed to identify transcript expres...,GSE244682,2024/06/21
97,200236720,Synthetic essentiality of thymine DNA glycosyl...,Thymine DNA Glycosylase (TDG) is a versatile p...,GSE236720,2024/06/21
98,200236719,Synthetic essentiality of thymine DNA glycosyl...,Thymine DNA Glycosylase (TDG) is a versatile p...,GSE236719,2024/06/21


In [33]:
df2 = df[0:25]
df2

Unnamed: 0,ID,Title,Summary,Accession,Date
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24
1,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,2024/07/24
2,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24
3,200267322,Single-cell atlas of dura cells from non-tumor...,We performed scRNA-seq on the cells isolated f...,GSE267322,2024/07/24
4,200246173,Comparison of the response of Col-0 and ate1 a...,Arabidopsis mutants of the N-degron pathway ha...,GSE246173,2024/07/24
5,200241235,A novel adjuvant formulation induces robust Th...,"After clean drinking water, vaccination is the...",GSE241235,2024/07/24
6,200235093,Pre-existing skin-resident CD8 and γδ T cell c...,This SuperSeries is composed of the SubSeries ...,GSE235093,2024/07/24
7,200235092,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressive s...,GSE235092,2024/07/24
8,200235091,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressie sk...,GSE235091,2024/07/24
9,200235090,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressie sk...,GSE235090,2024/07/24


In [36]:
# Test case - assessing dataset relevances

relevance_results = await assess_relevance_batch_multiple(df2, user_query)
relevance_df = pd.DataFrame(relevance_results)

# Merge results
df2['ID'] = df2['ID'].astype(str)
relevance_df['ID'] = relevance_df['ID'].astype(str)
result_df = df2.merge(relevance_df, on='ID', how='left')

# Reorder columns
desired_order = ['ID', 'Title', 'Summary', 'Accession', 'Date', 'RelevanceScore', 'IndividualScores', 'Justifications']
result_df = result_df[desired_order]

# Reset index
result_df = result_df.reset_index(drop=True)
result_df

Determining dataset relevance: 100%|██████████| 2/2 [00:22<00:00, 11.25s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:26<00:00, 13.45s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:38<00:00, 19.41s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['ID'] = df2['ID'].astype(str)


NameError: name 'reuslt_df' is not defined

In [37]:
result_df

Unnamed: 0,ID,Title,Summary,Accession,Date,RelevanceScore,IndividualScores,Justifications
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,2024/07/24,1.0,"[1, 1, 1]",Attempt 1: Score 1 - The dataset focuses on me...
1,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,2024/07/24,1.0,"[1, 1, 1]",Attempt 1: Score 1 - Similar to the previous d...
2,200267325,RNA profiling of CSF resident macrophages and ...,We investigated the functional effect of dura-...,GSE267325,2024/07/24,7.0,"[6, 6, 9]",Attempt 1: Score 6 - This dataset analyzes mac...
3,200267322,Single-cell atlas of dura cells from non-tumor...,We performed scRNA-seq on the cells isolated f...,GSE267322,2024/07/24,6.0,"[5, 5, 8]",Attempt 1: Score 5 - The study involves single...
4,200246173,Comparison of the response of Col-0 and ate1 a...,Arabidopsis mutants of the N-degron pathway ha...,GSE246173,2024/07/24,0.7,"[0, 0, 2]",Attempt 1: Score 0 - This dataset features Ara...
5,200241235,A novel adjuvant formulation induces robust Th...,"After clean drinking water, vaccination is the...",GSE241235,2024/07/24,1.0,"[1, 0, 2]",Attempt 1: Score 1 - While this study involves...
6,200235093,Pre-existing skin-resident CD8 and γδ T cell c...,This SuperSeries is composed of the SubSeries ...,GSE235093,2024/07/24,5.0,"[4, 5, 6]",Attempt 1: Score 4 - This dataset pertains to ...
7,200235092,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressive s...,GSE235092,2024/07/24,5.0,"[4, 5, 6]",Attempt 1: Score 4 - Similar to the previous d...
8,200235091,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressie sk...,GSE235091,2024/07/24,5.0,"[4, 5, 6]",Attempt 1: Score 4 - This dataset also focuses...
9,200235090,Pre-existing skin-resident CD8 and γδ T cell c...,Merkel Cell Carcinoma (MCC) is an aggressie sk...,GSE235090,2024/07/24,5.0,"[4, 5, 6]",Attempt 1: Score 4 - This study is aligned wit...


In [31]:
# Test case - full pipeline

result_df = await (main(user_query))
result_df

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'lung tumor microenvironment']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'immune response', 'lung tumor']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'lung tumor microenvironment']
Extracted terms: ['lung tumor microenvironment AND "gse"[Filter]', 'immunotherapies AND "gse"[Filter]', 'CAR T-cell therapy AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'immune response AND "gse"[Filter]', 'lung tumor AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


URLError: <urlopen error [Errno -2] Name or service not known>

In [38]:
result_df.to_csv("../results/2024_07_25_DetermineDatasetRelevance/DatasetRelevance_MultipleScores.csv")