In [28]:
# Modules
from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
from Bio import Entrez
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List
import asyncio
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
# from typing import Literal
import statistics

In [2]:
load_dotenv('../.env')

Entrez.email = os.getenv('ENTREZ_EMAIL')
Entrez.api_key = os.getenv('ENTREZ_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Test the OpenAI API key is working

client = OpenAI(
  api_key=openai_api_key,  # this is also the default, it can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "What is the capital city of Victoria? Answer with only the name of the city",
        }
    ],
    model="gpt-4o-mini",
)

result = chat_completion.choices[0].message.content
print(result)

Melbourne


In [46]:
# Prepare dummy variables for testing purposes

user_query = "Identify datasets and samples which are relevant to exploring immunotherapies for lung cancer"
num_queries = 3

In [74]:
# Prepare functions for term extraction from research query

# Prepare output structures
## For term extraction from user query
class ExtractedTerms(BaseModel):
    extracted_terms: List[str] = Field(description="List of terms extracted from the query")
    expanded_terms: List[str] = Field(description="List of related terms generated from the extracted terms")

## For determining dataset relevance
class Assessment(BaseModel):
    ID: str
    RelevanceScore: int = Field(description="Score from 0 to 10, indicating relevance")
    Justification: str = Field(description="A brief explanation for the score")

class Assessments(BaseModel):
    assessments: List[Assessment]


# "Patch" the client?
client = instructor.patch(OpenAI(
    api_key=openai_api_key))

# Define function for term extraction 
def extract_terms(user_query: str) -> List[str]:
    prompt = f"""

## IDENTITY AND PURPOSE
You are an expert in literature searches of biological ideas. Your task is to identify biological term(s) from a query, and generate related terms for the purposes of generating a search query. 

## STEPS

- First, extract the biological term(s) from the input query. These should be specific and fall into one of the following categories:
1. Genes - Examples: BRCA1, TP53
2. Treatments/Methods - Examples: chemotherapy, CRISPR
3. Tissues/Cells - Examples: lung, hepatocytes
4. Diseases - Examples: Alzheimer's disease, lung cancer.

Do not fabricate items if no relevant term exists. Avoid general terms such as "disease" or "variant."

- Second, for each extracted biological term, generate two related terms. Make a considered effort to keep these terms in the same category as the original term. These are examples of an identified term, and possible relevant terms:
1. Genes: BRCA1 - Examples: BRCA2, oncogene
2. Treatments: Chemotherapy - Examples: radiotherapy, monoclonal antibody
3. Tissues/Cells: Lung - Examples: respiratory, alveoli
4. Diseases: Alzheimer's disease - Examples: dementia, amyloid plaque

## OUTPUT

Provide two lists:
1. Extracted terms: The primary terms identified directly from the query.
2. Expanded terms: The related terms generated from the extracted terms.
Do not include categories or justifications.

## INPUT
User query: {user_query}"""

    extracted_terms = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=1,
        response_model=ExtractedTerms,
        messages=[
            {"role": "system", "content": "You are an assistant that helps to extract and expand terms for better search results."},
            {"role": "user", "content": prompt}
        ]
    )
    
    print(f"Raw extracted terms: {extracted_terms.extracted_terms}")
    print(f"Raw expanded terms: {extracted_terms.expanded_terms}")
    
    all_terms = extracted_terms.extracted_terms + extracted_terms.expanded_terms
    terms_with_filter = [term + ' AND "gse"[Filter]' for term in all_terms]
    return terms_with_filter

# Extension - define function to perform term extraction multiple times
async def extract_terms_multiple(user_query: str, num_queries: int = 3) -> List[str]:
    async def single_extract():
        return extract_terms(user_query)
    
    tasks = [single_extract() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Flatten the list of lists and remove duplicates
    all_terms = list(set([term for sublist in results for term in sublist]))
    return all_terms

# Define function for performing search
def perform_search(term):
    search_handle = Entrez.esearch(db="gds", term=term, retmode="xml", retmax = 15) # CHANGE
    search_results = Entrez.read(search_handle)
    search_handle.close()
    return search_results

# Define function for extracting information from above search results
def extract_geo_info_batch(geo_ids):
    """
    Retrieve GEO information for a batch of GEO IDs.
    """
    ids_str = ",".join(geo_ids)
    handle = Entrez.esummary(db="gds", id=ids_str, retmode="xml")
    output = Entrez.read(handle)
    handle.close()

    data = []
    for geo_id, geo_data in zip(geo_ids, output):
        if isinstance(geo_data, dict):
            data.append({
                'ID': geo_id,
                'Title': geo_data.get('title', 'No title available'),
                'Summary': geo_data.get('summary', 'No summary available'),
                'Accession': geo_data.get('Accession', 'No accession available'),
                'Species': geo_data.get('taxon', 'No taxon available'),
                'Date': geo_data.get('PDAT', 'Date made public unknown')
            })
        else:
            data.append({'ID': geo_id, 'Title': 'Error', 'Summary': 'Unable to fetch data', 'Accession': 'Error'})

    return data

def create_geo_dataframe(geo_ids, batch_size=10):
    """Create a DataFrame from GEO search results using batch processing."""
    data = []
    for i in tqdm(range(0, len(geo_ids), batch_size), desc="Processing GEO IDs in batches"):
        batch_ids = geo_ids[i:i + batch_size]
        data.extend(extract_geo_info_batch(batch_ids))
        time.sleep(0.2)  # Be nice to NCBI servers
    return pd.DataFrame(data)


# Define function for determining relevance of datasets
def assess_relevance_batch(df, query, batch_size=10):
    results = []
    total_batches = (len(df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(df), batch_size), desc="Determining dataset relevance", total=total_batches):
        batch = df.iloc[i:i+batch_size]
        prompt = f"""
## IDENTITY AND PURPOSE

You are a highly knowledgeable biologist tasked with identifying relevant datasets for a given research query. Your goal is to assess NCBI GEO datasets based on their titles and summaries, and determine their relevance to the research question at hand.

## STEPS

1. For each dataset, carefully analyze the provided title and summary.
2. Extract ALL biological concepts represented in the dataset, including but not limited to:
   - Genes and variants investigated (e.g., p53, BRCA1)
   - Species studied (e.g., Homo sapiens, Escherichia coli)
   - Sample sources (e.g., organoid cultures, human samples)
   - Diseases or phenotypes studied (e.g., Alzheimer's disease, lung cancer)
   - Cell types or tissues examined (e.g., lung tissue, neural progenitor cells)
   - Experimental techniques or methodologies used (e.g., RNA-seq, ChIP-seq)
3. Extract ALL biological concepts represented in the research query using the same categories.
4. Assign a relevance score from 0 to 10 in increments of 1, based solely on the provided information. 
   - Do not fabricate or assume information not explicitly stated about the dataset. 
   - If confirmed information about the dataset is POSSIBLY useful for the research question, view this favourably for determining dataset relevance. 
   - Note that the gene, disease, and cell type/tissue being studied are the most important in determining relevance. The other factors are considered minor aspects.
   - Use the following scoring guide:

   0: No relevance. All biological concepts (genes, species, samples, diseases, cell types, methods) are completely unrelated to the research query.
   1: Minimal relevance. One minor aspect is loosely related, but the overall focus is different.
   2: Low relevance. One major aspect aligns with the query, but other key elements differ significantly.
   3: Somewhat low relevance. Two aspects align, but critical elements are still mismatched.
   4: Moderate relevance. Multiple aspects align, but there are still significant differences in focus or approach.
   5: Moderately relevant. Most major aspects align, but there are some notable differences that may limit direct applicability.
   6: Relevant. All major aspects align, but there might be differences in specific genes, cell types, or methodologies that somewhat reduce direct applicability.
   7: Highly relevant. Very close alignment in all major aspects, with only minor differences that don't significantly impact applicability.
   8: Very highly relevant. Near-perfect alignment in all major aspects, with at most one or two minor differences.
   9: Extremely relevant. Perfect alignment in all major aspects, with at most one negligible difference.
   10: Perfectly relevant. The dataset appears to be an exact match for the research query in all aspects.

5. Provide a brief justification (3-4 sentences) for the assigned score, highlighting key similarities and differences.

## OUTPUT
For each dataset, provide a JSON object with the ID, relevance score, and justification. 
- The relevance score should be a number, with no other information.
- The justification should be a 4-5 sentence explanation for the relevance score. 

## HANDLING LIMITED INFORMATION
If the dataset title or summary lacks sufficient detail:
- Focus on the information that is available
- Do not make assumptions about missing information
- Assign a lower score if critical information is absent
- Note the lack of information in the justification

Remember, it's better to assign a lower score due to lack of information than to assume relevance without evidence.

Given the following datasets and query, determine if each dataset is relevant.
        Query: {query}
        Datasets:
        """
        for _, row in batch.iterrows():
            prompt += f"""
            ID: {row['ID']}
            Title: {row['Title']}
            Summary: {row['Summary']}
            Species: {row['Species']}
            """
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.3,
                response_model=Assessments,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that determines dataset relevance and responds in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=5000
            )
            results.extend([assessment.dict() for assessment in response.assessments])
        except Exception as e:
            results.extend([{"ID": row['ID'], "Relevance": "Error", "Justification": str(e)} for _, row in batch.iterrows()])
        time.sleep(1)  # Be nice to the API
    return results

# Extension - define function to assess relevance multiple times
async def assess_relevance_batch_multiple(df, query, num_queries: int = 3, batch_size=20):
    async def single_assess():
        return assess_relevance_batch(df, query, batch_size)
    
    tasks = [single_assess() for _ in range(num_queries)]
    results = await asyncio.gather(*tasks)
    
    # Collate results
    collated_results = {}
    for i, result_set in enumerate(results):
        for assessment in result_set:
            id = assessment['ID']
            if id not in collated_results:
                collated_results[id] = {'scores': [], 'justifications': []}
            collated_results[id]['scores'].append(assessment['RelevanceScore'])
            collated_results[id]['justifications'].append(assessment['Justification'])
    
    # Determine final relevance and format output
    final_results = []
    for id, data in collated_results.items():
        mean_score = statistics.mean(data['scores'])
        
        result = {
            'ID': id,
            'RelevanceScore': round(mean_score, 1),
        }
        
        # Add individual scores and justifications
        for i in range(num_queries):
            result[f'IndividualScore{i+1}'] = data['scores'][i] if i < len(data['scores']) else None
            result[f'Justification{i+1}'] = data['justifications'][i] if i < len(data['justifications']) else None
        
        final_results.append(result)
    
    return final_results

async def main(user_query):
    # Extract terms
    search_terms = await extract_terms_multiple(user_query)
    print("Extracted terms:", search_terms)

    # Perform Entrez search and remove duplicates
    geo_ids = set()  # Use a set to automatically remove duplicates
    for term in search_terms:
        search_results = perform_search(term)
        geo_ids.update(search_results.get('IdList', []))  # Update the set with new IDs
    if not geo_ids:
        return pd.DataFrame({'Error': ["No results found for the extracted terms"]})

    # Convert set back to list
    geo_ids = list(geo_ids)

    # Create DataFrame with GEO information
    df = create_geo_dataframe(geo_ids)

    # Assess relevance
    relevance_results = await assess_relevance_batch_multiple(df, user_query, num_queries=num_queries) # Currently have this at 3
    relevance_df = pd.DataFrame(relevance_results)

    # Merge results
    df['ID'] = df['ID'].astype(str)
    relevance_df['ID'] = relevance_df['ID'].astype(str)
    result_df = df.merge(relevance_df, on='ID', how='left')

    # Dynamically create the desired order of columns
    base_columns = ['ID', 'Title', 'Summary', 'Species', 'Accession', 'Date', 'RelevanceScore']
    score_columns = [f'IndividualScore{i+1}' for i in range(num_queries)]
    justification_columns = [f'Justification{i+1}' for i in range(num_queries)]
    desired_order = base_columns + score_columns + justification_columns

    # Reorder columns
    result_df = result_df[desired_order]

    # Reset index
    result_df = result_df.reset_index(drop=True)

    return result_df

In [39]:
# Test case - term extraction
search_terms = await (extract_terms_multiple(user_query))
print(search_terms)

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'immune response', 'lung carcinoma']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'CAR T-cell therapy', 'lung carcinoma']
['immunotherapies AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]', 'CAR T-cell therapy AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'small cell lung cancer AND "gse"[Filter]', 'lung carcinoma AND "gse"[Filter]', 'immune response AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


In [56]:
# Test case - performing search
geo_ids = []
for term in tqdm(search_terms, "Performing search for extracted terms"):
    search_results = perform_search(term)
    geo_ids.extend(search_results.get('IdList', []))


Performing search for extracted terms: 100%|██████████| 8/8 [00:08<00:00,  1.12s/it]


In [62]:
# Test case - create data frame out of search results
df = create_geo_dataframe(geo_ids)
df # Jupyter is quite nice, I don't need to specify some specific function. Just this gives good output.

Processing GEO IDs in batches: 100%|██████████| 4/4 [00:07<00:00,  1.92s/it]


Unnamed: 0,ID,Title,Summary,Accession,Species,Date
0,200269394,Long-lived central memory gamma delta T cells ...,The involvement of γδ TCR-bearing lymphocytes ...,GSE269394,Mus musculus,2024/07/25
1,200261624,Gene expression profile of all CD4 T cells fro...,The discovery of naïve T cell transcriptional ...,GSE261624,Mus musculus,2024/07/25
2,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270741,Homo sapiens,2024/07/24
3,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,GSE270740,Homo sapiens,2024/07/24
4,200270084,Genome-wide Methylation Patterns in Primary Uv...,Despite studies highlighting the prognostic ut...,GSE270084,Homo sapiens,2024/07/24
5,200239389,Prolyl hydroxylase domain enzyme PHD2 inhibits...,Prolyl hydroxylase domain protein 2 (PHD2) is ...,GSE239389,Homo sapiens,2024/07/17
6,200272045,Identification of mutant KRAS-related genes as...,Oncogenic KRAS is found in more than 25% of lu...,GSE272045,Homo sapiens,2024/07/16
7,200269782,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,GSE269782,Homo sapiens,2024/07/16
8,200218225,CRISPR screen of radiation resistant genes in ...,Radiotherapy is an important treatment for non...,GSE218225,Homo sapiens,2024/07/12
9,200234818,Faecalibaterium prausnitzii strain EXL01 boost...,Gut microbiota impacts responses to immune che...,GSE234818,Mus musculus,2024/07/10


In [75]:
# Test case - assessing dataset relevances

relevance_results = await assess_relevance_batch_multiple(df, user_query, num_queries=num_queries) # Currently have this at 3
relevance_df = pd.DataFrame(relevance_results)

# Merge results
df['ID'] = df['ID'].astype(str)
relevance_df['ID'] = relevance_df['ID'].astype(str)
result_df = df.merge(relevance_df, on='ID', how='left')

# Dynamically create the desired order of columns
base_columns = ['ID', 'Title', 'Summary', 'Species', 'Accession', 'Date', 'RelevanceScore']
score_columns = [f'IndividualScore{i+1}' for i in range(num_queries)]
justification_columns = [f'Justification{i+1}' for i in range(num_queries)]
desired_order = base_columns + score_columns + justification_columns

# Reorder columns
result_df = result_df[desired_order]

# Reset index
result_df = result_df.reset_index(drop=True)

result_df # Based on the output I'm assuming the token limit is too low

Determining dataset relevance: 100%|██████████| 2/2 [00:38<00:00, 19.16s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:43<00:00, 21.96s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:45<00:00, 22.91s/it]


Unnamed: 0,ID,Title,Summary,Species,Accession,Date,RelevanceScore,IndividualScore1,IndividualScore2,IndividualScore3,Justification1,Justification2,Justification3
0,200269394,Long-lived central memory gamma delta T cells ...,The involvement of γδ TCR-bearing lymphocytes ...,Mus musculus,GSE269394,2024/07/25,1.8,2,2.0,2.0,The dataset focuses on γδ T cells and their ro...,This dataset investigates γδ T cells in a muri...,This dataset focuses on γδ T cells and their r...
1,200261624,Gene expression profile of all CD4 T cells fro...,The discovery of naïve T cell transcriptional ...,Mus musculus,GSE261624,2024/07/25,1.5,1,1.0,1.0,This dataset examines CD4 T cell responses to ...,This dataset focuses on CD4 T cells in a mouse...,This dataset examines CD4 T cells in the conte...
2,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,Homo sapiens,GSE270741,2024/07/24,6.7,8,8.0,6.0,This dataset investigates TNF-induced dediffer...,This dataset investigates TNF-induced dediffer...,This dataset investigates TNF-induced dediffer...
3,200270740,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,Homo sapiens,GSE270740,2024/07/24,6.7,8,8.0,6.0,"Similar to the previous dataset, this one expl...","Similar to the previous dataset, this one expl...","Similar to dataset 200270741, this dataset exp..."
4,200270084,Genome-wide Methylation Patterns in Primary Uv...,Despite studies highlighting the prognostic ut...,Homo sapiens,GSE270084,2024/07/24,3.0,3,4.0,2.0,This dataset studies DNA methylation patterns ...,This dataset analyzes DNA methylation patterns...,This dataset focuses on DNA methylation patter...
5,200239389,Prolyl hydroxylase domain enzyme PHD2 inhibits...,Prolyl hydroxylase domain protein 2 (PHD2) is ...,Homo sapiens,GSE239389,2024/07/17,7.8,9,7.0,8.0,This dataset focuses on PHD2 in non-small cell...,This dataset investigates the role of PHD2 in ...,This dataset investigates the role of PHD2 in ...
6,200272045,Identification of mutant KRAS-related genes as...,Oncogenic KRAS is found in more than 25% of lu...,Homo sapiens,GSE272045,2024/07/16,8.5,9,8.0,9.0,This dataset investigates KRAS mutations in NS...,This dataset focuses on mutant KRAS in non-sma...,This dataset focuses on mutant KRAS in non-sma...
7,200269782,Defective N-Glycosylation of IL6 Induces Metas...,The biological consequences of various IL-6 gl...,Homo sapiens,GSE269782,2024/07/16,7.0,6,7.0,8.0,This dataset examines the role of IL-6 glycosy...,This dataset explores the role of IL-6 glycosy...,This dataset investigates the role of IL-6 gly...
8,200218225,CRISPR screen of radiation resistant genes in ...,Radiotherapy is an important treatment for non...,Homo sapiens,GSE218225,2024/07/12,8.0,7,8.0,9.0,This dataset investigates radiation resistance...,This dataset examines radiation resistance in ...,This dataset examines resistance to radiothera...
9,200234818,Faecalibaterium prausnitzii strain EXL01 boost...,Gut microbiota impacts responses to immune che...,Mus musculus,GSE234818,2024/07/10,9.0,9,9.0,9.0,This dataset explores the impact of gut microb...,This dataset investigates the impact of gut mi...,This dataset explores the impact of gut microb...


In [69]:
# Test case - full pipeline

result_df = await (main(user_query))
result_df

Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['targeted therapy', 'non-small cell lung cancer', 'immune checkpoint inhibitors', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'NSCLC', 'immune response', 'small cell lung cancer']
Raw extracted terms: ['immunotherapies', 'lung cancer']
Raw expanded terms: ['checkpoint inhibitors', 'non-small cell lung cancer', 'immune response', 'squamous cell lung cancer']
Extracted terms: ['NSCLC AND "gse"[Filter]', 'immunotherapies AND "gse"[Filter]', 'non-small cell lung cancer AND "gse"[Filter]', 'checkpoint inhibitors AND "gse"[Filter]', 'small cell lung cancer AND "gse"[Filter]', 'immune checkpoint inhibitors AND "gse"[Filter]', 'squamous cell lung cancer AND "gse"[Filter]', 'immune response AND "gse"[Filter]', 'targeted therapy AND "gse"[Filter]', 'lung cancer AND "gse"[Filter]']


Processing GEO IDs in batches: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:33<00:00, 16.80s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:33<00:00, 16.94s/it]
Determining dataset relevance: 100%|██████████| 2/2 [00:32<00:00, 16.36s/it]


Unnamed: 0,ID,Title,Summary,Species,Accession,Date,RelevanceScore,IndividualScore1,IndividualScore2,IndividualScore3,Justification1,Justification2,Justification3
0,200270741,Ceramide metabolism alterations contribute to ...,Introduction: Advanced cutaneous melanoma is a...,Homo sapiens,GSE270741,2024/07/24,1.3,2,1,1,Attempt 1: Score 2 - This dataset focuses on m...,Attempt 2: Score 1 - This dataset focuses on m...,Attempt 3: Score 1 - This dataset focuses on m...
1,200266291,Single cell RNAseq profiling of Human Pancreat...,Mechanisms driving sex differences across isle...,Homo sapiens,GSE266291,2024/07/25,0.3,1,0,0,Attempt 1: Score 1 - This dataset examines pan...,Attempt 2: Score 0 - This dataset investigates...,Attempt 3: Score 0 - This dataset investigates...
2,200271671,Exosomal miR-194 from adipose-derived stem cel...,"Hypertrophic scars, which result from aberrant...",Oryctolagus cuniculus,GSE271671,2024/07/24,0.3,1,0,0,Attempt 1: Score 1 - The study investigates hy...,Attempt 2: Score 0 - The study focuses on hype...,Attempt 3: Score 0 - This dataset focuses on h...
3,200233096,STAT3 inhibition permits epigenetic reprogramm...,Cultured acute myeloid leukemia (AML) blasts c...,Mus musculus,GSE233096,2024/07/23,0.3,1,0,0,Attempt 1: Score 1 - This dataset focuses on a...,Attempt 2: Score 0 - This dataset examines acu...,Attempt 3: Score 0 - This dataset studies acut...
4,200261624,Gene expression profile of all CD4 T cells fro...,The discovery of naïve T cell transcriptional ...,Mus musculus,GSE261624,2024/07/25,0.3,1,0,0,Attempt 1: Score 1 - This dataset studies CD4 ...,Attempt 2: Score 0 - This dataset focuses on C...,Attempt 3: Score 0 - This dataset examines CD4...
5,200233098,STAT3 inhibition permits epigenetic reprogramm...,Cultured acute myeloid leukemia (AML) blasts c...,Mus musculus,GSE233098,2024/07/23,0.3,1,0,0,Attempt 1: Score 1 - Similar to the previous d...,"Attempt 2: Score 0 - Similar to ID 200233096, ...","Attempt 3: Score 0 - Similar to ID 200233096, ..."
6,200271086,Personalized dendritic cell vaccine facilitate...,Adoptive cell transfer (ACT) using neoantigen-...,Mus musculus,GSE271086,2024/07/05,1.0,3,0,0,Attempt 1: Score 3 - This dataset discusses th...,Attempt 2: Score 0 - This dataset discusses de...,Attempt 3: Score 0 - This dataset discusses de...
7,200242951,Contribution of crosstalk of mesothelial and t...,Tumor metastasis commonly affects pleura in ad...,Homo sapiens,GSE242951,2024/06/26,8.0,8,8,8,Attempt 1: Score 8 - This dataset is highly re...,Attempt 2: Score 8 - This dataset directly inv...,Attempt 3: Score 8 - This dataset directly inv...
8,200270084,Genome-wide Methylation Patterns in Primary Uv...,Despite studies highlighting the prognostic ut...,Homo sapiens,GSE270084,2024/07/24,0.7,2,0,0,Attempt 1: Score 2 - This dataset relates to u...,Attempt 2: Score 0 - This dataset focuses on u...,Attempt 3: Score 0 - This dataset focuses on u...
9,200261186,Serial Changes of Circulating Tumor Cells in P...,Background and aims: Immune checkpoint inhibit...,Homo sapiens,GSE261186,2024/07/17,0.7,2,0,0,Attempt 1: Score 2 - This dataset involves hep...,Attempt 2: Score 0 - This dataset studies circ...,Attempt 3: Score 0 - This dataset studies circ...


In [76]:
result_df.to_csv("../results/2024_07_25_DetermineDatasetRelevance/DatasetRelevance_MultipleScores.csv")