In [1]:
# Modules

from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List, Dict

In [2]:
load_dotenv('../.env')

openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Test OpenAI API...

client = OpenAI(
  api_key=openai_api_key,  # this is also the default, it can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "What are the colours of the rainbow? Only respond with the names of the colours.",
        }
    ],
    model="gpt-4o-mini",
)

result = chat_completion.choices[0].message.content
print(result)

Red, Orange, Yellow, Green, Blue, Indigo, Violet.


In [None]:
# Testing correction of metadata

In [33]:
# Test matching of Kallisto samples to metadata

# I will begin by testing to see if 1) Kallisto samples can be matched to metadata based on the names of the samples in the metadata, and 2) if the two can be merged together in a single data frame

class ColumnIdentification(BaseModel):
    likely_sample_column: str = Field(..., description="The column name most likely to contain sample identifiers")
    confidence: float = Field(..., description="Confidence score for the column identification (0-1)")
    reasoning: str = Field(..., description="Explanation for why this column was chosen")

class SampleMatch(BaseModel):
    metadata_sample: str = Field(..., description="The sample name from the metadata")
    file_name: str = Field(..., description="The matched file name")
    confidence: float = Field(..., description="Confidence score of the match (0-1)")

class MatchResult(BaseModel):
    column_identification: ColumnIdentification = Field(..., description="Identification of the sample name column")
    matches: List[SampleMatch] = Field(..., description="List of matched samples and file names")
    matching_logic: str = Field(..., description="Explanation of the logic used to match samples to file names")

def read_csv(file_path):
    return pd.read_csv(file_path)

def create_prompt(metadata_df, file_names):
    prompt = f"""Analyze the following metadata and list of file names:

Metadata columns:
{metadata_df.columns.tolist()}

Metadata:
{metadata_df.to_string()}

File names:
{file_names}

Tasks:
1. Identify the column most likely to contain sample identifiers. Provide the column name, a confidence score, and your reasoning.
2. Match each sample from the identified column to the most likely corresponding file name. Consider variations in capitalization, spaces, dashes, and potential typos.
3. Explain the logic you used to match samples to file names.

Provide your analysis in a structured format.
"""
    return prompt

def get_openai_response(prompt, openai_api_key):
    client = instructor.patch(OpenAI(
    api_key=openai_api_key))
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_model=MatchResult
        )
        return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

def update_metadata(metadata_df, match_result):
    sample_column = match_result.column_identification.likely_sample_column
    file_name_dict = {match.metadata_sample: match.file_name for match in match_result.matches}
    metadata_df['file_name'] = metadata_df[sample_column].map(file_name_dict)
    return metadata_df

In [34]:
if __name__ == "__main__":
    api_key = openai_api_key

    # Path to your metadata CSV file
    metadata_file_path = "../InputData/SETBP1_Tests/SETBP1_RNAseq_samples.csv"
    
    # Read the metadata CSV file
    metadata_df = read_csv(metadata_file_path)
    
    # List of sample file names (you would typically read this from a directory or another source)
    with open("../InputData/SETBP1_Tests/abundance_files.txt", 'r') as file:
        file_names = file.read()
    
    # Create a prompt with the metadata and file names
    prompt = create_prompt(metadata_df, file_names)
    
    # Get the response from OpenAI
    response = get_openai_response(prompt, openai_api_key)
    # Print the identified sample name column and reasoning
    print(f"Likely sample name column: {response.column_identification.likely_sample_column}")
    print(f"Confidence: {response.column_identification.confidence}")
    print(f"Reasoning: {response.column_identification.reasoning}\n")
    
    # Print the matching logic
    print("Matching logic:")
    print(response.matching_logic)
    print()
    
    # Update the metadata DataFrame with matched file names
    updated_metadata = update_metadata(metadata_df, response)
    
    # Print the updated metadata
    print("Updated Metadata:")
    print(updated_metadata)
    
    # Optionally, save the updated metadata to a new CSV file
    updated_metadata.to_csv("../results/2024_07_31_AutomatedDataProcessing/Clean_MetadataSampleMatching.csv", index=False)

Likely sample name column: Sample Name
Confidence: 1.0
Reasoning: The 'Sample Name' column contains names that are structured similarly to the file names, with consistent patterns allowing for clear correspondence.

Matching logic:
The matching was performed by normalizing both sample identifiers and file names to a consistent format: converting underscores to dashes, stripping out spaces, and ensuring uniform casing. The resulting normalized names were then compared for exact matches.

Updated Metadata:
                          Sample Name Transfection      Genotype Cell type   \
0      KOLF2_SETBP1_VUS2_A21_10_day 0            A   VUS2 HDR/WT       iPSC   
1       KOLF2_SETBP1_VUS2_A21_7_day 0            A   VUS2 HDR/WT       iPSC   
2     KOLF2_SETBP1_VUS2_A21_10_day 24            A   VUS2 HDR/WT        NPC   
3      KOLF2_SETBP1_VUS2_A21_7_day 24            A   VUS2 HDR/WT        NPC   
4      KOLF2_SETBP1_VUS2_B1_4.1_day 0            B   VUS2 HDR/WT       iPSC   
5      KOLF2_SET

In [103]:
# Repeat above but with a more error-filled CSV. This will also be where I refine my prompt.

class ColumnIdentification(BaseModel):
    likely_sample_column: str = Field(..., description="The column name most likely to contain sample identifiers")
    confidence: float = Field(..., description="Confidence score for the column identification (0-1)")
    reasoning: str = Field(..., description="Explanation for why this column was chosen")

class SampleMatch(BaseModel):
    metadata_sample: str = Field(..., description="The sample name from the metadata")
    file_name: str = Field(..., description="The matched file name")
    confidence: float = Field(..., description="Confidence score of the match (0-1)")

class MatchResult(BaseModel):
    column_identification: ColumnIdentification = Field(..., description="Identification of the sample name column")
    matches: List[SampleMatch] = Field(..., description="List of matched samples and file names")
    matching_logic: str = Field(..., description="Explanation of the logic used to match samples to file names")

def read_csv(file_path):
    return pd.read_csv(file_path)

def get_openai_response(prompt, openai_api_key):
    client = instructor.patch(OpenAI(
    api_key=openai_api_key))
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_model=MatchResult
        )
        return response
    except Exception as e:
        return f"An error occurred: {str(e)}"

def update_metadata(metadata_df, match_result):
    sample_column = match_result.column_identification.likely_sample_column
    file_name_dict = {match.metadata_sample: match.file_name for match in match_result.matches}
    file_match_score = {match.metadata_sample: match.confidence for match in match_result.matches}
    metadata_df['file_name'] = metadata_df[sample_column].map(file_name_dict)
    metadata_df['match_confidence'] = metadata_df[sample_column].map(file_match_score)
    return metadata_df

def process_metadata_and_files(metadata_file_path, file_names_path, openai_api_key):
    # Read the metadata CSV file
    metadata_df = read_csv(metadata_file_path)
    
    # Read the list of sample file names. For the moment I just manually specified the input text files, but this will need to change later.
    with open(file_names_path, 'r') as file:
        file_names = file.read().splitlines()
    
    # Create a prompt with the metadata and file names
    prompt = create_prompt(metadata_df, file_names)
    
    # Get the response from OpenAI
    response = get_openai_response(prompt, openai_api_key)
    
    if isinstance(response, str):
        print(response)
        return
    
    # Print the identified sample name column and reasoning
    print(f"Likely sample name column: {response.column_identification.likely_sample_column}")
    print(f"Confidence: {response.column_identification.confidence}")
    print(f"Reasoning: {response.column_identification.reasoning}\n")
    
    # Print the matching logic
    print("Matching logic:")
    print(response.matching_logic)
    print()
    
    # Update the metadata DataFrame with matched file names
    updated_metadata = update_metadata(metadata_df, response)
    
    # Print the updated metadata
    return(updated_metadata)

In [104]:
if __name__ == "__main__":
    api_key = openai_api_key

    # Path to your metadata CSV file
    metadata_file_path = "../InputData/SETBP1_Tests/SETBP1_RNAseq_samples_errors.csv"
    
    # Path to your file names list
    file_names_path = "../InputData/SETBP1_Tests/abundance_files.txt"
    
    # Process the metadata and files
    updated_metadata = process_metadata_and_files(metadata_file_path, file_names_path, api_key)

updated_metadata.to_csv("../results/2024_07_31_AutomatedDataProcessing/Errors_MetadataSampleMatching.csv", index=False)

Likely sample name column: Sample Name
Confidence: 0.95
Reasoning: The metadata column 'Sample Name' contains identifiers for the samples, which typically aligns with the format of file names that include sample identifiers and respective conditions.

Matching logic:
I matched the file names to the metadata samples by normalizing both data sets. I transformed both the sample names in the metadata and the file names to a consistent format by removing punctuations, standardizing case, and replacing spaces with hyphens. Each file name was then iteratively checked for substring matches against the cleaned sample names using a confidence scoring system based on the degree of exact match and logical adjustments for minor discrepancies.



In [100]:
metadata_df = read_csv("../InputData/SETBP1_Tests/SETBP1_RNAseq_samples_errors.csv")
    
    # Read the list of sample file names. For the moment I just manually specified the input text files, but this will need to change later.
with open(file_names_path, 'r') as file:
    file_names = file.read().splitlines()

class ColumnIdentification(BaseModel):
    likely_sample_column: str = Field(..., description="The column name most likely to contain sample identifiers")
    confidence: float = Field(..., description="Confidence score for the column identification (0-1)")
    reasoning: str = Field(..., description="Explanation for why this column was chosen")

class SampleMatch(BaseModel):
    metadata_sample: str = Field(..., description="The matched sample name as reported in the metadata")
    file_name: str = Field(..., description="The file name")
    confidence: float = Field(..., description="Confidence score of the match (0-1)")

class MatchResult(BaseModel):
    column_identification: ColumnIdentification = Field(..., description="Identification of the sample name column")
    matches: List[SampleMatch] = Field(..., description="List of matched samples and file names")

def create_prompt(metadata_df, file_names):
    prompt = f"""

## IDENTITY AND PURPOSE

You are an expert in bioinformatic analyses. You will be provided with a list of files, and a metadata data frame. You are tasked with matching the file names to the existing metadata data frame. 

Take a deep breath, and carefully follow the steps outlined below to achieve the intended task.

## STEPS

1. First, identify the column name that is most likely to correspond to sample names. Focus on words such as "sample name" or "ID" to make this judgement.
2. Match ALL file names to rows in the metadata. Keep the following in mind:
- Every file name should match to a row in the metadata
- There may be errors in the metadata
- You should make efforts to identify likely errors, and consider the possibility of a match if the error was corrected
- Examples of errors include random spaces, incorrect spellings, inconsistent cases, and random punctuations
- Note that you may need to systematically apply some rules to better identify matches
- Do not prescriptively follow a rule to find matches; be flexible instead
- If a file name does not have a match in the sample metadata, report this as "No Match"

## OUTPUT

Provided your analysis in a structured format:
1. Provide the column name most likely to correspond to sample names, with a confidence score and reasoning.
2. List of EACH file, and the most likely corresponding sample name. Report the sample name EXACTLY as it is in the metadata - do not attempt to correct the metadata value. Similarly, report the file name EXACTLY as given.

## INPUT
Metadata columns:
{metadata_df.columns.tolist()}

Metadata:
{metadata_df.to_string()}
"""
    return prompt
    
prompt = create_prompt(metadata_df, file_names)

def get_openai_response(prompt, openai_api_key):
    client = instructor.patch(OpenAI(
    api_key=openai_api_key))
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens = 10000,
            messages=[{"role": "user", "content": prompt}],
            response_model=MatchResult
        )
        return response
    except Exception as e:
        return f"An error occurred: {str(e)}"
response = get_openai_response(prompt, openai_api_key)
print(metadata_df.to_string)

<bound method DataFrame.to_string of                           Sample Name Transfection      Genotype Cell type   \
0      KOLF2_SETBP1_VUS2_A21_10_day 0            A   VUS2 HDR/WT       iPSC   
1      KOLF2_SETBP1_ VUS2_A21_7_day 0            A   VUS2 HDR/WT       iPSC   
2   KOLF2_SETBP1_VUS2_A21_10_day 24.             A          VUS@        NPC   
3      KOLF2_SETBP1_VUS2_A21_7_day 24            A   VUS2 HDR/WT        NPC   
4      KOLF2_SETBP1_VUS2_B1_4.1_day 0            B   VUS2 HDR/WT       iPSC   
5      KOLF2_SETBP1_VUS2_B1_4.2_day 0            B   VUS2 HDR/WT       iPSC   
6     KOLF2_SETBP1_VUS2_B1_4.1_day 24            B   vus2 HDR/WT        NPC   
7     KOLF2_SETBP1_VUS2_B1_4.2_day 24            B   VUS2 HDR/WT        NPC   
8    KOLF2_SETBP1_VUS2_C12.1_WT_day 0            C         WT/WT       iPSC   
9    KOLF2_SETBP1_VUS2_C12.2_WT_day 0            C         WT/WT       iPSC   
10   KOLF2_SETBP1_VUS2_C12-3_WT_day 0            C         WT/WT       iPSC   
11  KOLF2_SETBP

In [102]:
response.matches

[SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_A21_10_day 0', file_name='KOLF2_SETBP1_VUS2_A21_10_day 0', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_ VUS2_A21_7_day 0', file_name='KOLF2_SETBP1_ VUS2_A21_7_day 0', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_A21_10_day 24.', file_name='KOLF2_SETBP1_VUS2_A21_10_day 24.', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_A21_7_day 24', file_name='KOLF2_SETBP1_VUS2_A21_7_day 24', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_B1_4.1_day 0', file_name='KOLF2_SETBP1_VUS2_B1_4.1_day 0', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_B1_4.2_day 0', file_name='KOLF2_SETBP1_VUS2_B1_4.2_day 0', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_B1_4.1_day 24', file_name='KOLF2_SETBP1_VUS2_B1_4.1_day 24', confidence=1.0),
 SampleMatch(metadata_sample='KOLF2_SETBP1_VUS2_B1_4.2_day 24', file_name='KOLF2_SETBP1_VUS2_B1_4.2_day 24', confidence=1.0),


In [4]:
# Testing for identification of interesting comparisons 

meta = pd.read_csv("~/work/InputData/GSE178333_COVID/meta.csv")

prompt = f"""

## IDENTITY AND PURPOSE

You are an expert in bioinformatic analyses. You will be provided with a metadata sheet, and are tasked with identifying contrasts that could be interesting in the metadata, with the intention of analysing these in a edgeR/limma based pipeline.
Take a deep breath, and carefully follow the steps outlined below to achieve the intended task.

## STEPS

1. Carefully consider each column, inferring what each column means from its name, and also the values in the column. 
2. Determine columns that appear to contain data that would be scientifically and biologically interesting to compare within the column.
- Only include comparisons that can be easily analysed in a limma/edgeR based pipeline
- Only include comparisons that would be valuable to the literature generally, and not just within the dataset

## OUTPUT

1. State all column names EXACTLY, and also include a brief 1 sentence description of what each column contains
2. State 5 comparisons that would be interesting to analyse in a limma/edgeR-based pipeline. 
3. For each comparison, include the EXACT column name, as well as the EXACT values that should be used for the comparison. Additionally, justify why the comparison would be interesting using up to 3 sentences


## INPUT

Metadata:
{meta.to_string()}
"""
client = OpenAI(
  api_key=openai_api_key,  # this is also the default, it can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="gpt-4o-mini",
    max_tokens = 6000
)

result = chat_completion.choices[0].message.content
print(result)

## 1. Column Names and Descriptions

1. **title**: Contains the title or name of each sample, indicating the treatment and replicate.
2. **geo_accession**: Identifier for the sample in the Gene Expression Omnibus database.
3. **status**: Current status of the sample (e.g., public).
4. **submission_date**: Date when the sample was submitted to the database.
5. **last_update_date**: Date when the sample information was last updated.
6. **type**: Type of data, indicating how the sample is categorized (e.g., SRA).
7. **channel_count**: Number of channels used in the sequencing or imaging process.
8. **source_name_ch1**: Source of the biological material (e.g., Alveolar type 2 cells).
9. **organism_ch1**: Organism from which the sample is derived (e.g., Homo sapiens).
10. **characteristics_ch1**: Detailed characteristics of the sample, including cell type.
11. **characteristics_ch1.1**: Additional sample characteristics, including tissue origin.
12. **characteristics_ch1.2**: Further charac

In [5]:
meta

Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,instrument_model,library_selection,library_source,library_strategy,relation,relation.1,supplementary_file_1,cell type:ch1,tissue:ch1,treatment:ch1
0,Alveolar type 2 cells infected with Bavpat-1 rep1,GSM5387875,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 Bavpat-1
1,Alveolar type 2 cells infected with Bavpat-1 rep2,GSM5387876,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 Bavpat-1
2,Alveolar type 2 cells infected with Bavpat-1 rep3,GSM5387877,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 Bavpat-1
3,Alveolar type 2 cells infected with B.1.1.7 rep1,GSM5387878,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 B.1.1.7
4,Alveolar type 2 cells infected with B.1.1.7 rep2,GSM5387879,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 B.1.1.7
5,Alveolar type 2 cells infected with B.1.1.7 rep3,GSM5387880,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Infection with SARS-CoV-2 B.1.1.7
6,"Alveolar type 2 cells, uninfected rep1",GSM5387881,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Not infected
7,"Alveolar type 2 cells, uninfected rep2",GSM5387882,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Not infected
8,"Alveolar type 2 cells, uninfected rep3",GSM5387883,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Alveolar type 2 cells,Homo sapiens,cell type: AT2 cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,AT2 cells,Lung,Not infected
9,"Bronchiolar cells, uninfected rep1",GSM5387884,Public on Jun 13 2024,Jun 16 2021,Jun 13 2024,SRA,1,Bronchiolar epithelial cells,Homo sapiens,cell type: Bronchiolar cells,...,Illumina NextSeq 500,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,Bronchiolar cells,Lung,Not infected
