In [7]:
# Modules

from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor
from pydantic import BaseModel, Field
from typing import List, Dict

In [2]:
# Load the variables defined in the project-level .env file, then read the
# OpenAI key from the process environment (None when the variable is not set).
load_dotenv('../.env')

openai_api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Smoke test: send a single trivial prompt to confirm the key and client work.

# Passing api_key explicitly is optional; per the original note it is also the default.
client = OpenAI(api_key=openai_api_key)

chat_completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "What are the colours of the rainbow? Only respond with the names of the colours.",
        },
    ],
)

# Only the text of the first returned choice is of interest here.
result = chat_completion.choices[0].message.content
print(result)

Red, Orange, Yellow, Green, Blue, Indigo, Violet.


In [None]:
# Testing correction of metadata

In [33]:
# Test matching of Kallisto samples to metadata

# I will begin by testing whether 1) Kallisto samples can be matched to the metadata based on the sample names in the metadata, and 2) the two can be merged into a single data frame.

# Structured-output schema describing which metadata column the model believes
# holds the sample identifiers, together with its self-reported confidence and
# rationale. Nested inside MatchResult; update_metadata reads
# `likely_sample_column` to decide which column to map file names onto.
# NOTE(review): the 0-1 range for `confidence` is only stated in the field
# description; nothing in this class validates it.
class ColumnIdentification(BaseModel):
    likely_sample_column: str = Field(..., description="The column name most likely to contain sample identifiers")
    confidence: float = Field(..., description="Confidence score for the column identification (0-1)")
    reasoning: str = Field(..., description="Explanation for why this column was chosen")

# Structured-output schema for one pairing of a metadata sample name with a
# file name. MatchResult.matches is a list of these; update_metadata turns them
# into a {metadata_sample: file_name} lookup.
# Both identifiers are declared as `str`, so lookup keys built from this model
# are strings regardless of the dtype of the metadata column.
# NOTE(review): the 0-1 range for `confidence` is only stated in the field
# description; nothing in this class validates it.
class SampleMatch(BaseModel):
    metadata_sample: str = Field(..., description="The sample name from the metadata")
    file_name: str = Field(..., description="The matched file name")
    confidence: float = Field(..., description="Confidence score of the match (0-1)")

# Top-level structured-output schema: passed as `response_model` in
# get_openai_response, so a successful call is expected to yield one of these.
# Bundles the sample-column identification, the per-sample matches, and the
# model's free-text explanation of how it matched them.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- a pydantic model docstring presumably ends up in the generated JSON
# schema description sent to the model; confirm before converting to docstrings.
class MatchResult(BaseModel):
    column_identification: ColumnIdentification = Field(..., description="Identification of the sample name column")
    matches: List[SampleMatch] = Field(..., description="List of matched samples and file names")
    matching_logic: str = Field(..., description="Explanation of the logic used to match samples to file names")

def read_csv(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame.

    Thin wrapper around ``pd.read_csv`` called with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def create_prompt(metadata_df, file_names):
    """Build the text prompt asking the model to match samples to file names.

    Args:
        metadata_df: DataFrame of sample metadata; its column list and its full
            ``to_string()`` rendering are embedded in the prompt.
        file_names: The file names to match against; inserted as-is via string
            formatting.

    Returns:
        The prompt as a single newline-terminated string.
    """
    column_names = metadata_df.columns.tolist()
    metadata_table = metadata_df.to_string()

    # One list entry per output line; empty strings are the blank separator
    # lines, and the final empty entry yields the trailing newline.
    prompt_lines = [
        "Analyze the following metadata and list of file names:",
        "",
        "Metadata columns:",
        f"{column_names}",
        "",
        "Metadata:",
        f"{metadata_table}",
        "",
        "File names:",
        f"{file_names}",
        "",
        "Tasks:",
        "1. Identify the column most likely to contain sample identifiers. Provide the column name, a confidence score, and your reasoning.",
        "2. Match each sample from the identified column to the most likely corresponding file name. Consider variations in capitalization, spaces, dashes, and potential typos.",
        "3. Explain the logic you used to match samples to file names.",
        "",
        "Provide your analysis in a structured format.",
        "",
    ]
    return "\n".join(prompt_lines)

def get_openai_response(prompt, openai_api_key):
    """Send ``prompt`` to gpt-4o-mini and request a ``MatchResult`` back.

    Args:
        prompt: Text sent as the single user message.
        openai_api_key: API key used to construct the OpenAI client.

    Returns:
        Whatever the instructor-patched ``create`` call returns for
        ``response_model=MatchResult`` (presumably a ``MatchResult`` instance),
        or -- if the request raises -- the string
        ``"An error occurred: <message>"``. Callers therefore need to check for
        a ``str`` result before accessing ``MatchResult`` attributes.
    """
    # Client construction sits outside the try block, so errors raised while
    # building or patching the client propagate to the caller.
    patched_client = instructor.patch(OpenAI(api_key=openai_api_key))
    user_messages = [{"role": "user", "content": prompt}]
    try:
        structured_reply = patched_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=user_messages,
            response_model=MatchResult,
        )
    except Exception as err:
        return f"An error occurred: {str(err)}"
    else:
        return structured_reply

def update_metadata(metadata_df, match_result):
    """Attach matched file names to the metadata as a ``file_name`` column.

    Args:
        metadata_df: DataFrame of sample metadata. It is modified in place: the
            ``file_name`` column is added (or overwritten if already present).
        match_result: Object exposing
            ``column_identification.likely_sample_column`` and ``matches``
            (items with ``metadata_sample`` and ``file_name``), e.g. a
            ``MatchResult``.

    Returns:
        The same ``metadata_df`` object. Rows whose sample value has no entry in
        ``match_result.matches`` get NaN in ``file_name``.

    Raises:
        KeyError: If the identified sample column is not a column of
            ``metadata_df`` (for example, the model returned a column name that
            does not exist).
    """
    sample_column = match_result.column_identification.likely_sample_column
    if sample_column not in metadata_df.columns:
        # Same exception type the bare column lookup would raise, but with
        # enough context to see what the model returned.
        raise KeyError(
            f"Identified sample column {sample_column!r} is not in the metadata; "
            f"available columns: {metadata_df.columns.tolist()}"
        )

    # Matched sample names are strings, while the metadata column may hold
    # numeric IDs; compare on the string form so e.g. 1 matches "1" instead of
    # silently producing an all-NaN column.
    file_name_dict = {str(match.metadata_sample): match.file_name for match in match_result.matches}
    metadata_df['file_name'] = metadata_df[sample_column].astype(str).map(file_name_dict)
    return metadata_df

In [None]:
if __name__ == "__main__":
    # End-to-end run: read metadata and file names, ask the model to match
    # them, print the result, and save the metadata with a file_name column.
    api_key = openai_api_key

    # Path to your metadata CSV file
    metadata_file_path = "../InputData/SETBP1_Tests/SETBP1_RNAseq_samples.csv"

    # Where the updated metadata is written at the end of the run
    output_path = "../results/2024_07_31_AutomatedDataProcessing/Clean_MetadataSampleMatching.csv"

    # Read the metadata CSV file
    metadata_df = read_csv(metadata_file_path)

    # List of sample file names (you would typically read this from a directory or another source)
    with open("../InputData/SETBP1_Tests/abundance_files.txt", 'r') as file:
        file_names = file.read()

    # Create a prompt with the metadata and file names
    prompt = create_prompt(metadata_df, file_names)

    # Get the response from OpenAI
    response = get_openai_response(prompt, api_key)

    # get_openai_response reports a failed request by returning an error
    # string; stop here with that message rather than failing below with an
    # AttributeError on a str.
    if isinstance(response, str):
        raise RuntimeError(response)

    # Print the identified sample name column and reasoning
    print(f"Likely sample name column: {response.column_identification.likely_sample_column}")
    print(f"Confidence: {response.column_identification.confidence}")
    print(f"Reasoning: {response.column_identification.reasoning}\n")

    # Print the matching logic
    print("Matching logic:")
    print(response.matching_logic)
    print()

    # Update the metadata DataFrame with matched file names
    # (update_metadata also modifies metadata_df itself)
    updated_metadata = update_metadata(metadata_df, response)

    # Print the updated metadata
    print("Updated Metadata:")
    print(updated_metadata)

    # Save the updated metadata to a new CSV file, creating the results
    # directory first so the run does not fail after the API call has been made
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    updated_metadata.to_csv(output_path, index=False)