In [1]:
# Modules

from openai import OpenAI
import os
import json
from tqdm import tqdm
import time
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import instructor

In [5]:
# Load variables from the .env file one directory above the working directory
# into the process environment.
load_dotenv('../.env')

# The key is read from the environment; it is None when the variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [6]:
# Smoke test for the OpenAI API: send one fixed question and print the reply.

# Passing the key explicitly; per the original note this is also the default
# and could be omitted.
client = OpenAI(api_key=openai_api_key)

chat_completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "What are the colours of the rainbow? Only respond with the names of the colours.",
        },
    ],
)

# Only the first choice's message text is used.
result = chat_completion.choices[0].message.content
print(result)

Red, Orange, Yellow, Green, Blue, Indigo, Violet.


In [None]:
# Testing correction of metadata

In [7]:
# Test reading of CSVs

def read_csv(file_path):
    """Load the CSV at ``file_path`` into a DataFrame with pandas' default options."""
    frame = pd.read_csv(file_path)
    return frame

def create_prompt(df):
    """Build the generic analysis prompt with ``df`` embedded as CSV text.

    The frame is serialized with ``to_csv(index=False)`` and placed between a
    fixed header line and a fixed closing request; the resulting string is
    returned.
    """
    # Serialize the whole frame, leaving the index column out.
    table_text = df.to_csv(index=False)

    # Generic test-case prompt, assembled from adjacent literals.
    return (
        "Analyze the following CSV data:\n"
        "\n"
        f"{table_text}\n"
        "\n"
        "Please provide a summary of the data and any insights you can gather.\n"
    )

def get_openai_response(prompt, api_key):
    """Send ``prompt`` as a single user message to gpt-4o-mini and return the reply text.

    A new client is created from ``api_key`` on every call. Exceptions raised
    by the request, or while reading the first choice from the reply, are not
    propagated: they are returned as an ``"An error occurred: ..."`` string,
    so a failure and a genuine answer are both plain strings to the caller.
    """
    # Client construction sits outside the try block, so errors raised here
    # still propagate to the caller.
    client = OpenAI(api_key=api_key)
    user_turn = {"role": "user", "content": prompt}

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[user_turn],
        )
        reply = completion.choices[0].message.content
    except Exception as err:
        return f"An error occurred: {str(err)}"
    return reply

if __name__ == "__main__":
    # Sample sheet used for this test run.
    csv_file_path = "../InputData/SETBP1_Tests/SETBP1_RNAseq_samples.csv"

    # Pipeline: load the CSV, wrap it in the generic prompt, query the model.
    df = read_csv(csv_file_path)
    prompt = create_prompt(df)
    response = get_openai_response(prompt, openai_api_key)

    # Heading and reply on separate lines.
    print("OpenAI Analysis:", response, sep="\n")

OpenAI Analysis:
The provided CSV data is about a series of biological samples related to transfection experiments, focusing on different genotypes, cell types, and the impact of neural differentiation over two time points (0 days and 24 days). The samples appear to involve genetically modified cells derived from induced pluripotent stem cells (iPSCs) and neural progenitor cells (NPCs). 

### Summary of the Data

1. **Columns**:
   - **Sample Name**: Identifier for each sample, containing various prefixes indicating the genotype and time point.
   - **Transfection**: Represents different experimental groups designated by letters (A, B, C, D, E, F).
   - **Genotype**: Indicates the genetic background of the samples, including variants and wild types such as "VUS2 HDR/WT” and “WT/WT”.
   - **Cell type**: Specifies if the sample is derived from iPSCs or NPCs.
   - **Neural differentiation set**: Numeric value indicating a classification or additional grouping, varying from 1 to 2.

2. **S