In [1]:
from utils import *



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Assuming your data is stored in a csv file named 'data.csv'
df = pd.read_csv(r'/data/chenxi/llm-feature-engeneering/dataset/circor.csv')
df=df.drop(columns=['Patient ID','Recording locations:','Additional ID'])
df_clean = df.copy()
df['Murmur locations'] = df['Murmur locations'].str.split('+')
locations = ['PV', 'TV', 'AV', 'MV']
for location in locations:
    df[location] = df['Murmur locations'].apply(lambda x: 1 if x is not np.nan and location in x else 0)
df.drop('Murmur locations', axis=1, inplace=True)

In [3]:
# 1. Map the Age feature
age_mapping = {'Neonate': 1, 'Infant': 2, 'Child': 3, 'Adolescent': 4, 'Young adult': 5}
df_clean['Age'] = df_clean['Age'].map(age_mapping)
df_clean['Age'].fillna(-1, inplace=True)

# 2. Map the Sex feature
le = LabelEncoder()
df_clean['Sex'] = le.fit_transform(df_clean['Sex'])

# 3. Map the Pregnancy status feature
df_clean['Pregnancy status'] = df_clean['Pregnancy status'].map({False: 0, True: 1})

# 4. Handle missing values in Height and Weight
df_clean['Height'].fillna((df_clean['Height'].mean()), inplace=True)
df_clean['Weight'].fillna((df_clean['Weight'].mean()), inplace=True)

# 5. Map the Murmur feature
df_clean['Murmur'] = df_clean['Murmur'].map({'Present': 1, 'Absent': 0, 'Unknown': 2})

# 6. Handle the 'Murmur locations' feature
df_clean['Murmur locations'] = df_clean['Murmur locations'].str.split('+')
locations = ['PV', 'TV', 'AV', 'MV']
for location in locations:
    df_clean[location] = df_clean['Murmur locations'].apply(lambda x: 1 if x is not np.nan and location in x else 0)
df_clean.drop('Murmur locations', axis=1, inplace=True)

# 7. Map the 'Most audible location' feature
df_clean['Most audible location'] = df_clean['Most audible location'].map({np.nan: 0, 'PV': 1, 'TV': 2, 'AV': 3, 'MV': 4})

# 8. Map the Outcome feature
df_clean['Outcome'] = df_clean['Outcome'].map({'Normal': 0, 'Abnormal': 1})

# 9. Map the Campaign feature
df_clean['Campaign'] = df_clean['Campaign'].map({'CC2014': 0, 'CC2015': 1})

# 10. Map other string features
string_features = ['Systolic murmur timing', 'Systolic murmur shape', 'Systolic murmur grading', 'Systolic murmur pitch', 'Systolic murmur quality', 
                   'Diastolic murmur timing', 'Diastolic murmur shape', 'Diastolic murmur grading', 'Diastolic murmur pitch', 'Diastolic murmur quality']
for feature in string_features:
    df_clean[feature] = df_clean[feature].astype('category')
    df_clean[feature] = df_clean[feature].cat.codes
    df_clean[feature].fillna(-1, inplace=True)

In [4]:
def decoder_for_gpt3(input, max_length):
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-3.5-turbo',
                # model='gpt-4',
                messages=[
                    {"role": "system", "content": "Given a hypothetical patient profile, explore and articulate possible hypotheses, correlations, or insights that may arise based on common medical knowledge and assumptions. Consider the following attributes in your analysis: Age, Sex, Pregnancy status, Height, Weight, Presence of murmur, Most audible location of the murmur, Systolic and Diastolic murmur characteristics, Auscultation locations, and Campaign data. Note: The data is theoretical and should be treated as a fictional case study. Please provide your response in a consistent paragraph format."},
                    {"role": "user", "content": input}
                ],
                max_tokens=max_length,
                temperature=1,
            )
            # return response["choices"][0]['message']['content']
            content = response["choices"][0]['message']['content']
            return content
                
        except openai.error.RateLimitError as e:
            retry_time = e.retry_after if hasattr(e, 'retry_after') else 30
            print(f"Rate limit exceeded. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)
            
        except openai.error.ServiceUnavailableError as e:
            retry_time = 10  # Adjust the retry time as needed
            print(f"Service is unavailable. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)
            
        except openai.error.APIError as e:
            retry_time = e.retry_after if hasattr(e, 'retry_after') else 30
            print(f"API error occurred. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)

        except OSError as e:
            retry_time = 5  # Adjust the retry time as needed
            print(f"Connection error occurred: {e}. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)
        except TimeoutError as e:
            retry_time = 60  # Adjust the retry time as needed
            print(f"Timeout error occurred: {e}. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)
        except BaseException as e:
            retry_time = 60  # Adjust the retry time as needed
            print(f"Timeout error occurred: {e}. Retrying in {retry_time} seconds...")
            time.sleep(retry_time)

        except openai.error.OpenAIError as e:
            raise e

In [5]:
df_clean.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['Age', 'Sex', 'Height', 'Weight', 'Pregnancy status', 'Murmur',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Outcome', 'Campaign', 'PV', 'TV', 'AV',
       'MV'],
      dtype='object')>

In [6]:
df_clean.head(5)

Unnamed: 0,Age,Sex,Height,Weight,Pregnancy status,Murmur,Most audible location,Systolic murmur timing,Systolic murmur shape,Systolic murmur grading,...,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,PV,TV,AV,MV
0,3.0,0,98.0,15.9,0,0,0,-1,-1,-1,...,-1,-1,-1,-1,1,1,0,0,0,0
1,3.0,0,103.0,13.1,0,1,2,1,2,2,...,-1,-1,-1,-1,1,1,1,1,1,1
2,3.0,1,115.0,19.1,0,2,0,-1,-1,-1,...,-1,-1,-1,-1,1,1,0,0,0,0
3,3.0,1,98.0,15.9,0,1,2,1,3,0,...,-1,-1,-1,-1,1,1,0,1,0,0
4,3.0,1,87.0,11.2,0,1,1,0,3,1,...,-1,-1,-1,-1,1,1,1,1,1,1


In [7]:
prompt_template_circor=("""
I'm crafting a contextual narrative to analyze a specific data point from a dataset.

Given a dataset related to:
The CirCor DigiScope Phonocardiogram Dataset, which encompasses 5272 heart sound recordings from 1568 subjects aged between 0 and 21 years. The dataset, notable for being the largest publicly available pediatric heart sound dataset, is utilized in the George B. Moody PhysioNet Challenge 2022 on Heart Murmur Detection from Phonocardiogram Recordings, focusing on text-based data rather than audible files.

With the record:
A specific data point record represent a subject of Age category '{Age}', Sex '{Sex}', Height '{Height}' cm, Weight '{Weight}' kg, Pregnancy status '{Pregnancy status}', Murmur '{Murmur}', Most audible location '{Most audible location}', Systolic murmur timing '{Systolic murmur timing}', Systolic murmur shape '{Systolic murmur shape}', Systolic murmur grading '{Systolic murmur grading}', Systolic murmur pitch '{Systolic murmur pitch}', Systolic murmur quality '{Systolic murmur quality}', Diastolic murmur timing '{Diastolic murmur timing}', Diastolic murmur shape '{Diastolic murmur shape}', Diastolic murmur grading '{Diastolic murmur grading}', Diastolic murmur pitch '{Diastolic murmur pitch}', Diastolic murmur quality '{Diastolic murmur quality}', Outcome '{Outcome}', Campaign '{Campaign}', and various binary indicators for murmur locations (e.g., PV '{PV}', TV '{TV}', AV '{AV}', MV '{MV}').

and schema:
The schema includes variables such as Age (encoded categorical: [Neonate, Infant, Child, Adolescent, Young adult]), Sex (binary: [Female, Male]), Height (continuous: > 0), Weight (continuous: > 0), Pregnancy status (binary: [True, False]), Murmur (encoded categorical: [Present, Absent, Unknown]), PV, TV, AV, MV (binary: [True, False]), Most audible location (encoded categorical: [PV, TV, AV, MV, Phc]), various encoded murmur characteristics (categorical), Outcome (binary: [Normal, Abnormal]), and Campaign (binary: [CC2014, CC2015]).

I will create a narrative focusing on the outcome variable {Outcome}, adhering to the following constraints:
1. The narrative will be concise, not exceeding 350 words.
2. It will be highly relevant to {Outcome}, providing insights or context about how the variables in the specific record might relate to or impact {Outcome}.

Here's my analysis:
""")

In [8]:
openai.api_key='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'

In [9]:
prompts = df_clean.apply(lambda row: prompt_template_circor.format(**row), axis=1)
df_clean['analysis']= prompts.apply(lambda x: decoder_for_gpt3(x, max_length = 1000))

df_clean['analysis'].to_csv('/data/chenxi/llm-feature-engeneering/response/circor_response.csv', index=False)

Timeout error occurred: . Retrying in 60 seconds...
