In [7]:
# %% Initial setup
import pyodbc
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score, f1_score
import matplotlib.pyplot as plt 
from pathlib import Path

In [2]:
# %% Establish SQL server connection
# Placeholder: replace with the actual SQL Server host name before running.
SERVER = 'INSERT SERVER NAME'

# Open a trusted (Windows-authenticated) connection in autocommit mode.
# Extra keyword arguments such as Trusted_Connection are appended by pyodbc
# to the connection string.
conn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + SERVER,
    autocommit=True,
    Trusted_Connection='Yes',
)

# Cursor on the shared connection, kept available for ad-hoc statements.
cursor = conn.cursor()

In [5]:
# Purpose:
#   build the date-joined dataframe (diagnosis codes + admission dates) for one immunosuppressive condition
# Params:
#   sql_filename: string, name of a file under ./edw_queries/ containing a SQL query that pulls IR ID, diagnosis code, and code date for the condition
# Produces:
#   joined_df: a dataframe containing IR ID, diagnosis code, code date, study ID, and admission date for the condition,
#              with 'start_date_key' and 'admission_datetime' converted to datetime
def generate_df(sql_filename):
    # Load the condition-specific SQL text from the queries folder
    sql_path = './edw_queries/' + sql_filename
    with open(sql_path, 'r') as sql_file:
        dx_query = sql_file.read()

    # Query for admission dates: IR ID, study ID, admission date
    admissions_query = ''' 
        select distinct patient_ir_id, pt_study_id, admission_datetime
        from FSM_SCRIPT.fsm_script_dm.basic_endpoints'''

    # Diagnosis rows (IR ID, diagnosis code, code date) first, then admissions
    dx_codes = pd.read_sql(dx_query, conn)
    admissions = pd.read_sql(admissions_query, conn)

    # Left join keeps every diagnosis row pulled for this immunosuppressive
    # condition; rows without a matching admission get missing values
    merged = dx_codes.merge(
        admissions,
        how="left",
        left_on='ir_id',
        right_on='patient_ir_id',
    )

    # Return the joined table with both date columns as datetime
    return merged.assign(
        start_date_key=pd.to_datetime(merged['start_date_key']),
        admission_datetime=pd.to_datetime(merged['admission_datetime']),
    )

# Purpose:
#   divide two confusion-matrix counts, yielding NaN instead of a divide-by-zero warning
# Params:
#   numerator: number, count in the numerator
#   denominator: number, count in the denominator
# Produces:
#   the ratio numerator / denominator, or NaN when denominator is zero
def _safe_ratio(numerator, denominator):
    return numerator / denominator if denominator else float('nan')

# Purpose:
#   load the REDCap gold-standard immunocompromise labels for SCRIPT 1.0 and SCRIPT 2 patients
# Params: none (uses the module-level connection `conn`)
# Produces:
#   labels_df: dataframe, has pt_study_id plus one column per immunosuppressive condition
#              (1 when the condition is recorded, missing otherwise)
def _load_gold_labels():
    # Query for SCRIPT 1.0 patients
    redcap='''
            SET NOCOUNT ON
            drop table if exists #imc
            select * 
            into #imc 
            from (
            select distinct pt_study_id,type_immunocomp
            from FSM_SCRIPT.fsm_script_redcap_dm.redcap_PROJECTID_demographics
            ) x

            select pt_study_id,
                case when type_immunocomp like '%Acute leukemia%' then 1 end as Leukemia,
                case when type_immunocomp like '%Azathioprine%' then 1 end as Azathioprine,
                case when type_immunocomp like '%Chronic corticosteroids%' then 1 end as Chronic_corticosteroids,
                case when type_immunocomp like '%Cyclosporine%' then 1 end as Cyclosporine,
                case when type_immunocomp like '%Cytoxan%' then 1 end as Cytoxan,
                case when type_immunocomp like '%HIV%' then 1 end as HIV,
                case when type_immunocomp like '%Immunoglobulin deficiency%' then 1 end as Immunoglobulin_deficiency,
                case when type_immunocomp like '%Lymphoma%' then 1 end as Lymphoma,
                case when type_immunocomp like '%Mycophenolate (MMF)%' then 1 end as Mycophenolate,
                case when type_immunocomp like '%Multiple myeloma%' then 1 end as Myeloma,
                case when type_immunocomp like '%Myelosuppressive chemotherapy%' then 1 end as Myelosuppressive_chemo,
                case when type_immunocomp like '%Rituximab%' then 1 end as Rituximab,
                case when type_immunocomp like '%Solid organ transplant%' then 1 end as SOT,
                case when type_immunocomp like '%Stem cell transplant%' then 1 end as Stem_cell_transplant,
                case when type_immunocomp like '%Tacrolimus%' then 1 end as Tacrolimus
            from #imc
            '''

    # Same query as above but for SCRIPT 2 patients
    redcap2='''
            SET NOCOUNT ON
            drop table if exists #imc2
            select * 
            into #imc2
            from (
            select distinct record_id,emr_ic_type
            from FSM_SCRIPT.fsm_script_redcap_dm.redcap_PROJECTID_emr_info
            ) x

            select record_id as pt_study_id,
                case when emr_ic_type like '%Acute leukemia%' then 1 end as Leukemia,
                case when emr_ic_type like '%Azathioprine%' then 1 end as Azathioprine,
                case when emr_ic_type like '%Chronic corticosteroids%' then 1 end as Chronic_corticosteroids,
                case when emr_ic_type like '%Cyclosporine%' then 1 end as Cyclosporine,
                case when emr_ic_type like '%Cytoxan%' then 1 end as Cytoxan,
                case when emr_ic_type like '%HIV%' then 1 end as HIV,
                case when emr_ic_type like '%Immunoglobulin deficiency%' then 1 end as Immunoglobulin_deficiency,
                case when emr_ic_type like '%Lymphoma%' then 1 end as Lymphoma,
                case when emr_ic_type like '%Mycophenolate (MMF)%' then 1 end as Mycophenolate,
                case when emr_ic_type like '%Multiple myeloma%' then 1 end as Myeloma,
                case when emr_ic_type like '%Myelosuppressive chemotherapy%' then 1 end as Myelosuppressive_chemo,
                case when emr_ic_type like '%Rituximab%' then 1 end as Rituximab,
                case when emr_ic_type like '%Solid organ transplant%' then 1 end as SOT,
                case when emr_ic_type like '%Stem cell transplant%' then 1 end as Stem_cell_transplant,
                case when emr_ic_type like '%Tacrolimus%' then 1 end as Tacrolimus
            from #imc2
            '''

    # This dataframe will have study ID and then every immunosuppressive condition listed above and named as above
    labels_df_1 = pd.read_sql(redcap, conn)
    labels_df_1['pt_study_id'] = labels_df_1['pt_study_id'].astype(int)

    # Same dataframe as above but for SCRIPT 2 patients
    labels_df_2 = pd.read_sql(redcap2, conn)
    labels_df_2['pt_study_id'] = labels_df_2['pt_study_id'].astype(int)

    # Put together the labels for SCRIPT 1 and SCRIPT 2
    return pd.concat([labels_df_1, labels_df_2], ignore_index=True)

# Purpose:
#   generate a confusion matrix for ICD code-based identification at a certain threshold of a given immunosuppressive condition and return operating characteristics
# Params:
#   joined_df: dataframe, as returned by generate_df() for a given immunosuppressive condition
#   immune_condition: string, how the immunosuppressive condition is named in the prediction column
#   code_frequency: integer, threshold for number of ICD code occurrences qualifying as a case
#   gold_label: string, how the immunosuppressive condition is named in the gold standard column
#   only_include_llm_patients: boolean, accepted so existing callers work; NOT applied by this function (a warning is issued when True)
#   is_corrections_applied: boolean, accepted so existing callers work; NOT applied by this function (a warning is issued when True)
# Produces:
#   final_df: dataframe, has study ID, all immune condition predictions, all immune condition gold labels
#   metrics_df: dataframe, has all calculated metrics
def generate_cm(joined_df, immune_condition, code_frequency, gold_label,
                only_include_llm_patients = False, is_corrections_applied = False):
    import warnings  # local import: keeps this block independent of the import cell

    # The callers in this notebook pass these flags, but neither the restricted
    # patient list nor the chart-review corrections are available to this
    # function. Accept the flags (instead of raising TypeError) and say loudly
    # that they have no effect, so results are never silently mislabelled.
    if only_include_llm_patients or is_corrections_applied:
        warnings.warn(
            "generate_cm(): only_include_llm_patients / is_corrections_applied were "
            "requested but are not applied by this function; metrics are computed on "
            "all labelled patients with the gold labels as stored.",
            UserWarning,
            stacklevel=2,
        )

    # Filter joined_df for diagnosis codes before admission only
    filtered_df = joined_df[joined_df['start_date_key'] < joined_df['admission_datetime']]

    # Group and isolate those with the codes of interest on at least 'code_frequency' dates
    grouped = filtered_df.groupby('pt_study_id')['start_date_key'].nunique()
    unique_patients = grouped[grouped >= code_frequency].index.tolist()

    # Represent 'unique_patients' as a dataframe
    prediction_df = pd.DataFrame({
        'study_id': unique_patients,
        immune_condition: [1] * len(unique_patients) # immune_condition is how the immunosuppressive condition is named for the prediction column
    })
    prediction_df['study_id'] = prediction_df['study_id'].astype('int64')

    # Load in the IC label data (SCRIPT 1.0 + SCRIPT 2)
    labels_df = _load_gold_labels()

    # Right join prediction_df to labels_df to put predictions and labels together;
    # patients without a prediction or without a given label become 0
    final_df = pd.merge(left = prediction_df, right = labels_df, left_on = 'study_id', right_on = 'pt_study_id', how = 'right')
    final_df = final_df.fillna(0)

    # Make sure the types match for confusion matrix input
    final_df[gold_label] = final_df[gold_label].astype(int)
    final_df[immune_condition] = final_df[immune_condition].astype(int)

    y_true = final_df[gold_label]
    y_pred = final_df[immune_condition]

    # Pin the label order so the matrix is always 2x2, even when only one class
    # is present in both columns (otherwise the 4-way unpack below fails)
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Calculate metrics
    tn, fp, fn, tp = cm.ravel()
    sensitivity = recall_score(y_true, y_pred)
    specificity = _safe_ratio(tn, tn + fp)
    ppv = precision_score(y_true, y_pred)
    npv = _safe_ratio(tn, tn + fn)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Draw confusion matrix (uncomment as needed; requires `import seaborn as sns`,
    # which is not part of the import cell shown at the top of this notebook)
    # cm1 = cm[::-1, ::-1]

    # plt.figure(figsize=(6, 6))
    # sns.heatmap(cm1, annot=True, fmt='d', cmap='Purples',
    #           xticklabels=['ICD 1', 'ICD 0'],
    #           yticklabels=['REDCap 1', 'REDCap 0'])
    # plt.title(f'Confusion Matrix - {immune_condition}')
    # plt.ylabel('Actual')
    # plt.xlabel('Predicted')
    # plt.savefig(f'confusion_matrix_{immune_condition}_{code_frequency}.png')
    # plt.close()

    # Create DataFrame with metrics
    metrics_df = pd.DataFrame({
        'Comparison': [immune_condition],
        'True Negative': [tn],
        'False Positive': [fp],
        'False Negative': [fn],
        'True Positive': [tp],
        'Sensitivity': [sensitivity],
        'Specificity': [specificity],
        'PPV': [ppv],
        'NPV': [npv],
        'Accuracy': [accuracy],
        'F1 Score': [f1]
    })

    return final_df, metrics_df

# Purpose: shares all params with generate_cm() except it does matrices for 0 < x <= code_frequency instead of just x = code_frequency (for a given immunosuppressive condition still)
# Params:
#   joined_df: dataframe, as returned by generate_df() for a given immunosuppressive condition
#   immune_condition: string, how the immunosuppressive condition is named in the prediction column
#   code_frequency: integer >= 1, highest threshold (number of ICD code occurrences qualifying as a case) to evaluate
#   gold_label: string, how the immunosuppressive condition is named in the gold standard column
#   only_include_llm_patients: boolean, True performs analyses on only the 558 patients with notes available, False performs on everyone
#   is_corrections_applied: boolean, True applies manual chart review corrections to gold standard labels, False leaves it as it is
# Produces:
#   combined_metrics_df: dataframe, has all calculated metrics for 0 < x <= code_frequency, with a 'Threshold' column identifying each row
# Raises:
#   ValueError: when code_frequency is below 1 (there would be nothing to evaluate)
def run_multiple_cm(joined_df, immune_condition, code_frequency, gold_label, only_include_llm_patients = False, is_corrections_applied = False):
    if code_frequency < 1:
        raise ValueError(f'code_frequency must be >= 1, got {code_frequency}')

    # Forward the optional flags by keyword, and only when they are switched on.
    # The default run then needs nothing beyond generate_cm's four core
    # parameters, and a requested flag that generate_cm does not support fails
    # with an error naming that flag instead of a positional-count mismatch.
    optional_flags = {}
    if only_include_llm_patients:
        optional_flags['only_include_llm_patients'] = True
    if is_corrections_applied:
        optional_flags['is_corrections_applied'] = True

    results = []
    # Run generate_cm for 0 < x <= code_frequency
    for threshold in range(1, code_frequency + 1):
        _, metrics_df = generate_cm(joined_df, immune_condition, threshold, gold_label, **optional_flags)
        # assign() tags the row without mutating the frame generate_cm returned
        results.append(metrics_df.assign(Threshold=threshold))

    combined_metrics_df = pd.concat(results, ignore_index=True)
    return combined_metrics_df

# Purpose: draw sensitivity and specificity against the ICD code threshold for one immunosuppressive condition
# Params:
#   results_df: dataframe, of format combined_metrics_df from run_multiple_cm()
#               (reads the 'Comparison', 'Threshold', 'Sensitivity' and 'Specificity' columns)
# Produces: none (shows the figure)
def plot_sensitivity_specificity(results_df):

    # The first row's 'Comparison' value names the condition in the title
    condition_name = results_df['Comparison'].iloc[0]

    plt.figure(figsize=(10, 6))

    # One line per metric; the column name doubles as the legend label
    for metric in ('Sensitivity', 'Specificity'):
        plt.plot(results_df['Threshold'], results_df[metric], marker='o', label=metric)

    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title(f'Sensitivity and Specificity for {condition_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# %% Run all metrics and plot results with only_include_llm_patients = True and is_corrections_applied = True
# NOTE(review): this header previously stated only_include_llm_patients = False, but the
# call below passes True — confirm which cohort is intended before using the CSVs.
# Each condition name doubles as the SQL file stem, the gold-label column name, and
# (with a '_pred' suffix) the prediction column name.
conditions = ['HIV', 'SOT', 'Stem_cell_transplant', 'Leukemia', 'Lymphoma', 'Myeloma', 'Immunoglobulin_deficiency']

for condition in conditions:
    # Despite its name, prediction_df holds the date-joined dataframe returned by generate_df()
    prediction_df = generate_df(f'{condition}.sql')
    # Evaluate thresholds 1 through 10 for this condition
    results = run_multiple_cm(prediction_df, f'{condition}_pred', 10, condition, only_include_llm_patients=True, is_corrections_applied=True)
    plot_sensitivity_specificity(results)

    # Export each results dataframe to a CSV for use in figure generation
    results.to_csv(f'./{condition}_results.csv', index=False)

In [None]:
# Compile corrected metrics for SOT n=1 through n=4

# The date-joined SOT dataframe does not depend on the threshold, so query it once
# rather than on every loop iteration
predictions_df = generate_df('SOT.sql')

# Initialize an empty list to store individual metric DataFrames
metrics_list = []

for i in range(1, 5):
    # generate_cm returns (final_df, metrics_df); only the metrics are compiled here.
    # A named throwaway is used instead of `_`, which IPython reserves for the last output.
    _labels_df, metrics_df = generate_cm(predictions_df, 'SOT_pred', i, 'SOT', is_corrections_applied=True)
    # Append the metrics DataFrame to the list
    metrics_list.append(metrics_df)

# Concatenate all individual metric DataFrames into a single DataFrame (one row per threshold, in order 1..4)
final_metrics_df = pd.concat(metrics_list, ignore_index=True)

# Save metrics as a CSV in this directory for subsequent figure generation
final_metrics_df.to_csv(Path(r'./metrics.csv'), index = False)