In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

data =  pd.read_csv('E:\\InternetDownload\\finalwithvte_balanced_13Apr_n10.csv')
data['gender_encoded'] = data['gender'].apply(lambda x: 0 if x == 'F' else 1)
data['encoded_icutypes'] = data['first_careunit'].apply(lambda x: 0 if x == 'Medical/Surgical Intensive Care Unit (MICU/SICU)' else 1)
#covariates in the regression

covariate_columns = ['real_age', 'gender_encoded', 'congestive_heart_failure', 'have_cancer',
                     'severe_liver_disease', 'have_sepsis', 'first_day_sofa', 'weight', 'have_diabetes', 
                     'avg_creat', 'have_at_least_1_ventil', 'encoded_icutypes']

treatment_columns = ['encoded_input']

In [3]:
data.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,label,count,...,major_bleeding,have_sepsis,encoded_input,have_diabetes,have_cancer,creat_clr,mortality,length_of_stay,gender_encoded,encoded_icutypes
0,10002155,28994087,31090461,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2130-09-24 00:50:00,2130-09-27 22:13:41,3.891447,Heparin Sodium (Prophylaxis),1.0,...,0,0,1,0,1,0,0 days 00:00:00,5 days 20:56:00,0,0
1,10003019,22774359,30676350,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2175-10-08 18:58:00,2175-10-09 11:59:16,0.709213,Heparin Sodium (Prophylaxis),1.0,...,0,1,1,0,1,1,0 days 00:00:00,9 days 03:34:00,1,0
2,10003400,20214994,32128372,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2137-02-25 23:37:19,2137-03-10 21:29:36,12.911308,Heparin Sodium (Prophylaxis),1.0,...,0,0,1,0,1,1,0 days 00:00:00,23 days 05:45:00,0,0
3,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,Heparin Sodium (Prophylaxis),1.0,...,0,0,1,0,0,1,0 days 00:00:00,23 days 02:32:00,1,0
4,10007795,28477357,31921355,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2136-04-22 18:01:13,2136-04-23 19:13:58,1.050521,Heparin Sodium (Prophylaxis),1.0,...,0,0,1,0,0,1,0 days 00:00:00,21 days 20:02:00,0,0


In [4]:
data.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'first_careunit', 'last_careunit',
       'intime', 'outtime', 'los', 'label', 'count', 'admittime', 'dischtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location_x', 'insurance', 'language', 'marital_status',
       'race', 'edregtime', 'edouttime', 'hospital_expire_flag', 'gender',
       'anchor_age', 'anchor_year', 'anchor_year_group', 'dod', 'real_age',
       'input', 'first_day_sofa', 'age_score', 'myocardial_infarct',
       'congestive_heart_failure', 'peripheral_vascular_disease',
       'cerebrovascular_disease', 'chronic_pulmonary_disease',
       'peptic_ulcer_disease', 'renal_disease', 'severe_liver_disease',
       'charlson_comorbidity_index', 'avg_pl', 'age', 'weight_admit', 'weight',
       'weight_min', 'weight_max', 'avgw', 'avg_creat', 'creat_score',
       'discharge_location_y', 'died', 'if_antithrombotic_agents',
       'if_vasopressin', 'have_at_least_1_ventil', 'have_vte',

In [5]:
def con_log_reg(outcome_variable, matching_variable):
    # Create a new column to hold the matching group for each case
    data[matching_variable] = 0

    # Loop over each positive case and find 10 negative cases to match with
    for idx, row in data[data[outcome_variable] == 1].iterrows():
        # Find all the negative cases that have the same value for the matching variables
        matching_cases = data[(data[outcome_variable] == 0) &
                              (data[matching_variable] == 0) &
                              (data['real_age'] == row['real_age']) &
                              #(data['gender_encoded'] == row['gender_encoded']) &
                              (data['encoded_icutypes'] == row['encoded_icutypes']) 
                              ]
        if len(matching_cases) >= 10:
            # Randomly select 10 negative cases from the matching cases
            matching_idx = np.random.choice(matching_cases.index, size=10, replace=False)

            # Set the matching group for the positive case and its matched negative cases
            data.loc[idx, matching_variable] = 1
            data.loc[matching_idx, matching_variable] = 1

    # Filter the data to only include the matched cases
    matched_data = data[data[matching_variable] == 1]

    # Fit the conditional logistic regression model
    model = sm.Logit(matched_data[outcome_variable], matched_data[treatment_columns + covariate_columns])
    results = model.fit()

    # Print the summary of the regression results
    print(results.summary())
    # Extract the coefficients and standard errors from the model results
    coef = results.params
    se = results.bse

    # Calculate the odds ratios and confidence intervals
    odds_ratios = np.exp(coef)
    conf_intervals = np.exp(coef - 1.96 * se), np.exp(coef + 1.96 * se)

    # Print the results
    print('Odds ratios:')
    print(odds_ratios)
    print('Confidence intervals:')
    print(conf_intervals)


## have_vte

In [6]:
# outcome and matching variable
outcome_variable = 'have_vte'
matching_variable = 'matching_variable'
con_log_reg(outcome_variable, matching_variable)

Optimization terminated successfully.
         Current function value: 0.277220
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               have_vte   No. Observations:                 2387
Model:                          Logit   Df Residuals:                     2374
Method:                           MLE   Df Model:                           12
Date:                Sun, 16 Apr 2023   Pseudo R-squ.:                 0.09000
Time:                        19:54:21   Log-Likelihood:                -661.72
converged:                       True   LL-Null:                       -727.17
Covariance Type:            nonrobust   LLR p-value:                 4.101e-22
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
encoded_input               -2.0900      0.168    -12.412      0.000      -2.420

## major_bleeding

In [7]:
outcome_variable1 = 'major_bleeding'
matching_variable1 = 'matching_variable1'
con_log_reg(outcome_variable1, matching_variable1)

Optimization terminated successfully.
         Current function value: 0.256548
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:         major_bleeding   No. Observations:                  143
Model:                          Logit   Df Residuals:                      130
Method:                           MLE   Df Model:                           12
Date:                Sun, 16 Apr 2023   Pseudo R-squ.:                  0.1579
Time:                        19:54:22   Log-Likelihood:                -36.686
converged:                       True   LL-Null:                       -43.563
Covariance Type:            nonrobust   LLR p-value:                    0.3167
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
encoded_input                0.6619      1.070      0.619      0.536      -1.435

## HIT

In [8]:
outcome_variable2 = 'HIT'
matching_variable2 = 'matching_variable2'
con_log_reg(outcome_variable2, matching_variable2)

Optimization terminated successfully.
         Current function value: 0.279023
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                    HIT   No. Observations:                   99
Model:                          Logit   Df Residuals:                       86
Method:                           MLE   Df Model:                           12
Date:                Sun, 16 Apr 2023   Pseudo R-squ.:                 0.08408
Time:                        19:54:23   Log-Likelihood:                -27.623
converged:                       True   LL-Null:                       -30.159
Covariance Type:            nonrobust   LLR p-value:                    0.9556
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
encoded_input               -1.0114      1.183     -0.855      0.393      -3.331