# Identification of variables associated with ICD-11 diagnosis vs. variables associated with ICD-10 diagnosis

In [None]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the Excel workbook to analyse.
# NOTE(review): absolute, user-specific path; it must be adapted on any other machine.
data_path = '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/geneva_stroke_incidence/data/PPGSS_3_ICD10vsICD11.xlsx'

In [None]:
# Load the workbook at data_path into a DataFrame (pandas defaults: first sheet, first row as header).
df = pd.read_excel(data_path)

In [None]:
# Quick look at the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Normalise the column names so they can be used as plain terms in the
# patsy / R formulas built further down: strip spaces and literal dots.
#
# regex=False is passed explicitly. With regex=True (the pandas default before
# 2.0) the pattern '.' matches every character, so every column name would be
# replaced by an empty string.
df.columns = (
    df.columns
    .str.replace(' ', '', regex=False)
    .str.replace('.', '', regex=False)
)

# Give the two columns whose names contain parentheses formula-friendly names.
df.rename(
    columns={
        'Prestrokedisability(Rankin)': 'PreStrokeDisability',
        'Age(calc)': 'Age',
    },
    inplace=True,
)

In [None]:
# Column names used as independent variables in the regression formulas below.
predictors = [
    "Age",
    "Sex",
    "PreStrokeDisability",
    "NIHonadmission",
    "MedHistHypertension",
    "MedHistDiabetes",
    "MedHistSmoking",
    # "MedHistAtrialFibr",
    "MedHistCHD",
    # etiology should probably not be used
    # (because known very colinear with medhistatrialfibr)
    "Etiology",
]


In [None]:
# Number of rows per level of the outcome column ICD10vsICD11.
df['ICD10vsICD11'].value_counts()

In [None]:
# Preview the formula string (outcome ~ predictor1 + predictor2 + ...) that is passed to patsy in the next cell.
f'ICD10vsICD11 ~ {" + ".join(predictors)}'

In [None]:
from patsy.highlevel import dmatrices

# Build the response (y) and design (X) matrices as DataFrames from the
# formula "ICD10vsICD11 ~ <all predictors joined by +>".
formula = 'ICD10vsICD11 ~ ' + ' + '.join(predictors)
y, X = dmatrices(formula, data=df, return_type='dataframe')

In [None]:
# Logistic regression: the 'ICD10vsICD11[ICD11]' indicator column of the
# response matrix is the dependent variable, the design matrix X (built from
# the predictors) supplies the independent variables.
model = sm.Logit(endog=y['ICD10vsICD11[ICD11]'], exog=X)
result = model.fit()

In [None]:
# Full statsmodels summary table for the fitted logistic regression.
print(result.summary())

In [None]:
# p-value of the last term in the fitted model.
# result.pvalues is indexed by term name, so the last entry has to be taken
# by position with .iloc; a bare integer key on a label-indexed Series is
# deprecated in pandas.
result.pvalues.iloc[-1]

In [None]:
# Column 0 of conf_int() after resetting the index to positions; this is the column stored as 'CI_low' in the coefficient table below.
result.conf_int().reset_index()[0]

In [None]:
# Collect, for every model term, the coefficient, its standard error
# (result.bse, stored under 'std'), the z statistic and the confidence
# interval bounds into one flat table with one row per term.
coefficients = pd.concat(
    [result.params, result.bse, result.tvalues, result.conf_int()],
    axis=1,
).reset_index()

coefficients.columns = ['Predictor', 'Coefficient', 'std', 'z', 'CI_low', 'CI_high']


In [None]:
# Display the coefficient table assembled in the previous cell.
coefficients

In [None]:
# coefficients.to_excel('/Users/jk1/Downloads/ICD11_predictors.xlsx')

In [None]:
# Relative frequency of each Etiology value within each ICD10vsICD11 group
# (normalize=True gives proportions that sum to 1 per group).
# NOTE(review): the previous comment mentioned MedHistAtrialFibr, but the code tabulates Etiology; confirm which column was intended.
df.groupby('ICD10vsICD11')['Etiology'].value_counts(normalize=True)

In [None]:
fig = plt.figure(figsize=(10, 10))

# Boxplot of NIHonadmission for the two ICD10vsICD11 groups. Outliers are
# hidden (showfliers=False); hue repeats the x variable so each group gets its
# own colour.
ax = sns.boxplot(x='ICD10vsICD11', y='NIHonadmission', data=df, showfliers=False, hue='ICD10vsICD11')

ax.set_ylabel('NIH on admission')
ax.set_xlabel('ICD10 vs ICD11')

# Significance bracket spanning both boxes, with the p-value label on top.
# NOTE(review): y_max and the p-value text are hand-entered, not computed from
# the data; update them if the data or the test changes.
x1, x2 = 0, 1
y_max = 19
# The bracket height is kept in `bar_y`, not `y`, so that the response matrix
# `y` produced by dmatrices() is not overwritten (re-running the Logit cell
# after this one would otherwise fail).
bar_y, h, col = y_max + y_max/50, y_max/50, 'k'
ax.plot([x1, x1, x2, x2], [bar_y, bar_y + h, bar_y + h, bar_y], lw=1.5, c=col)
ax.text((x1 + x2) * .5, bar_y + h, 'p < 0.0001', ha='center', va='bottom', color=col)


In [None]:
# fig.savefig('/Users/jk1/Downloads/ICD10vsICD11_NIH_on_admission.png')

In [None]:
# Summary statistics of NIHonadmission per ICD10vsICD11 group.
# describe() reports count, mean, std, min, the 25%/50%/75% quantiles
# (i.e. median and IQR bounds) and max.
df.groupby('ICD10vsICD11')['NIHonadmission'].describe()

In [None]:
# Boxplot of NIHonadmission for rows flagged ICD10 == 1 vs rows flagged
# ICD11 == 1. A row with both flags set is counted in both groups.
# NOTE(review): the original note called the second group "all patients";
# confirm that ICD11 == 1 holds for every patient.

fig = plt.figure(figsize=(10, 10))

# .copy() makes each subset an independent DataFrame, so adding the 'cat'
# column neither triggers SettingWithCopyWarning nor depends on whether the
# boolean filter returned a view or a copy of df.
ICD10_df = df[df['ICD10'] == 1].copy()
ICD10_df['cat'] = 'ICD10'
ICD11_df = df[df['ICD11'] == 1].copy()
ICD11_df['cat'] = 'ICD11'

# Stack both groups. reset_index keeps the original row labels in an 'index'
# column and gives the stacked frame unique row labels.
temp_df = pd.concat([ICD10_df, ICD11_df])
temp_df.reset_index(inplace=True)

ax = sns.boxplot(x='cat', y='NIHonadmission', data=temp_df, showfliers=False, hue='cat')

In [None]:
# fig.savefig('/Users/jk1/Downloads/ICD10vsICD11_NIH_on_admission.png')

In [None]:
# Summary statistics of NIHonadmission per 'cat' group of temp_df.
# describe() reports count, mean, std, min, the 25%/50%/75% quantiles
# (i.e. median and IQR bounds) and max.
temp_df.groupby('cat')['NIHonadmission'].describe()

### Test whole group vs. subgroup
Instead of comparing subgroup ICD10 vs. subgroup ICD11-ICD10.

In [None]:
import os
# R_HOME is set before pymer4 is imported; presumably pymer4's R bridge reads
# it to locate the R installation. TODO confirm.
# NOTE(review): machine-specific macOS framework path; this assignment
# overrides any R_HOME already present in the environment.
os.environ["R_HOME"] = "/Library/Frameworks/R.framework/Resources"
from pymer4.models import Lmer

# Mixed-effects model with a binomial family: ICD10 is the outcome, the
# predictors are fixed effects, and (1|ID) adds a random intercept per ID.
# Note: this rebinds `model`, which previously held the statsmodels Logit.
model = Lmer(f'ICD10  ~  {" + ".join(predictors)} + (1|ID)',
             data=df, family = 'binomial')

In [None]:
# Fit the mixed-effects model defined in the previous cell.
model.fit()

In [None]:
# Display the coefficient table of the fitted mixed-effects model.
model.coefs

In [None]:
# model.coefs.to_excel('/Users/jk1/Downloads/mixed_effects_ICD10_predictors.xlsx')

In [None]:
# Age distribution for the two 'cat' groups of temp_df, outliers hidden.

fig = plt.figure(figsize=(10, 10))

ax = sns.boxplot(
    data=temp_df,
    x='cat',
    y='Age',
    hue='cat',
    showfliers=False,
)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of Age per 'cat' group.
temp_df.groupby('cat')['Age'].describe()

In [None]:
# Relative frequency of each Etiology value within each 'cat' group (proportions sum to 1 per group).
temp_df.groupby('cat')['Etiology'].value_counts(normalize=True)