In [None]:
import pandas as pd
import numpy as np

# Locations of the three CSV inputs. df1 supplies the `id` / `community`
# columns; df2 and df3 are later joined on `patientunitstayid`.
path1 = 'G:/共病/数据/multimorbidity_net_nodes_with_community_labels.csv'
path2 = 'G:/eicu-crd/completed_data.csv'
path3 = 'G:/eicu-crd/multimorbidity.csv'

# Read all three files in one pass, in path order.
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (path1, path2, path3))

In [None]:
# Step 1: build a lookup from df1's `id` values (ICD-10 codes) to their
# `community` value.
icd_to_community = df1.set_index('id')['community'].to_dict()

# Step 2: map one row's comma-separated ICD-10 codes to its community labels.
def determine_community(multimorbidity, mapping=None):
    """Return the communities covered by a comma-separated ICD-10 code string.

    Parameters
    ----------
    multimorbidity : str
        Comma-separated codes; whitespace around each code is ignored.
        Any non-string input (for example NaN from an empty cell) yields ''.
    mapping : dict, optional
        Code -> community lookup. When omitted, the module-level
        ``icd_to_community`` dictionary is used.

    Returns
    -------
    str
        The distinct community labels, converted to strings, sorted as
        strings (so '10' sorts before '2') and joined with ', '.
        Empty string when no code is found in the lookup.
    """
    # A missing cell arrives as a float NaN, which has no .split().
    if not isinstance(multimorbidity, str):
        return ''
    if mapping is None:
        mapping = icd_to_community

    communities = set()
    for icd_code in multimorbidity.split(','):
        community = mapping.get(icd_code.strip())
        # Test for presence explicitly: a community labelled 0 is falsy and
        # would be discarded by a plain truthiness check.
        if community is not None:
            communities.add(str(community))
    return ', '.join(sorted(communities))

# Step 3: compute the community string for every row of df3 from its
# `multimorbidity_icd10` value.
df3['community'] = df3['multimorbidity_icd10'].map(determine_community)

In [None]:
# Left join: keep every df3 row and attach the matching df2 columns by stay id.
machine_learning_data = df3.merge(df2, how='left', on='patientunitstayid')

In [None]:
# Preview the first rows of the current table.
machine_learning_data.head()

In [None]:
# Count how many rows carry each distinct community string.
community_counts = machine_learning_data.loc[:, 'community'].value_counts()
community_counts

In [None]:
# Keep only rows whose community string is non-empty. The explicit copy makes
# the result an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning for writing to a filtered slice.
has_community = machine_learning_data['community'] != ''
machine_learning_data = machine_learning_data[has_community].copy()

# Recount the community strings after filtering.
community_counts = machine_learning_data['community'].value_counts()
community_counts

In [None]:
# Replace height and weight with BMI = weight / (height / 100) ** 2.
# NOTE(review): the /100 suggests height is in centimetres (and weight in
# kilograms) - confirm against the data dictionary.
machine_learning_data = (
    machine_learning_data
    .assign(BMI=lambda frame: frame['weight'] / ((frame['height'] / 100) ** 2))
    .drop(columns=['height', 'weight'])
)

In [None]:
# List the current column names.
machine_learning_data.columns

In [None]:
# Binning based on medical-expert cut-offs.
# Each entry is (source column, bin edges, bin labels). The source column is
# replaced by '<column>_category'. pd.cut uses its default right-closed
# intervals, so a value equal to an edge falls into the lower bin, and values
# outside the outermost edges become NaN.
_binning_rules = [
    ('age', [0, 18, 40, 60, 80, 90, np.inf],
     ['Children and Adolescents', 'Young Adults', 'Middle-aged', 'Senior', 'Elderly', 'Unknown']),
    ('BMI', [-np.inf, 18.5, 24.9, 29.9, 34.9, 39.9, np.inf],
     ['Underweight', 'Normal Weight', 'Overweight', 'Obesity Class I', 'Obesity Class II', 'Severe Obesity']),
    ('HLoS', [-np.inf, 3, 10, np.inf],
     ['Short Stay', 'Medium Stay', 'Long Stay']),
    ('ULoS', [-np.inf, 1, 3, np.inf],
     ['Short Stay', 'Medium Stay', 'Long Stay']),
    ('apachescore', [-np.inf, 39, 69, np.inf],
     ['Mild', 'Moderate', 'Severe']),
    ('SBP', [-np.inf, 90, 120, 140, np.inf],
     ['Low', 'Normal', 'Prehypertension', 'Hypertension']),
    ('DBP', [-np.inf, 60, 80, 90, np.inf],
     ['Low', 'Normal', 'Prehypertension', 'Hypertension']),
    ('MeanBP', [-np.inf, 65, 85, 100, np.inf],
     ['Low', 'Normal', 'Prehypertension', 'Hypertension']),
    ('sao2', [-np.inf, 95, 100],
     ['Mild Hypoxemia', 'Normal']),
    ('heartrate', [-np.inf, 60, 100, 156, np.inf],
     ['Bradycardia', 'Normal', 'Tachycardia', 'Extreme Tachycardia']),
    ('respiration', [-np.inf, 12, 20, 38, np.inf],
     ['Bradypnea', 'Normal', 'Tachypnea', 'Extreme Tachypnea']),
    ('gcsscore', [-np.inf, 7, 13, 15],
     ['Severe Coma', 'Moderate Coma', 'Alert']),
    ('Urine', [-np.inf, 500, 1477, np.inf],
     ['Low Output', 'Normal', 'High Output']),
    ('BUN', [-np.inf, 20, 83, np.inf],
     ['Normal', 'Elevated', 'Very High']),
    ('Hct', [-np.inf, 37, 50, np.inf],
     ['Low', 'Normal', 'High']),
]
for raw_column, bin_edges, bin_labels in _binning_rules:
    # Add the categorical column first, then remove the numeric source.
    machine_learning_data[raw_column + '_category'] = pd.cut(
        machine_learning_data[raw_column], bins=bin_edges, labels=bin_labels)
    machine_learning_data = machine_learning_data.drop(columns=[raw_column])
def hgb_category(row):
    """Label a row's `Hgb` value as 'Low', 'Normal' or 'High' by gender.

    Parameters
    ----------
    row : mapping
        Must provide 'gender' and 'Hgb'. Any gender other than 'Male' uses
        the second threshold set.

    Returns
    -------
    str or float
        'Low' for values <= 11, 'Normal' up to and including the
        gender-specific upper limit (16 for 'Male', otherwise 15), 'High'
        above it; NaN when the value is missing.

    Notes
    -----
    The lowest bin is open-ended, so very small readings are labelled 'Low'
    instead of becoming NaN. Plain comparisons are used because this runs
    once per row via DataFrame.apply.
    """
    normal_upper = 16 if row['gender'] == 'Male' else 15
    value = row['Hgb']
    if pd.isna(value):
        return np.nan
    # Upper edges are inclusive (right-closed intervals).
    if value <= 11:
        return 'Low'
    if value <= normal_upper:
        return 'Normal'
    return 'High'
# Gender-dependent haemoglobin label, computed row by row, replaces `Hgb`.
machine_learning_data['Hgb_category'] = machine_learning_data.apply(hgb_category, axis=1)
machine_learning_data = machine_learning_data.drop(columns=['Hgb'])

# The remaining columns share the same three labels. Each entry is
# (source column, bin edges); pd.cut's default right-closed intervals apply,
# and the source column is replaced by '<column>_category'.
_three_level_labels = ['Low', 'Normal', 'High']
_binning_rules = [
    ('MCH', [-np.inf, 27, 32, np.inf]),
    ('MCHC', [-np.inf, 32, 36, np.inf]),
    ('MCV', [-np.inf, 86, 98, np.inf]),
    ('MPV', [-np.inf, 8, 12, np.inf]),
]
for raw_column, bin_edges in _binning_rules:
    machine_learning_data[raw_column + '_category'] = pd.cut(
        machine_learning_data[raw_column], bins=bin_edges, labels=_three_level_labels)
    machine_learning_data = machine_learning_data.drop(columns=[raw_column])
def rbc_category(row):
    """Label a row's `RBC` value as 'Low', 'Normal' or 'High' by gender.

    Parameters
    ----------
    row : mapping
        Must provide 'gender' and 'RBC'. Any gender other than 'Male' uses
        the second threshold set.

    Returns
    -------
    str or float
        'Low' up to and including 3.1 ('Male') or 3.2 (otherwise), 'Normal'
        up to and including 5.4 ('Male') or 4.8 (otherwise), 'High' above
        that; NaN when the value is missing.

    Notes
    -----
    The lowest bin is open-ended, so very small readings are labelled 'Low'
    instead of becoming NaN. Plain comparisons are used because this runs
    once per row via DataFrame.apply.
    """
    if row['gender'] == 'Male':
        low_upper, normal_upper = 3.1, 5.4
    else:
        low_upper, normal_upper = 3.2, 4.8
    value = row['RBC']
    if pd.isna(value):
        return np.nan
    # Upper edges are inclusive (right-closed intervals).
    if value <= low_upper:
        return 'Low'
    if value <= normal_upper:
        return 'Normal'
    return 'High'
# Gender-dependent red-cell-count label, computed row by row, replaces `RBC`.
machine_learning_data['RBC_category'] = machine_learning_data.apply(rbc_category, axis=1)
machine_learning_data = machine_learning_data.drop(columns=['RBC'])

# Each entry is (source column, bin edges, bin labels); pd.cut's default
# right-closed intervals apply, and the source column is replaced by
# '<column>_category'.
_binning_rules = [
    ('RDW', [-np.inf, 13, np.inf], ['Normal', 'High']),
    ('WBC', [-np.inf, 4, 10, np.inf], ['Low', 'Normal', 'High']),
    ('AnionGap', [-np.inf, 7, 16, np.inf], ['Low', 'Normal', 'High']),
    ('BG', [-np.inf, 70, 140, np.inf], ['Low', 'Normal', 'High']),
    ('bicarbonate', [-np.inf, 22, 29, np.inf], ['Low', 'Normal', 'High']),
    ('calcium', [-np.inf, 8.5, 10.2, np.inf], ['Low', 'Normal', 'High']),
    ('chloride', [-np.inf, 96, 107, np.inf], ['Low', 'Normal', 'High']),
]
for raw_column, bin_edges, bin_labels in _binning_rules:
    machine_learning_data[raw_column + '_category'] = pd.cut(
        machine_learning_data[raw_column], bins=bin_edges, labels=bin_labels)
    machine_learning_data = machine_learning_data.drop(columns=[raw_column])
def creatinine_category(row):
    """Label a row's `creatinine` value as 'Low', 'Normal' or 'High' by gender.

    Parameters
    ----------
    row : mapping
        Must provide 'gender' and 'creatinine'. Any gender other than 'Male'
        uses the second threshold set.

    Returns
    -------
    str or float
        'Low' up to and including 0.9 ('Male') or 0.91 (otherwise), 'Normal'
        up to and including 1.3 ('Male') or 1.1 (otherwise), 'High' above
        that; NaN when the value is missing.

    Notes
    -----
    The lowest bin is open-ended, so very small readings are labelled 'Low'
    instead of becoming NaN. Plain comparisons are used because this runs
    once per row via DataFrame.apply.
    """
    if row['gender'] == 'Male':
        low_upper, normal_upper = 0.9, 1.3
    else:
        low_upper, normal_upper = 0.91, 1.1
    value = row['creatinine']
    if pd.isna(value):
        return np.nan
    # Upper edges are inclusive (right-closed intervals).
    if value <= low_upper:
        return 'Low'
    if value <= normal_upper:
        return 'Normal'
    return 'High'
# Gender-dependent creatinine label, computed row by row, replaces `creatinine`.
machine_learning_data['creatinine_category'] = machine_learning_data.apply(creatinine_category, axis=1)
machine_learning_data = machine_learning_data.drop(columns=['creatinine'])

# The remaining columns share the same three labels. Each entry is
# (source column, bin edges); pd.cut's default right-closed intervals apply,
# and the source column is replaced by '<column>_category'.
_three_level_labels = ['Low', 'Normal', 'High']
_binning_rules = [
    ('glucose', [-np.inf, 70, 140, np.inf]),
    ('platelets', [-np.inf, 150, 450, np.inf]),
    ('potassium', [-np.inf, 3.5, 5.2, np.inf]),
    ('sodium', [-np.inf, 135, 145, np.inf]),
    ('temperature', [-np.inf, 36.1, 37.2, np.inf]),
]
for raw_column, bin_edges in _binning_rules:
    machine_learning_data[raw_column + '_category'] = pd.cut(
        machine_learning_data[raw_column], bins=bin_edges, labels=_three_level_labels)
    machine_learning_data = machine_learning_data.drop(columns=[raw_column])

In [None]:
# Preview the first rows of the current table.
machine_learning_data.head()

In [None]:
# Count how many rows carry each distinct community string.
community_counts = machine_learning_data.loc[:, 'community'].value_counts()
community_counts

In [None]:
# Drop the two leading columns by position, then move the new first column to
# the end of the frame.
# NOTE(review): this is purely positional and removes two more columns every
# time the cell is re-run - presumably the dropped pair is the stay id and the
# raw ICD-10 string and the moved column is `community`; confirm.
trimmed = machine_learning_data.iloc[:, 2:]
first_column = trimmed.iloc[:, 0]
rotated_positions = list(range(1, trimmed.shape[1])) + [0]
machine_learning_data = trimmed.iloc[:, rotated_positions]
machine_learning_data.head()

In [None]:
# Write the table to CSV without the row index.
machine_learning_data.to_csv(
    path_or_buf='G:/共病/数据/machine_learning_data.csv',
    index=False,
)

In [1]:
import pandas as pd
import numpy as np
# Reload the table from CSV.
path4 = 'G:/共病/数据/machine_learning_data.csv'
# Force `community` to be read as text. Left to type inference, a column that
# mixes values such as '2' and '1, 2' can come back partly numeric, and a
# numeric 2 would no longer compare equal to the string '2'.
machine_learning_data = pd.read_csv(path4, dtype={'community': str})

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [3]:
# Binary target: 0 when the community value is exactly '2', 1 for everything
# else (including combined strings). The value is normalised with str() and
# strip() first so that an integer 2 or a padded ' 2' is still recognised.
# Note: running this on labels that are already 0/1 maps every row to 1.
machine_learning_data['community'] = machine_learning_data['community'].apply(
    lambda x: 0 if str(x).strip() == '2' else 1
)

In [4]:
# Separate the binary label from the predictor columns.
y = machine_learning_data['community']
X = machine_learning_data.drop(columns=['community'])

In [5]:
# One-hot encode every predictor column into a dense array.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X)
X_encoded = encoder.transform(X)

In [6]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Logistic regression: fit on the training split, predict the test split.
logreg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logreg_predictions = logreg_model.predict(X_test)

In [8]:
# Gaussian naive Bayes: fit on the training split, predict the test split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

In [9]:
# Random forest: fit on the training split, predict the test split.
# The forest is seeded (same seed as the train/test split) so that the fitted
# model and its reported scores are reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

In [10]:
# XGBoost with library defaults: fit on the training split, predict the test split.
xgb_model = XGBClassifier()
xgb_model = xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

In [11]:
# Search space for tuning XGBoost with a cross-validated grid search:
# 3 x 3 x 3 x 3 = 81 hyperparameter combinations.
parameters = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 0.9, 1],
)

In [12]:
# Exhaustive search over `parameters`, scored by ROC AUC with 3-fold CV.
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
)
grid_search.fit(X_train, y_train)

In [13]:
# Winning hyperparameter combination and the estimator fitted with it.
best_parameters, best_model = grid_search.best_params_, grid_search.best_estimator_

In [14]:
# Test-set AUROC for every fitted model, scored on the second column of
# predict_proba (the probability of class 1).
_fitted_models = {
    'Logistic Regression': logreg_model,
    'Naive Bayes': nb_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'XGBoost with GridSearchCV': best_model,
}

# Summary of results per model; only AUROC is recorded.
model_performance = {
    label: {'AUROC': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])}
    for label, clf in _fitted_models.items()
}

# Expose each score as its own variable as well.
logreg_roc_auc = model_performance['Logistic Regression']['AUROC']
nb_roc_auc = model_performance['Naive Bayes']['AUROC']
rf_roc_auc = model_performance['Random Forest']['AUROC']
xgb_roc_auc = model_performance['XGBoost']['AUROC']
best_xgb_roc_auc = model_performance['XGBoost with GridSearchCV']['AUROC']

In [15]:
# Display the AUROC summary together with the best grid-search hyperparameters.
model_performance, best_parameters

({'Logistic Regression': {'AUROC': 0.6621169180486716},
  'Naive Bayes': {'AUROC': 0.6283672574572617},
  'Random Forest': {'AUROC': 0.6547465567896291},
  'XGBoost': {'AUROC': 0.6621180337214898},
  'XGBoost with GridSearchCV': {'AUROC': 0.6762307719019475}},
 {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.9})

In [16]:
import shap

In [17]:
# SHAP values of the tuned XGBoost model on the training split.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_train)

# Mean absolute SHAP value per one-hot feature (averaged over rows).
shap_sum = np.abs(shap_values).mean(axis=0)

# Build the table column-wise so `shap_value` keeps a numeric dtype;
# transposing a two-row frame of names and numbers would make both columns
# generic objects.
importance_df = pd.DataFrame({
    'feature': encoder.get_feature_names_out(),
    'shap_value': shap_sum,
})
importance_df = importance_df.sort_values('shap_value', ascending=False)

# Names of the 15 features with the largest mean absolute SHAP value.
top_features = importance_df.head(15)['feature']



In [18]:
import matplotlib.pyplot as plt
# Draw the SHAP summary plot (top 15 features) on a fresh figure without
# showing it, save the current figure to disk at 600 dpi, then close it.
plt.figure()
encoded_feature_names = encoder.get_feature_names_out()
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=encoded_feature_names,
    max_display=15,
    show=False,
)
summary_figure = plt.gcf()
summary_figure.savefig('G:/共病/图片/fig8_shap_summary_plot.png', dpi=600, bbox_inches='tight')
plt.close(summary_figure)