In [None]:
import pandas as pd
import numpy as np

# Input tables (absolute local paths).
path1 = 'G:/共病/数据/multimorbidity_net_nodes_with_community_labels.csv'
path2 = 'G:/eicu-crd/completed_data.csv'
path3 = 'G:/eicu-crd/multimorbidity.csv'
# df1, df2 and df3 are read from path1, path2 and path3 respectively.
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in (path1, path2, path3))

In [None]:
# Step 1: Create a map of ICD-10 encoding to the community
# (keys come from df1's `id` column, values from its `community` column).
icd_to_community = df1.set_index('id')['community'].to_dict()

# Step 2: Define a function to identify each patientunitstayid's community
def determine_community(multimorbidity, mapping=None):
    """Return the communities touched by a comma-separated list of codes.

    Parameters
    ----------
    multimorbidity : str
        Codes separated by commas; surrounding whitespace is ignored.
        Anything that is not a string (for example the NaN pandas uses for an
        empty CSV cell) is treated as "no codes" instead of raising.
    mapping : dict, optional
        Code -> community lookup. Defaults to the notebook-level
        ``icd_to_community`` dictionary.

    Returns
    -------
    str
        The distinct community labels, converted to strings, sorted and
        joined with ``', '``. Empty string when no code is mapped.
    """
    if mapping is None:
        mapping = icd_to_community
    if not isinstance(multimorbidity, str):
        # Missing cell: report "no community" so the row can be filtered later.
        return ''
    communities = set()
    for icd_code in multimorbidity.split(','):
        community = mapping.get(icd_code.strip())
        # NOTE(review): this truthiness test also discards a community label
        # of 0 -- confirm labels never use 0 before relying on it.
        if community:
            communities.add(str(community))  # Converts the community number to a string
    # Combine the community into a string, separated by commas
    return ', '.join(sorted(communities))

# Step 3: Label every row of df3 with the communities of its ICD-10 codes
df3['community'] = df3['multimorbidity_icd10'].map(determine_community)

In [None]:
# Left join: start from df3 and attach df2's columns by patientunitstayid.
machine_learning_data = df3.merge(df2, on='patientunitstayid', how='left')

In [None]:
# Preview the first rows of the merged frame.
machine_learning_data.head()

In [None]:
# Count rows per community-label string (rows without a mapped code show up as '').
community_counts = machine_learning_data['community'].value_counts()
community_counts

In [None]:
# Keep only rows that received at least one community label.
has_community = machine_learning_data['community'] != ''
# .copy() detaches the result from the original frame: later cells add and
# drop columns on it, which would otherwise raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
machine_learning_data = machine_learning_data[has_community].copy()
community_counts = machine_learning_data['community'].value_counts()
community_counts

In [None]:
# Use BMI: replace the height and weight columns by weight / height^2.
# Height is divided by 100 before squaring (presumably cm -> m; confirm units).
height_scaled = machine_learning_data['height'] / 100
machine_learning_data = (
    machine_learning_data
    .assign(BMI=machine_learning_data['weight'] / height_scaled ** 2)
    .drop(columns=['height', 'weight'])
)

In [None]:
# Show the column labels currently in the frame.
machine_learning_data.columns

In [None]:
# Binning based on medical experts
# Each entry is (source column, bin edges, category labels). The binned result
# is stored as '<source column>_category' and the source column is removed.
stay_labels = ['Short Stay', 'Medium Stay', 'Long Stay']
pressure_labels = ['Low', 'Normal', 'Prehypertension', 'Hypertension']
clinical_bins = [
    ('age', [0, 18, 40, 60, 80, 90, np.inf],
     ['Children and Adolescents', 'Young Adults', 'Middle-aged', 'Senior', 'Elderly', 'Unknown']),
    ('BMI', [-np.inf, 18.5, 24.9, 29.9, 34.9, 39.9, np.inf],
     ['Underweight', 'Normal Weight', 'Overweight', 'Obesity Class I', 'Obesity Class II', 'Severe Obesity']),
    ('HLoS', [-np.inf, 3, 10, np.inf], stay_labels),
    ('ULoS', [-np.inf, 1, 3, np.inf], stay_labels),
    ('apachescore', [-np.inf, 39, 69, np.inf], ['Mild', 'Moderate', 'Severe']),
    ('SBP', [-np.inf, 90, 120, 140, np.inf], pressure_labels),
    ('DBP', [-np.inf, 60, 80, 90, np.inf], pressure_labels),
    ('MeanBP', [-np.inf, 65, 85, 100, np.inf], pressure_labels),
    ('sao2', [-np.inf, 95, 100], ['Mild Hypoxemia', 'Normal']),
    ('heartrate', [-np.inf, 60, 100, 156, np.inf],
     ['Bradycardia', 'Normal', 'Tachycardia', 'Extreme Tachycardia']),
    ('respiration', [-np.inf, 12, 20, 38, np.inf],
     ['Bradypnea', 'Normal', 'Tachypnea', 'Extreme Tachypnea']),
    ('gcsscore', [-np.inf, 7, 13, 15], ['Severe Coma', 'Moderate Coma', 'Alert']),
    ('Urine', [-np.inf, 500, 1477, np.inf], ['Low Output', 'Normal', 'High Output']),
    ('BUN', [-np.inf, 20, 83, np.inf], ['Normal', 'Elevated', 'Very High']),
    ('Hct', [-np.inf, 37, 50, np.inf], ['Low', 'Normal', 'High']),
]
for source_column, bin_edges, bin_labels in clinical_bins:
    machine_learning_data[source_column + '_category'] = pd.cut(
        machine_learning_data[source_column], bins=bin_edges, labels=bin_labels)
# Remove the raw measurements once all of their categories exist.
machine_learning_data = machine_learning_data.drop(
    columns=[source_column for source_column, _, _ in clinical_bins])
def hgb_category(row):
    """Classify one row's 'Hgb' value as 'Low', 'Normal' or 'High'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'gender' and 'Hgb'.

    Returns
    -------
    str or float
        'Low' for values up to 11, 'Normal' up to 16 when gender is 'Male'
        (15 for any other gender value), 'High' above that; NaN when 'Hgb'
        is missing.

    Values at or below 2 used to fall outside the lowest bin and came back as
    NaN; they are now 'Low', matching the open-ended lower bin used for the
    other features. Plain comparisons replace the per-row ``pd.cut`` call.
    """
    value = row['Hgb']
    if pd.isna(value):
        return np.nan
    normal_max = 16 if row['gender'] == 'Male' else 15
    # Upper edges are inclusive, as with pd.cut's default right-closed bins.
    if value <= 11:
        return 'Low'
    if value <= normal_max:
        return 'Normal'
    return 'High'
# Hgb uses gender-specific cut-offs, so it is categorised row by row.
machine_learning_data['Hgb_category'] = machine_learning_data.apply(hgb_category, axis=1)
# Remaining indices share the Low / Normal / High scheme; only the edges differ.
red_cell_index_edges = [
    ('MCH', [-np.inf, 27, 32, np.inf]),
    ('MCHC', [-np.inf, 32, 36, np.inf]),
    ('MCV', [-np.inf, 86, 98, np.inf]),
    ('MPV', [-np.inf, 8, 12, np.inf]),
]
for source_column, bin_edges in red_cell_index_edges:
    machine_learning_data[source_column + '_category'] = pd.cut(
        machine_learning_data[source_column], bins=bin_edges, labels=['Low', 'Normal', 'High'])
# Drop the raw columns now that their categories exist.
machine_learning_data = machine_learning_data.drop(
    columns=['Hgb'] + [source_column for source_column, _ in red_cell_index_edges])
def rbc_category(row):
    """Classify one row's 'RBC' value as 'Low', 'Normal' or 'High'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'gender' and 'RBC'.

    Returns
    -------
    str or float
        For gender 'Male': 'Low' up to 3.1, 'Normal' up to 5.4, else 'High'.
        For any other gender value: 'Low' up to 3.2, 'Normal' up to 4.8, else
        'High'. NaN when 'RBC' is missing.

    Values at or below 0.7 used to fall outside the lowest bin and came back
    as NaN; they are now 'Low', matching the open-ended lower bin used for the
    other features. Plain comparisons replace the per-row ``pd.cut`` call.
    """
    value = row['RBC']
    if pd.isna(value):
        return np.nan
    low_max, normal_max = (3.1, 5.4) if row['gender'] == 'Male' else (3.2, 4.8)
    # Upper edges are inclusive, as with pd.cut's default right-closed bins.
    if value <= low_max:
        return 'Low'
    if value <= normal_max:
        return 'Normal'
    return 'High'
# RBC uses gender-specific cut-offs, so it is categorised row by row.
machine_learning_data['RBC_category'] = machine_learning_data.apply(rbc_category, axis=1)
# (source column, bin edges, category labels) for the next group of lab values.
low_normal_high = ['Low', 'Normal', 'High']
blood_chemistry_bins = [
    ('RDW', [-np.inf, 13, np.inf], ['Normal', 'High']),
    ('WBC', [-np.inf, 4, 10, np.inf], low_normal_high),
    ('AnionGap', [-np.inf, 7, 16, np.inf], low_normal_high),
    ('BG', [-np.inf, 70, 140, np.inf], low_normal_high),
    ('bicarbonate', [-np.inf, 22, 29, np.inf], low_normal_high),
    ('calcium', [-np.inf, 8.5, 10.2, np.inf], low_normal_high),
    ('chloride', [-np.inf, 96, 107, np.inf], low_normal_high),
]
for source_column, bin_edges, bin_labels in blood_chemistry_bins:
    machine_learning_data[source_column + '_category'] = pd.cut(
        machine_learning_data[source_column], bins=bin_edges, labels=bin_labels)
# Drop the raw columns now that their categories exist.
machine_learning_data = machine_learning_data.drop(
    columns=['RBC'] + [source_column for source_column, _, _ in blood_chemistry_bins])
def creatinine_category(row):
    """Classify one row's 'creatinine' value as 'Low', 'Normal' or 'High'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'gender' and 'creatinine'.

    Returns
    -------
    str or float
        For gender 'Male': 'Low' up to 0.9, 'Normal' up to 1.3, else 'High'.
        For any other gender value: 'Low' up to 0.91, 'Normal' up to 1.1, else
        'High'. NaN when 'creatinine' is missing.

    Values at or below 0.08 used to fall outside the lowest bin and came back
    as NaN; they are now 'Low', matching the open-ended lower bin used for the
    other features. Plain comparisons replace the per-row ``pd.cut`` call.
    """
    value = row['creatinine']
    if pd.isna(value):
        return np.nan
    low_max, normal_max = (0.9, 1.3) if row['gender'] == 'Male' else (0.91, 1.1)
    # Upper edges are inclusive, as with pd.cut's default right-closed bins.
    if value <= low_max:
        return 'Low'
    if value <= normal_max:
        return 'Normal'
    return 'High'
# Creatinine uses gender-specific cut-offs, so it is categorised row by row.
machine_learning_data['creatinine_category'] = machine_learning_data.apply(creatinine_category, axis=1)
# The remaining measurements share the Low / Normal / High scheme; only the edges differ.
remaining_edges = [
    ('glucose', [-np.inf, 70, 140, np.inf]),
    ('platelets', [-np.inf, 150, 450, np.inf]),
    ('potassium', [-np.inf, 3.5, 5.2, np.inf]),
    ('sodium', [-np.inf, 135, 145, np.inf]),
    ('temperature', [-np.inf, 36.1, 37.2, np.inf]),
]
for source_column, bin_edges in remaining_edges:
    machine_learning_data[source_column + '_category'] = pd.cut(
        machine_learning_data[source_column], bins=bin_edges, labels=['Low', 'Normal', 'High'])
# Drop the raw columns now that their categories exist.
machine_learning_data = machine_learning_data.drop(
    columns=['creatinine'] + [source_column for source_column, _ in remaining_edges])

In [None]:
# Preview the frame after the numeric columns were replaced by categories.
machine_learning_data.head()

In [None]:
# Recount rows per community-label string on the current frame.
community_counts = machine_learning_data['community'].value_counts()
community_counts

In [None]:
# Purely positional reshuffle: discard the first two columns, then move the
# third column to the end so it follows every other column.
# NOTE(review): presumably the two dropped columns are identifiers and the
# moved one is 'community' -- confirm against the column listing.
column_count = machine_learning_data.shape[1]
reordered_positions = list(range(3, column_count)) + [2]
machine_learning_data = machine_learning_data.iloc[:, reordered_positions]
machine_learning_data.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Replace the comma-separated 'community' string of every row by one 0/1
# indicator column per distinct community.
mlb = MultiLabelBinarizer()
community_lists = machine_learning_data.pop('community').apply(lambda x: x.split(', '))
indicator_matrix = mlb.fit_transform(community_lists)
# Name the indicator columns when they are created (community1, community2, ...
# following the order of mlb.classes_) instead of renaming the last three
# columns of the joined frame afterwards: that rename would overwrite feature
# column names whenever the number of distinct communities is not exactly three.
indicator_columns = ['community' + str(position) for position in range(1, len(mlb.classes_) + 1)]
machine_learning_data = machine_learning_data.join(
    pd.DataFrame(indicator_matrix,
                 columns=indicator_columns,
                 index=machine_learning_data.index))

In [None]:
# Persist the prepared table; index=False keeps the row index out of the file.
machine_learning_data.to_csv('G:/共病/数据/machine_learning_data.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
# Reload the table written by the to_csv cell, so modelling can start from here.
path4 = 'G:/共病/数据/machine_learning_data.csv'
machine_learning_data = pd.read_csv(path4)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import f1_score, classification_report
from sklearn.neural_network import MLPClassifier

In [None]:
# Splitting the dataset into features and labels
label_columns = ['community1', 'community2', 'community3']
y = machine_learning_data[label_columns]  # The three community indicator columns are the labels
X = machine_learning_data.iloc[:, :-len(label_columns)]  # Every column before the trailing label columns

In [None]:
# One-hot encode every feature column into a dense array.
# NOTE(review): the encoder is fitted on all rows, i.e. before the train/test
# split that follows -- confirm this is intended.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X)
X_encoded = encoder.transform(X)

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
# Initializing models: the three base estimators that are wrapped in
# classifier chains below. All use random_state=42 for repeatable results.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
mlp = MLPClassifier(random_state=42, max_iter=300)

In [None]:
# Classifier Chains with a base estimator: one chain per model family.
# NOTE(review): with order='random' the fitted chain's estimators_ presumably
# follow order_ rather than the column order of y -- confirm before indexing
# estimators_ by label.
classifier_chain_dt = ClassifierChain(decision_tree, order='random', random_state=42)
classifier_chain_rf = ClassifierChain(random_forest, order='random', random_state=42)
classifier_chain_mlp = ClassifierChain(mlp, order='random', random_state=42)

In [None]:
# Training the models: fit each chain on the same training split.
for chain in (classifier_chain_dt, classifier_chain_rf, classifier_chain_mlp):
    chain.fit(X_train, y_train)

In [None]:
# Making predictions on the held-out rows, one result per chain.
y_pred_dt, y_pred_rf, y_pred_mlp = (
    chain.predict(X_test)
    for chain in (classifier_chain_dt, classifier_chain_rf, classifier_chain_mlp)
)

In [None]:
# Calculating micro-averaged F1 scores, one per chain.
f1_dt, f1_rf, f1_mlp = (
    f1_score(y_test, predictions, average='micro')
    for predictions in (y_pred_dt, y_pred_rf, y_pred_mlp)
)

In [None]:
def _rounded_report(y_true, y_pred):
    """Return the classification report as a dict with floats rounded to 4 places."""
    report = classification_report(
        y_true, y_pred,
        target_names=['Community 1', 'Community 2', 'Community 3'],
        output_dict=True)
    for metrics in report.values():
        for metric, value in metrics.items():
            if isinstance(value, float):
                metrics[metric] = round(value, 4)
    return report


print("Decision Tree F1 Scores by Label:")
report_dt = _rounded_report(y_test, y_pred_dt)
print(report_dt)

print("Random Forest F1 Scores by Label:")
report_rf = _rounded_report(y_test, y_pred_rf)
print(report_rf)

print("MLP F1 Scores by Label:")
report_mlp = _rounded_report(y_test, y_pred_mlp)
print(report_mlp)

In [None]:
import shap

In [None]:
label_index = 2  # Modify this index to analyze different tags (column position in y)
# The chain stores its fitted estimators in chain order (order_), and the
# chains here are built with order='random', so look up where this label sits
# in the chain instead of indexing estimators_ with the label index directly.
chain_position = list(classifier_chain_rf.order_).index(label_index)
rf_model = classifier_chain_rf.estimators_[chain_position]

In [None]:
# Take the first 100 test rows as the sample to explain.
encoded_feature_names = encoder.get_feature_names_out()
# Labelled view of the sample, with one column per encoded feature.
X_sample_df = pd.DataFrame(X_test[:100], columns=encoded_feature_names)
# Plain NumPy view of the same rows and columns for the explainer.
X_sample = X_sample_df.to_numpy()

In [None]:
# Create SHAP explainer with interventional feature perturbation
# NOTE(review): no background dataset is passed here; shap documents the
# interventional mode as relying on one -- confirm which mode is actually used.
explainer = shap.TreeExplainer(rf_model, feature_perturbation='interventional')

In [None]:
# Get SHAP values with additivity check disabled
# A chain member is trained on the encoded features plus one extra column for
# every label that precedes it in the chain, so X_sample (encoded features
# only) can be narrower than what rf_model expects. Widen it with the chain's
# own predictions for those earlier labels, as the chain does at predict time.
n_encoded_features = X_sample.shape[1]
n_chain_features = rf_model.n_features_in_ - n_encoded_features
if n_chain_features > 0:
    earlier_labels = list(classifier_chain_rf.order_)[:n_chain_features]
    chain_inputs = classifier_chain_rf.predict(X_sample)[:, earlier_labels]
    X_explain = np.hstack([X_sample, chain_inputs])
else:
    X_explain = X_sample
shap_values = explainer.shap_values(X_explain, check_additivity=False)
# Later cells pair the values with encoder.get_feature_names_out(), so keep
# only the encoded-feature columns (features are on axis 1 in every layout).
if isinstance(shap_values, list):
    shap_values = [values[:, :n_encoded_features] for values in shap_values]
else:
    shap_values = shap_values[:, :n_encoded_features]

In [None]:
# Collapse per-output SHAP values into one (samples, features) array by
# summing absolute values across outputs.
if isinstance(shap_values, list):
    # One array per output (the layout older shap releases return).
    shap_values = np.sum(np.abs(shap_values), axis=0)
elif getattr(shap_values, 'ndim', 2) == 3:
    # Single array with the outputs on the last axis (the layout newer shap
    # releases return); without this branch the importance table built later
    # would receive a 2-D column and fail.
    shap_values = np.abs(shap_values).sum(axis=-1)

In [None]:
# Feature importance = mean absolute SHAP value over the sampled rows
shap_sum = np.mean(np.abs(shap_values), axis=0)
# Both shapes must agree before they are combined into one table
encoder_feature_names = encoder.get_feature_names_out()
print("Shape of shap_sum:", shap_sum.shape)
print("Shape of feature names:", encoder_feature_names.shape)

In [None]:
# Feature-importance table, most influential feature first
importance_df = (
    pd.DataFrame({'feature': encoder.get_feature_names_out(), 'shap_value': shap_sum})
    .sort_values('shap_value', ascending=False)
)

In [None]:
# Show the 15 rows at the top of the sorted importance table
top_features = importance_df.iloc[:15]
print(top_features)