In [18]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from hcuppy.elixhauser import ElixhauserEngine
import json
import networkx as nx
import holoviews as hv
from holoviews import opts, dim
import scipy.stats as stats
from scipy.stats import powerlaw
from sklearn.mixture import GaussianMixture
from scipy.stats import norm
from collections import defaultdict

path = 'G:/eicu-crd'

# Identification of multimorbidity

## 与MIMIC不同，eICU数据集更适合我们研究共病，因为患者的诊断往往由多个ICD代码组成。除了诊断之外，apachePredVar表中还列出了几种常见的合并症。这些疾病包括艾滋病、肝功能衰竭、肝硬化、糖尿病、免疫抑制、白血病、淋巴瘤和转移性癌症。此代码将利用diagnosis表和apachePredVar表构建eICU的共病。并在这里进行初步的分析。

In [2]:
# Import diagnosis and apachePredVar table
completed_df = pd.read_csv(path + '/completed_data.csv')
diagnosis_cols = ['patientunitstayid', 'diagnosisstring', 'icd9code']
diagnosis_df = pd.read_csv(path + '/diagnosis.csv', usecols=diagnosis_cols)
apacheprevar_cols = ['patientunitstayid', 'aids', 'hepaticfailure', 'cirrhosis', 'diabetes', 
                     'immunosuppression', 'leukemia', 'lymphoma', 'metastaticcancer']
apachepredvar_df = pd.read_csv(path + '/apachePredVar.csv', usecols=apacheprevar_cols)

In [3]:
# Define common comorbidities that are included in apachepredvar_df
comorbidities_columns = ['aids', 'hepaticfailure', 'cirrhosis', 'diabetes', 'immunosuppression', 
                         'leukemia', 'lymphoma', 'metastaticcancer']
comorbidities_icd9_map = {
    'aids': '042',
    'hepaticfailure': '570.0',
    'cirrhosis': '571.2',
    'diabetes': '250.0',
    'immunosuppression': '279.9',
    'leukemia': '208.9',
    'lymphoma': '202.9',
    'metastaticcancer': '196.9'
}
# Merge two dataframes
multimorbidity = apachepredvar_df[['patientunitstayid'] + comorbidities_columns].merge(diagnosis_df, on='patientunitstayid', how='left')

tqdm.pandas(bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
# Extract multimorbidity patterns（string and icd9）
def extract_comorbidities(row):
    comorbidities = [col for col in comorbidities_columns if row[col] == 1]
    # Get comorbidities (string)
    comorbidities_string = '|'.join(comorbidities)
    # Get comorbidities (icd9)
    comorbidities_icd9 = ', '.join([comorbidities_icd9_map[col] for col in comorbidities])
    diagnosis_result = str(row['diagnosisstring']) if pd.notna(row['diagnosisstring']) else ''
    diagnosis_icd9_result = str(row['icd9code']) if pd.notna(row['icd9code']) else ''
    
    # Improved logic
    if diagnosis_result:
        final_string = diagnosis_result + ('|' + comorbidities_string if comorbidities_string else '')
    else:
        final_string = comorbidities_string

    if diagnosis_icd9_result:
        final_icd9 = diagnosis_icd9_result + (', ' + comorbidities_icd9 if comorbidities_icd9 else '')
    else:
        final_icd9 = comorbidities_icd9

    return pd.Series([final_string, final_icd9], index=['multimorbidity_patterns_string', 'multimorbidity_patterns_icd9'])

# Use progress_apply instead of apply, for progress bar functionality
multimorbidity[['multimorbidity_patterns_string', 'multimorbidity_patterns_icd9']] = multimorbidity.progress_apply(extract_comorbidities, axis=1)
## Delete rows that have only one diseased row line, a disease, which removes 'multimorbidity_patterns' column does not contain '|' line
# multimorbidity = multimorbidity[multimorbidity['multimorbidity_patterns'].str.contains('|', regex=False)]
multimorbidity = multimorbidity[['patientunitstayid', 'multimorbidity_patterns_string', 'multimorbidity_patterns_icd9']]

100%|████████████████████| 2535918/2535918 [03:22<00:00, 12527.65it/s]                                                 


In [4]:
multimorbidity.replace('', np.nan, inplace=True)
multimorbidity.dropna(subset=['multimorbidity_patterns_string', 'multimorbidity_patterns_icd9'], how='all', inplace=True)

In [5]:
# Group by patientunitstayid and aggregate by concatenating diagnosis results
aggregated_multimorbidity = multimorbidity.groupby('patientunitstayid').agg({
    'multimorbidity_patterns_string': lambda x: '|'.join(filter(None, x.astype(str))),
    'multimorbidity_patterns_icd9': lambda x: ', '.join(filter(None, x.astype(str)))
}).reset_index()

In [6]:
unique_diseases = set()
for patterns in aggregated_multimorbidity['multimorbidity_patterns_string']:
    diseases = patterns.split('|')
    for disease in diseases:
        unique_diseases.add(disease.strip())

## 统计所有疾病，通过LLM，gpt-4来标注其是否为慢性病以及对应的icd10code。prompt engineering

In [7]:
df_unique_diseases = pd.DataFrame(list(unique_diseases), columns=['Disease'])
# df_unique_diseases.to_excel(path + '_unique_diseases.xlsx', index=False)

In [8]:
dic_of_chronic_diseases = pd.read_excel('G:/eicu-crd_unique_diseases.xlsx')

In [9]:
# Filter out the data belonging to chronic diseases
chronic_diseases = dic_of_chronic_diseases[dic_of_chronic_diseases['ChronicDisease'] == 1]

# Create a mapping from disease names to their ICD-10 codes
disease_to_icd10 = dict(zip(chronic_diseases['Disease'], chronic_diseases['ICD-10 Code']))

def filter_and_map(diseases_str):
    diseases_list = diseases_str.split('|')
    filtered_diseases = [disease_to_icd10[disease] for disease in diseases_list if disease in disease_to_icd10]
    # Remove duplicate ICD-10 codes using set and then convert back to list
    unique_filtered_diseases = list(set(filtered_diseases))
    return ', '.join(unique_filtered_diseases)

aggregated_multimorbidity['multimorbidity_icd10'] = aggregated_multimorbidity['multimorbidity_patterns_string'].apply(filter_and_map)

# Remove the multimorbidity_patterns_icd9 column
aggregated_multimorbidity = aggregated_multimorbidity.drop(columns=['multimorbidity_patterns_icd9'])
aggregated_multimorbidity = aggregated_multimorbidity.drop(columns=['multimorbidity_patterns_string'])
aggregated_multimorbidity['multimorbidity_icd10'] = aggregated_multimorbidity['multimorbidity_icd10'].str.replace('ICD-10 varies, ', '').str.replace(', ICD-10 varies', '')
aggregated_multimorbidity = aggregated_multimorbidity[~aggregated_multimorbidity.applymap(lambda x: x == '' or pd.isna(x)).any(axis=1)]
aggregated_multimorbidity = aggregated_multimorbidity[aggregated_multimorbidity.applymap(lambda x: ',' in str(x)).any(axis=1)]
aggregated_multimorbidity.to_csv(path + '/multimorbidity.csv', index=False)

  aggregated_multimorbidity = aggregated_multimorbidity[~aggregated_multimorbidity.applymap(lambda x: x == '' or pd.isna(x)).any(axis=1)]
  aggregated_multimorbidity = aggregated_multimorbidity[aggregated_multimorbidity.applymap(lambda x: ',' in str(x)).any(axis=1)]


# Exploration of the distribution of multimorbidity

In [10]:
# Draw the curve of frequency distribution.
multimorbidity_pattern_counts = aggregated_multimorbidity['multimorbidity_icd10'].value_counts()
values = multimorbidity_pattern_counts.values
# Calculate the number of multimorbidity patterns for each occurrence
frequency_counts = np.bincount(values)
# Get the index of the nonzero element, which is the number of occurrences of each element
non_zero_indices = np.nonzero(frequency_counts)[0]

output_data = pd.DataFrame({
    'Frequency': non_zero_indices,
    'Number_of_Multimorbidity': frequency_counts[non_zero_indices]
})
# Output data points to a CSV file
output_path = 'G:/共病/数据/multimorbidity_distribution_data.csv'
output_data.to_csv(output_path, index=False)

### 频数分布曲线告诉了我们ICU中的共病分布是遵循幂律分布的，具体来说它有3个特点：（1）在ICU中非常常见的共病占少数；（2）大多数共病的出现频次相对较低；（3）两图中的长尾部分阐明了在ICU中有大量的不常见但仍然存在的共病。
### 对于ICU来说，这意味着：

+ 对于最常见的那部分共病，ICU可能需要针对性的、高效的预防和治疗策略，因为这些常见的可能对大多数病人都很相关。

+ 对于长尾部分的共病，即使它们相对少见，但因为种类繁多，总体上会影响到很多病人。这就需要ICU具备更广泛的专业知识和能力来处理这些相对罕见但种类繁多的情况。

### 这种分布也强调了在医学研究和治疗策略制定中，既要重视高频发生的情况，也不能忽略那些低频但多样的情况。

# 共病网络由R语言构建

### 首先构建疾病的共现矩阵，即基于同时出现的病例计数来构建。

In [27]:
# Step 1: Make a list of all unique diseases across all patients
all_diseases = set()
for multimorbidity_list in aggregated_multimorbidity['multimorbidity_icd10']:
    all_diseases.update(multimorbidity_list.split(', '))
# Convert the set to a list to have a consistent order
all_diseases = sorted(list(all_diseases))

In [28]:
# Step 2: Initialize a co-occurrence matrix with zeros
prevalence_matrix = pd.DataFrame(0.0, index=all_diseases, columns=all_diseases)

In [29]:
# Step 3: Create a dictionary to hold the number of occurrences of each unique multimorbidity
multimorbidity_occurrences = defaultdict(int)
for multimorbidity in aggregated_multimorbidity['multimorbidity_icd10']:
    # This creates a frozenset which is hashable and can be used as a dictionary key
    diseases_set = frozenset(multimorbidity.split(', '))
    multimorbidity_occurrences[diseases_set] += 1

In [30]:
# Step 4: Calculate the prevalence matrix
# Note: Since the prevalence matrix is symmetric, we only need to calculate the upper triangle.
for i, disease1 in tqdm(enumerate(prevalence_matrix.index), total=len(prevalence_matrix.index), bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}'):
    for j, disease2 in enumerate(prevalence_matrix.columns[i+1:], start=i+1):  # Start from i+1 to avoid self-pairing and to calculate only the upper triangle
        # Find all unique multimorbidities that contain both disease1 and disease2
        M_ij = {m for m in multimorbidity_occurrences.keys() if disease1 in m and disease2 in m}
        
        # Calculate the sum of occurrences for all unique multimorbidities in M_ij
        sum_occurrences = sum(multimorbidity_occurrences[m] for m in M_ij)
        
        # Calculate the prevalence for disease1, disease2 by dividing the sum of occurrences by the number of multimorbidities in M_ij
        prevalence = sum_occurrences / len(M_ij) if M_ij else 0  # Avoid division by zero
        
        # Update the prevalence matrix for both (disease1, disease2) and (disease2, disease1) since it's symmetric
        prevalence_matrix.at[disease1, disease2] = prevalence
        prevalence_matrix.at[disease2, disease1] = prevalence

100%|████████████████████| 321/321 [00:31<00:00, 10.05it/s]                                                            


In [31]:
# Step 5: Save the prevalence matrix to a CSV file
prevalence_matrix.to_csv('G:/共病/数据/prevalence_matrix.csv')

## 计算患者的共病严重程度，通过aggregated_multimorbidity表中的multimorbidity_icd10计算Elixhauser comorbidity score，Elixhauser comorbidity score是一个评估患者并发症严重程度的打分系统。正的得分越高可能意味着预后较差（负的得分越低意味着预后预后反而越好）。
### 使用hcuppy计算Elixhauser comorbidity score中的mortality得分。

In [34]:
ee = ElixhauserEngine()
def calculate_elixhauser_scores(icd10_string):
    # ICD codes were extracted, and ICD codes and descriptions were separated by ', '
    icd_codes = icd10_string.split(', ')
    # Eliminating empty strings
    icd_codes = [code for code in icd_codes if code]
    # Calculate the Elixhauser score
    result = ee.get_elixhauser(icd_codes)
    # readmission and mortality scores are returned
    return pd.Series([result['rdmsn_scr'], result['mrtlt_scr']])

# Create a dictionary to hold the Elixhauser mortality score for each unique multimorbidity
multimorbidity_severity = defaultdict(int)
for multimorbidity in aggregated_multimorbidity['multimorbidity_icd10']:
    diseases_set = frozenset(multimorbidity.split(', '))
    # Calculate the Elixhauser score for mortality
    icd_codes = list(diseases_set)
    result = ee.get_elixhauser(icd_codes)
    mortality_score = result['mrtlt_scr']
    # Store the mortality score for the multimorbidity
    multimorbidity_severity[diseases_set] = mortality_score
    
# Initialize death score matrices
severity_matrix = pd.DataFrame(
    np.zeros_like(prevalence_matrix, dtype=float),
    index=prevalence_matrix.index,
    columns=prevalence_matrix.columns
)

# Iterate over each element of the severity_matrix matrix
for i, disease1 in enumerate(tqdm(severity_matrix.index, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')):
    for j, disease2 in enumerate(severity_matrix.columns[i+1:], start=i+1):  # Calculate only upper triangle
        # Find all unique multimorbidities that contain both disease1 and disease2
        M_ij = {m for m in multimorbidity_severity.keys() if disease1 in m and disease2 in m}
        
        # Calculate the sum of Elixhauser mortality scores for all unique multimorbidities in M_ij
        sum_severity = sum(multimorbidity_severity[m] for m in M_ij)
        
        # Calculate the severity for disease1, disease2 by dividing the sum of scores by the number of multimorbidities in M_ij
        severity = sum_severity / len(M_ij) if M_ij else 0  # Use np.nan for pairs with no multimorbidities
        
        # Update the severity matrix
        severity_matrix.at[disease1, disease2] = severity
        severity_matrix.at[disease2, disease1] = severity  # Symmetric update

100%|████████████████████| 321/321 [00:31<00:00, 10.03it/s]                                                            


In [35]:
severity_matrix.to_csv('G:/共病/数据/severity_matrix.csv')

### 根据final_chronic_diseases_dict对前面统计的aggregated_multimorbidity['multimorbidity_icd10']进行统计，对于每个共病m，通过dic计算每例患者共病包含几类不同的疾病组，将疾病组的数量作为共病复杂性评分C。然后通过co_occurrence_matrix构建complexity_matrix。对于complexity_matrix中的元素(i, j)，计算包含两种疾病i和疾病j的共病的平均复杂度评分。

In [40]:
complexity_matrix= pd.DataFrame(
    np.zeros_like(co_occurrence_matrix, dtype=float),
    index=co_occurrence_matrix.index,
    columns=co_occurrence_matrix.columns
)

def calculate_complexity_scores(icd_codes, disease_categories):
    # Create a mapping from ICD-10 code to Category
    icd_to_category = disease_categories.set_index('ICD-10 Code')['Category'].to_dict()
    # Find the unique categories for the multimorbidity's diseases
    unique_categories = set(icd_to_category[icd] for icd in icd_codes if icd in icd_to_category)
    # The complexity score is the number of unique categories
    return len(unique_categories)

dic = pd.read_excel('G:/final_chronic_diseases_dict.xlsx')

# Calculate the complexity score for each multimorbidity
multimorbidity_complexity = {}
for multimorbidity in aggregated_multimorbidity['multimorbidity_icd10']:
    icd_codes = multimorbidity.split(', ')
    complexity_score = calculate_complexity_scores(icd_codes, dic)
    diseases_set = frozenset(icd_codes)
    multimorbidity_complexity[diseases_set] = complexity_score

# Iterate over each element of the severity_matrix matrix
for i, disease1 in enumerate(tqdm(complexity_matrix.index, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')):
    for j, disease2 in enumerate(complexity_matrix.columns[i+1:], start=i+1):  # Calculate only upper triangle
        # Find all unique multimorbidities that contain both disease1 and disease2
        M_ij = {m for m in multimorbidity_complexity.keys() if disease1 in m and disease2 in m}
        
        # Calculate the sum of Elixhauser mortality scores for all unique multimorbidities in M_ij
        sum_complexity = sum(multimorbidity_complexity[m] for m in M_ij)
        
        # Calculate the severity for disease1, disease2 by dividing the sum of scores by the number of multimorbidities in M_ij
        complexity = sum_complexity / len(M_ij) if M_ij else 0  # Use np.nan for pairs with no multimorbidities
        
        # Update the severity matrix
        complexity_matrix.at[disease1, disease2] = complexity
        complexity_matrix.at[disease2, disease1] = complexity  # Symmetric update

100%|████████████████████| 321/321 [00:31<00:00, 10.33it/s]                                                            


In [41]:
complexity_matrix.to_csv('G:/共病/数据/complexity_matrix.csv')

In [42]:
complexity_array = complexity_matrix.to_numpy()
severity_array = severity_matrix.to_numpy()
prevalence_array = prevalence_matrix.to_numpy()
multimorbidity_array = complexity_array * severity_array * prevalence_array
multimorbidity_matrix = pd.DataFrame(multimorbidity_array,
                              index=prevalence_matrix.index,
                              columns=prevalence_matrix.columns)
multimorbidity_matrix.to_csv('G:/共病/数据/multimorbidity_matrix.csv')

# 根据APs计算边的标签

### 首先对三个子矩阵的分布情况进行确认：如果数据呈正态分布，使用等距分箱法；如果数据呈现偏态分布，使用等频分箱法；如果数据呈长尾分布，使用基于分位数的分箱法。

In [47]:
severity_path = 'G:/共病/图片/supplementary_fig1_severity_distribution.png'
prevalence_path = 'G:/共病/图片/supplementary_fig1_prevalence_distribution.png'
complexity_path = 'G:/共病/图片/supplementary_fig1_complexity_distribution.png'

In [51]:
severity_flattened_data = severity_matrix.where(np.triu(np.ones(severity_matrix.shape), k=1).astype(bool)).stack().values
severity_flattened_data_nonzero = severity_flattened_data[severity_flattened_data != 0]
mu, std = stats.norm.fit(severity_flattened_data_nonzero)
plt.figure(figsize=(2, 2))
n, bins, patches = plt.hist(severity_flattened_data_nonzero, bins=30, color='#7362AC', edgecolor='#CECFE6', linewidth=0.25)


# Adding a 'best fit' line
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=1)
plt.title(title)
plt.xlabel('Value')
plt.ylabel('Frequency')

# Save the figure
plt.savefig(severity_path, dpi=600, bbox_inches='tight')
plt.close()

In [49]:
prevalence_flattened_data = prevalence_matrix.where(np.triu(np.ones(prevalence_matrix.shape), k=1).astype(bool)).stack().values
prevalence_flattened_data_nonzero = prevalence_flattened_data[prevalence_flattened_data != 0]
plt.figure(figsize=(2, 2))
plt.hist(prevalence_flattened_data_nonzero, bins=30, color='#E35508', edgecolor='#FCC38E', linewidth=0.25)
plt.title('Distribution of Prevalence Matrix Elements')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.savefig(prevalence_path, dpi=600, bbox_inches='tight')
plt.close()

In [50]:
complexity_flattened_data = complexity_matrix.where(np.triu(np.ones(complexity_matrix.shape), k=1).astype(bool)).stack().values
complexity_flattened_data_nonzero = complexity_flattened_data[complexity_flattened_data != 0]
plt.figure(figsize=(2, 2))
plt.hist(complexity_flattened_data_nonzero, bins=30, color='#2E9750', edgecolor='#B7E3B2', linewidth=0.25)
plt.title('Distribution of Complexity Matrix Elements')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.savefig(complexity_path, dpi=600, bbox_inches='tight')
plt.close()

In [60]:
# Create the bins
min_val = min(severity_flattened_data_nonzero)
max_val = max(severity_flattened_data_nonzero)
severity_bins = np.linspace(start=min_val, stop=max_val, num=11)
severity_binned = pd.cut(severity_flattened_data_nonzero, bins=severity_bins, include_lowest=True)
severity_binned.value_counts()

(-13.001, -7.0]      23
(-7.0, -1.0]        341
(-1.0, 5.0]        1275
(5.0, 11.0]        3077
(11.0, 17.0]       3464
(17.0, 23.0]       2003
(23.0, 29.0]        742
(29.0, 35.0]        173
(35.0, 41.0]         90
(41.0, 47.0]         13
Name: count, dtype: int64

In [61]:
prevalence_binned = pd.qcut(prevalence_flattened_data_nonzero, q=35, duplicates='drop')
prevalence_binned.value_counts()

(0.999, 1.016]    8521
(1.016, 1.041]     330
(1.041, 1.062]     349
(1.062, 1.083]     308
(1.083, 1.114]     325
(1.114, 1.155]     327
(1.155, 1.204]     327
(1.204, 1.297]     328
(1.297, 1.5]       395
(1.5, 6.5]         261
Name: count, dtype: int64

In [70]:
def equal_frequency_binning(data, num_bins):
    # Sort the data
    sorted_data = np.sort(data)
    
    # Calculate the number of data points per bin
    bin_size = len(sorted_data) // num_bins
    
    # Create bins
    bins = [sorted_data[i * bin_size: (i + 1) * bin_size] for i in range(num_bins)]
    
    # Handle the case where the number of data points is not perfectly divisible by num_bins
    # by including the remainder in the last bin
    remainder = len(sorted_data) % num_bins
    if remainder > 0:
        bins[-1] = np.concatenate((bins[-1], sorted_data[-remainder:]))
    
    return bins
bins = equal_frequency_binning(complexity_flattened_data_nonzero, 10)

# Output the bins
for i, b in enumerate(bins):
    print(f"Bin {i+1}: Range ({b[0]} - {b[-1]}), Count: {len(b)}")
    print(b)  # Print the actual data in each bin (optional)

Bin 1: Range (1.0 - 2.857142857142857), Count: 1147
[1.         1.         1.         ... 2.85714286 2.85714286 2.85714286]
Bin 2: Range (2.857142857142857 - 3.111111111111111), Count: 1147
[2.85714286 2.86013986 2.86075949 ... 3.11111111 3.11111111 3.11111111]
Bin 3: Range (3.1153846153846154 - 3.6666666666666665), Count: 1147
[3.11538462 3.11764706 3.11764706 ... 3.66666667 3.66666667 3.66666667]
Bin 4: Range (3.6666666666666665 - 4.0), Count: 1147
[3.66666667 3.66666667 3.66666667 ... 4.         4.         4.        ]
Bin 5: Range (4.0 - 4.111111111111111), Count: 1147
[4.         4.         4.         ... 4.11111111 4.11111111 4.11111111]
Bin 6: Range (4.111111111111111 - 4.5), Count: 1147
[4.11111111 4.11111111 4.11111111 ... 4.5        4.5        4.5       ]
Bin 7: Range (4.5 - 4.847222222222222), Count: 1147
[4.5        4.5        4.5        ... 4.84615385 4.84615385 4.84722222]
Bin 8: Range (4.848484848484849 - 5.0), Count: 1147
[4.84848485 4.85       4.85       ... 5.         

In [71]:
severity_intervals = pd.IntervalIndex.from_tuples([
    (-13.001, -7.0), (-7.0, -1.0), (-1.0, 5.0), (5.0, 11.0),
    (11.0, 17.0), (17.0, 23.0), (23.0, 29.0), (29.0, 35.0),
    (35.0, 41.0), (41.0, 47.0)
])

In [72]:
prevalence_intervals = pd.IntervalIndex.from_tuples([
    (0.999, 1.016), (1.016, 1.041), (1.041, 1.062), (1.062, 1.083),
    (1.083, 1.114), (1.114, 1.155), (1.155, 1.204), (1.204, 1.297),
    (1.297, 1.5), (1.5, 6.5)
])

In [73]:
complexity_intervals = pd.IntervalIndex.from_tuples([
    (0.99, 2.86), (2.86, 3.11), (3.11, 3.67), (3.67, 4.00),
    (4.00, 4.11), (4.11, 4.50), (4.50, 4.85), (4.85, 5.00), 
    (5.00, 5.78), (5.78, 9.00)
])

In [74]:
def assign_rating(value, intervals):
    for i, interval in enumerate(intervals, start=1):
        if interval.left <= value <= interval.right:
            return i
    return 0  # 如果值不在任何区间内
def label_matrix(base_matrix, intervals):
    labeled_matrix = pd.DataFrame(
        np.zeros_like(base_matrix, dtype=float),
        index=base_matrix.index,
        columns=base_matrix.columns,
    )
    for i in base_matrix.index:
        for j in base_matrix.columns:
            if base_matrix.loc[i, j] != 0:
                labeled_matrix.loc[i, j] = assign_rating(base_matrix.loc[i, j], intervals)
    return labeled_matrix

labeled_severity_matrix = label_matrix(severity_matrix, severity_intervals)
labeled_prevalence_matrix = label_matrix(prevalence_matrix, prevalence_intervals)
labeled_complexity_matrix = label_matrix(complexity_matrix, complexity_intervals)

In [75]:
def determine_ap(severity, prevalence, complexity):
    if severity >= 9:
        if prevalence >= 8 or complexity >= 7:
            return 'H'
        else:
            return 'M' if complexity == 1 else 'L'
    elif severity >= 7:
        if prevalence >= 8 or complexity >= 7:
            return 'H'
        else:
            return 'M' if complexity <= 4 else 'L'
    elif severity >= 4:
        if prevalence >= 8 or complexity >= 5:
            return 'H'
        elif complexity >= 2:
            return 'M'
        else:
            return 'L'
    elif severity >= 2:
        if prevalence >= 8 and complexity >= 7:
            return 'M'
        else:
            return 'L'
    else:
        return 'L'
def calculate_ap_matrix(severity_matrix, prevalence_matrix, complexity_matrix):
    ap_matrix = pd.DataFrame(
        np.empty(severity_matrix.shape, dtype=str),
        index=severity_matrix.index,
        columns=severity_matrix.columns
    )
    for i in severity_matrix.index:
        for j in severity_matrix.columns:
            severity_rating = severity_matrix.loc[i, j]
            prevalence_rating = prevalence_matrix.loc[i, j]
            complexity_rating = complexity_matrix.loc[i, j]
            ap_matrix.loc[i, j] = determine_ap(severity_rating, prevalence_rating, complexity_rating)
    return ap_matrix
ap_matrix = calculate_ap_matrix(labeled_severity_matrix, labeled_prevalence_matrix, labeled_complexity_matrix)

In [76]:
ap_matrix.to_csv('G:/共病/数据/ap_matrix.csv')