In [1]:
from pymongo import MongoClient
import pandas as pd

In [23]:
client = MongoClient('mongodb://localhost:27017/')
db = client['recommender_system']
admissions = db['admissions']
patients = db['patients']
diagnoses_icd = db['diagnoses_icd']
procedures_icd = db['procedures_icd']
d_icd_diagnoses = db['d_icd_diagnoses']
d_icd_procedures = db['d_icd_procedures']
nies = db['nies'] 

In [11]:
df_admissions = pd.DataFrame(list(admissions.find()))
df_patients = pd.DataFrame(list(patients.find()))
df_diagnoses_icd = pd.DataFrame(list(diagnoses_icd.find()))
df_procedures_icd = pd.DataFrame(list(procedures_icd.find()))
df_d_icd_diagnoses = pd.DataFrame(list(d_icd_diagnoses.find()))
df_d_icd_procedures = pd.DataFrame(list(d_icd_procedures.find()))

In [27]:
df_nies = pd.DataFrame(list(nies.find()))
print('nies:', nies.count_documents({})) 
print(df_nies.head())

nies: 4776
                        _id  agegroup  Gender  sex  disability  ethnicgrp  \
0  6855811336e4026fa6fae289         3       1    1           0        0.0   
1  6855811336e4026fa6fae28a         2       2    2           0        0.0   
2  6855811336e4026fa6fae28b         1       2    2           0        0.0   
3  6855811336e4026fa6fae28c         2       1    1           1        0.0   
4  6855811336e4026fa6fae28d         3       1    1           1        0.0   

   HealthRegion  Source_of_Admission_Category  insure  medcard  ...  xadmtype  \
0             1                             1     1.0      0.0  ...         1   
1             1                             2     0.0      1.0  ...         1   
2             1                             1     1.0      1.0  ...         2   
3             1                             1     0.0      0.0  ...         1   
4             1                             1     0.0      1.0  ...         2   

   AdmTypeBinary  ScoreOVERALL  satisfa

In [15]:
print('admissions:', admissions.count_documents({}))
print(df_adm.head())
print('patients:', patients.count_documents({}))
print(df_patients.head())
print('diagnoses_icd:', diagnoses_icd.count_documents({}))
print(df_diagnoses_icd.head())  
print('procedures_icd:', procedures_icd.count_documents({}))
print(df_procedures_icd.head())
print('d_icd_diagnoses:', d_icd_diagnoses.count_documents({}))
print(df_d_icd_diagnoses.head())
print('d_icd_procedures:', d_icd_procedures.count_documents({}))    
print(df_d_icd_procedures.head())

admissions: 283
                        _id  subject_id   hadm_id            admittime  \
0  67c4cb3a44a7421d696860bf    10046543  21402025  2155-03-15 21:22:00   
1  67c4cb3a44a7421d6968776b    10188106  28288574  2155-03-26 23:55:00   
2  67c4cb3a44a7421d6968776c    10188106  29174671  2155-03-22 17:26:00   
3  67c4cb3a44a7421d69687aa1    10207914  21364683  2155-03-22 05:06:00   
4  67c4cb3a44a7421d696880ba    10247468  21915776  2155-03-13 04:10:00   

             dischtime  admission_type                  admission_location  \
0  2155-03-20 17:00:00  EU OBSERVATION               WALK-IN/SELF REFERRAL   
1  2155-05-28 19:30:00          URGENT  INTERNAL TRANSFER TO OR FROM PSYCH   
2  2155-03-26 23:55:00        EW EMER.                      EMERGENCY ROOM   
3  2155-03-28 18:00:00        EW EMER.                      EMERGENCY ROOM   
4  2155-03-17 15:25:00        EW EMER.                      EMERGENCY ROOM   

  insurance language marital_status   race            edregtime  \
0  

Logic:
1. map addmission.hadm_id with patients to get demographics 
2. map addmission.hadm_id with diagnoses_icd.hadm_id get list of diagnoses
3. classify major category of diagnoses based on below list 

'Q45_QO1' - 'Tumour or cancer'
'Q45_QO2' - 'Heart condition'
 'Q45_QO3' - 'Lung condition'
 'Q45_QO4' - 'Neurological condition'
 'Q45_QO5' - 'Orthopaedic condition'
 'Q45_QO6' - 'COVID 19'
 'Q45_QO7' - 'Infection (other than COVID 19)'
 'Q45_QO8' - 'Digestive system condition'
 'Q45_QO9' - 'Diabetes and related problems'
 'Q45_QO10'- 'Adverse reaction or poising'
 'Q45_QO11' - 'Injury and or accident'
 'Q45_QO12'- 'Mental health issue'
 'Q45_QO13' - 'Tests and or investigations'
 'Q45_QO14' - 'Dont know or wasnt told'
 'Q45_QO15' - 'Other'

4. map satisfaction score for each patient by random sampling after applying filter with [admission_type, gender, insurance, age, icd_codes] 
5. get list of procedues for each patient
6. construct matrix
    1. rows: patient
    2. columns: procedures
    3. satisfaction index

In [290]:
merged_data = pd.merge(df_admissions, df_patients, on='subject_id', how='left')

In [291]:
diagnoses_merged = pd.merge(merged_data, df_diagnoses_icd, on=['hadm_id', 'subject_id'], how='left')

In [292]:
diagnoses_merged['icd_code'] = diagnoses_merged['icd_code'].fillna('Unknown')

In [293]:
icd_to_category = {
    '250': 'Q45_QO9',  # Diabetes and related problems
    '390-459': 'Q45_QO2',  # Heart condition
    '460-519': 'Q45_QO3',  # Lung condition
    '320-389': 'Q45_QO4',  # Neurological condition
    '710-739': 'Q45_QO5',  # Orthopaedic condition
    'U07.1': 'Q45_QO6',  # COVID-19 (ICD-10 example, not in sample)
    '001-139': 'Q45_QO7',  # Infection (other than COVID-19)
    '520-579': 'Q45_QO8',  # Digestive system condition
    '140-239': 'Q45_QO1',  # Tumour or cancer
    '960-989': 'Q45_QO10',  # Adverse reaction or poisoning
    '800-999': 'Q45_QO11',  # Injury and or accident
    '290-319': 'Q45_QO12',  # Mental health issue
    'V01-V91': 'Q45_QO13',  # Tests and or investigations
    'Unknown': 'Q45_QO14'  # Dont know or wasnt told
}
category_labels = {
    'Q45_QO1': 'Tumour or cancer',
    'Q45_QO2': 'Heart condition',
    'Q45_QO3': 'Lung condition',
    'Q45_QO4': 'Neurological condition',
    'Q45_QO5': 'Orthopaedic condition',
    'Q45_QO6': 'COVID 19',
    'Q45_QO7': 'Infection (other than COVID 19)',
    'Q45_QO8': 'Digestive system condition',
    'Q45_QO9': 'Diabetes and related problems',
    'Q45_QO10': 'Adverse reaction or poising',
    'Q45_QO11': 'Injury and or accident',
    'Q45_QO12': 'Mental health issue',
    'Q45_QO13': 'Tests and or investigations',
    'Q45_QO14': 'Dont know or wasnt told',
    'Q45_QO15': 'Other'
}

In [294]:
def classify_diagnosis(icd_code):
    for code_range, category in icd_to_category.items():
        if '-' in code_range:
            start, end = code_range.split('-')
            try:
                if start <= str(icd_code) <= end:
                    return category_labels[category]
            except:
                continue
        elif code_range == str(icd_code):
            return category_labels[category]
    return category_labels['Q45_QO15']  # Default to 'Other'


In [295]:
diagnoses_merged['condition_label'] = diagnoses_merged['icd_code'].apply(classify_diagnosis)

In [296]:
diagnoses_grouped = diagnoses_merged.groupby(['hadm_id', 'subject_id'])['condition_label'].apply(lambda x: ', '.join(set(x))).reset_index()

In [297]:
diagnoses_grouped

Unnamed: 0,hadm_id,subject_id,condition_label
0,20037816,12547294,"Digestive system condition, Lung condition, Other"
1,20067171,13584937,"Heart condition, Other"
2,20120850,15668092,"Heart condition, Other"
3,20165734,18879099,"Neurological condition, Mental health issue, H..."
4,20187769,11750274,"Neurological condition, Tumour or cancer, Hear..."
...,...,...,...
278,29866935,15497616,"Neurological condition, Other, Tests and or in..."
279,29880300,18845673,"Digestive system condition, Lung condition, Ot..."
280,29880455,14677148,"Neurological condition, Heart condition, Other"
281,29953111,17079680,"Injury and or accident, Other"


In [298]:
# Merge back with demographic data
patient_data = pd.merge(merged_data, diagnoses_grouped, on=['hadm_id', 'subject_id'], how='left')

In [299]:
patient_data.head()

Unnamed: 0,_id_x,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,insurance,language,marital_status,...,hospital_expire_flag,discharge_location,deathtime,_id_y,gender,anchor_age,anchor_year,anchor_year_group,dod,condition_label
0,67c4cb3a44a7421d696860bf,10046543,21402025,2155-03-15 21:22:00,2155-03-20 17:00:00,EU OBSERVATION,WALK-IN/SELF REFERRAL,Medicare,ENGLISH,SINGLE,...,0,,,6853ae6587fd170a1dfebea0,F,91,2155,2017 - 2019,NaT,Other
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,0,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,"Digestive system condition, Neurological condi..."
2,67c4cb3a44a7421d6968776c,10188106,29174671,2155-03-22 17:26:00,2155-03-26 23:55:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,0,PSYCH FACILITY,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,"Digestive system condition, Neurological condi..."
3,67c4cb3a44a7421d69687aa1,10207914,21364683,2155-03-22 05:06:00,2155-03-28 18:00:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,0,HOME,,6853ae6587fd170a1dfebea2,M,22,2155,2008 - 2010,NaT,"Injury and or accident, Mental health issue, O..."
4,67c4cb3a44a7421d696880ba,10247468,21915776,2155-03-13 04:10:00,2155-03-17 15:25:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,0,HOME,,6853ae6587fd170a1dfebea3,M,23,2155,2008 - 2010,NaT,"Digestive system condition, Neurological condi..."


In [300]:
df_nies.head()

Unnamed: 0,_id,agegroup,Gender,sex,disability,ethnicgrp,HealthRegion,Source_of_Admission_Category,insure,medcard,...,AdmTypeBinary,ScoreOVERALL,satisfaction_score,condition_label,icd10_from,icd10_to,icd9_from,icd9_to,WaitTimeCats,gender
0,6855811336e4026fa6fae289,3,1,1,0,0.0,1,1,1.0,0.0,...,0.0,3.0,0.395,Diabetes and related problems,E08,E13,250.0,250.0,,M
1,6855811336e4026fa6fae28a,2,2,2,0,0.0,1,2,0.0,1.0,...,0.0,10.0,0.991667,Orthopaedic condition,M00,M99,710.0,739.0,,F
2,6855811336e4026fa6fae28b,1,2,2,0,0.0,1,1,1.0,1.0,...,1.0,9.0,0.935,Other,,,,,,F
3,6855811336e4026fa6fae28c,2,1,1,1,0.0,1,1,0.0,0.0,...,0.0,10.0,0.991667,Heart condition,I00,I99,390.0,459.0,1.0,M
4,6855811336e4026fa6fae28d,3,1,1,1,0.0,1,1,0.0,1.0,...,1.0,10.0,0.983333,Tumour or cancer,C00,D49,140.0,239.0,,M


In [301]:
df_nies['gender'] = df_nies['Gender'].map({1: 'M', 2: 'F'})
admission_type_map = {
    'EU OBSERVATION': 0.0,
    'URGENT': 0.0,
    'EW EMER.': 1.0
}
patient_data['AdmTypeBinary'] = patient_data['admission_type'].map(admission_type_map).fillna(0.0)

# Calculate average satisfaction_score by admission_type, gender, condition_label in df_nies
nies_avg_scores = df_nies.groupby(['AdmTypeBinary', 'gender', 'condition_label'])['satisfaction_score'].mean().reset_index()

In [None]:


patient_data = patient_data.assign(condition_label=patient_data['condition_label'].str.split(', ')).explode('condition_label')

In [289]:
patient_data

Unnamed: 0,_id_x,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,insurance,language,marital_status,...,discharge_location,deathtime,_id_y,gender,anchor_age,anchor_year,anchor_year_group,dod,condition_label,AdmTypeBinary
0,67c4cb3a44a7421d696860bf,10046543,21402025,2155-03-15 21:22:00,2155-03-20 17:00:00,EU OBSERVATION,WALK-IN/SELF REFERRAL,Medicare,ENGLISH,SINGLE,...,,,6853ae6587fd170a1dfebea0,F,91,2155,2017 - 2019,NaT,Other,0.0
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Digestive system condition,0.0
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Neurological condition,0.0
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Lung condition,0.0
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Heart condition,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,67c4cb4544a7421d696edb27,19895627,24253199,2155-03-12 08:00:00,2155-03-21 17:15:00,ELECTIVE,PHYSICIAN REFERRAL,Other,?,SINGLE,...,REHAB,,6853ae6587fd170a1dfebfa4,F,53,2154,2011 - 2013,NaT,Lung condition,0.0
282,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,HOME,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Lung condition,0.0
282,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,HOME,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Tests and or investigations,0.0
282,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,HOME,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Heart condition,0.0


In [272]:
# Merge with nies_avg_scores
patient_data = pd.merge(
    patient_data,
    nies_avg_scores,
    left_on=['AdmTypeBinary', 'gender', 'condition_label'],
    right_on=['AdmTypeBinary', 'gender', 'condition_label'],
    how='left'
)

In [197]:
# Aggregate satisfaction_score per hadm_id (average across conditions if multiple)
patient_data = patient_data.groupby(['hadm_id', 'subject_id', 'admission_type', 'insurance', 'gender', 'anchor_age', 'AdmTypeBinary'])['satisfaction_score'].mean().reset_index()

In [273]:
patient_data

Unnamed: 0,_id_x,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,insurance,language,marital_status,...,deathtime,_id_y,gender,anchor_age,anchor_year,anchor_year_group,dod,condition_label,AdmTypeBinary,satisfaction_score
0,67c4cb3a44a7421d696860bf,10046543,21402025,2155-03-15 21:22:00,2155-03-20 17:00:00,EU OBSERVATION,WALK-IN/SELF REFERRAL,Medicare,ENGLISH,SINGLE,...,,6853ae6587fd170a1dfebea0,F,91,2155,2017 - 2019,NaT,Other,0.0,0.744600
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Digestive system condition,0.0,0.743242
2,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Neurological condition,0.0,0.791841
3,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Lung condition,0.0,0.830406
4,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,Heart condition,0.0,0.848871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,67c4cb4544a7421d696edb27,19895627,24253199,2155-03-12 08:00:00,2155-03-21 17:15:00,ELECTIVE,PHYSICIAN REFERRAL,Other,?,SINGLE,...,,6853ae6587fd170a1dfebfa4,F,53,2154,2011 - 2013,NaT,Lung condition,0.0,0.804887
871,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Lung condition,0.0,0.804887
872,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Tests and or investigations,0.0,
873,67c4cb4544a7421d696ee300,19936109,24673093,2155-03-14 18:39:00,2155-03-18 15:32:00,DIRECT EMER.,PHYSICIAN REFERRAL,Medicare,ENGLISH,MARRIED,...,,6853ae6587fd170a1dfebfa5,F,80,2154,2008 - 2010,2155-11-05,Heart condition,0.0,0.806162


In [200]:
# Fill missing satisfaction scores with 0
patient_data['satisfaction_score'] = patient_data['satisfaction_score'].fillna(0)

In [201]:
patient_data

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score
0,20037816,12547294,EW EMER.,Medicare,F,22,1.0,0.853095
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236
2,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080
3,20165734,18879099,EU OBSERVATION,Medicaid,M,48,0.0,0.817131
4,20187769,11750274,EU OBSERVATION,Medicare,F,76,0.0,0.786521
...,...,...,...,...,...,...,...,...
278,29866935,15497616,EW EMER.,Medicare,M,71,1.0,0.868635
279,29880300,18845673,EW EMER.,Medicaid,M,42,1.0,0.858438
280,29880455,14677148,EU OBSERVATION,Other,F,59,0.0,0.768171
281,29953111,17079680,EW EMER.,Other,F,46,1.0,0.850469


In [202]:
procedures_merged = pd.merge(patient_data, df_procedures_icd, on=['hadm_id', 'subject_id'], how='left')

In [75]:
procedures_merged['icd_code_proc'] = procedures_merged['icd_code'].fillna('No_Procedure')

In [203]:
procedures_merged = procedures_merged.dropna(subset=['icd_code'])

In [204]:
procedures_merged.head()

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score,_id,seq_num,chartdate,icd_code,icd_version
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637368,1.0,2155-03-17,3734,9.0
2,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637369,2.0,2155-03-17,3727,9.0
3,20120850,15668092,EW EMER.,Other,M,71,1.0,0.89408,684f69e968f898b125637183,1.0,2155-03-13,66,9.0
4,20120850,15668092,EW EMER.,Other,M,71,1.0,0.89408,684f69e968f898b125637184,2.0,2155-03-13,3607,9.0
5,20120850,15668092,EW EMER.,Other,M,71,1.0,0.89408,684f69e968f898b125637185,3.0,2155-03-13,45,9.0


In [205]:
# Aggregate procedures per patient/hadm_id

procedures_grouped = procedures_merged.groupby(['hadm_id', 'subject_id'])['icd_code'].apply(lambda x: ', '.join(set(map(str, x)))).reset_index()

# procedures_grouped = procedures_merged.groupby(['hadm_id', 'subject_id'])['icd_code'].apply(lambda x: ', '.join(set(x))).reset_index()

In [206]:
procedures_grouped

Unnamed: 0,hadm_id,subject_id,icd_code
0,20067171,13584937,"3727, 3734"
1,20120850,15668092,"45, 66, 3722, 3607, 8856, 41"
2,20276010,14969719,159
3,20296910,14629406,"6529, 741"
4,20329019,11069955,"4513, 3995, 5491, 3891, 4233, 3895, 9390, 3893"
...,...,...,...
158,29649567,11992390,5A1935Z
159,29686645,12701907,"0QPGX5Z, 0QSJ04Z, 0QSG04Z, 0QPJX5Z"
160,29866935,15497616,8051
161,29953111,17079680,4496


# Matrix Construction

In [89]:
all_procedures = set()
for codes in procedures_merged['icd_code'].dropna():
    for code in str(codes).split(', '):
        all_procedures.add(code)

In [92]:
matrix = pd.pivot_table(
    procedures_merged,
    values='satisfaction_score',
    index=['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance'],
    columns='icd_code',
    aggfunc='mean',
    fill_value=0
)

In [93]:
matrix = matrix.reset_index()

In [97]:

# Save matrix to CSV
matrix.to_csv('patient_procedure_matrix.csv', index=False)

# Display the matrix
print(matrix)

# Save processed patient data
patient_data.to_csv('processed_patient_data.csv', index=False)

icd_code   hadm_id  subject_id gender  anchor_age  \
0         20067171    13584937      M          70   
1         20120850    15668092      M          71   
2         20276010    14969719      F          59   
3         20296910    14629406      F          36   
4         20329019    11069955      F          70   
..             ...         ...    ...         ...   
158       29649567    11992390      F          29   
159       29686645    12701907      M          20   
160       29866935    15497616      M          71   
161       29953111    17079680      F          46   
162       29997422    15060292      M          32   

icd_code               admission_type insurance   14   17   34   40  ...  \
0         SURGICAL SAME DAY ADMISSION  Medicare  0.0  0.0  0.0  0.0  ...   
1                            EW EMER.     Other  0.0  0.0  0.0  0.0  ...   
2                            EW EMER.     Other  0.0  0.0  0.0  0.0  ...   
3                              URGENT  Medicaid  0.0  0.0  

In [98]:
matrix_collaborative = pd.pivot_table(
    procedures_merged,
    values='satisfaction_score',
    index=['hadm_id', 'subject_id'],
    columns='icd_code',
    aggfunc='mean',
    fill_value=0
)

# Reset index for clarity
matrix_collaborative = matrix_collaborative.reset_index()


In [99]:
matrix_collaborative

icd_code,hadm_id,subject_id,14,17,34,40,41,45,46,59,...,8E0W4CZ,B2011ZZ,B2101ZZ,B2111ZZ,B211YZZ,B2131ZZ,B214YZZ,B2181ZZ,B218YZZ,B410YZZ
0,20067171,13584937,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20120850,15668092,0.0,0.0,0.0,0.0,0.89408,0.89408,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20276010,14969719,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20296910,14629406,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20329019,11069955,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,29649567,11992390,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,29686645,12701907,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,29866935,15497616,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,29953111,17079680,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
patient_features = patient_data[['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance']].copy()

In [101]:
patient_features = patient_features.drop_duplicates(subset=['hadm_id', 'subject_id'])

In [102]:
patient_features = pd.merge(patient_features, diagnoses_grouped[['hadm_id', 'subject_id', 'condition_label']], 
                          on=['hadm_id', 'subject_id'], how='left')

In [103]:
patient_features = pd.get_dummies(patient_features, columns=['gender', 'admission_type', 'insurance', 'condition_label'], 
                                prefix=['gender', 'adm_type', 'ins', 'cond'])

In [107]:
procedure_features = df_procedures_icd[['icd_code']].drop_duplicates()

In [106]:
df_procedures_icd

Unnamed: 0,_id,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,684f68df68f898b125636a1d,15183993,25345025,1,2155-03-23,8622,9
1,684f68e368f898b125636a27,19368299,21869570,1,2155-03-14,3979,9
2,684f68e368f898b125636a28,19368299,21869570,2,2155-03-12,8844,9
3,684f68e368f898b125636a29,19368299,21869570,3,2155-03-12,9929,9
4,684f68e368f898b125636a2a,19368299,21869570,4,2155-03-14,8844,9
...,...,...,...,...,...,...,...
505,684f6adc68f898b125637813,11422670,20827125,4,2155-03-12,06BQ4ZZ,10
506,684f6adc68f898b125637814,11422670,20827125,5,2155-03-12,5A1221Z,10
507,684f6adc68f898b125637815,11422670,20827125,6,2155-03-09,B211YZZ,10
508,684f6ae368f898b12563784e,12481952,27017213,1,2155-03-21,5A1935Z,10


In [108]:
procedure_features['procedure_category'] = procedure_features['icd_code'].apply(
    lambda x: 'Surgical' if x in ['8622', '3979'] else 'Diagnostic' if x in ['8844', '9929'] else 'Other'
)

In [109]:
procedure_features = pd.get_dummies(procedure_features, columns=['procedure_category'], prefix='proc_cat')

In [110]:
matrix_collaborative.to_csv('patient_procedure_matrix_collaborative.csv', index=False)

In [111]:
matrix_collaborative

icd_code,hadm_id,subject_id,14,17,34,40,41,45,46,59,...,8E0W4CZ,B2011ZZ,B2101ZZ,B2111ZZ,B211YZZ,B2131ZZ,B214YZZ,B2181ZZ,B218YZZ,B410YZZ
0,20067171,13584937,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20120850,15668092,0.0,0.0,0.0,0.0,0.89408,0.89408,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20276010,14969719,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20296910,14629406,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20329019,11069955,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,29649567,11992390,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,29686645,12701907,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,29866935,15497616,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,29953111,17079680,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Collaborative Filtering

In [113]:
from sklearn.metrics.pairwise import cosine_similarity

In [114]:
matrix_values = matrix_collaborative.drop(columns=['hadm_id', 'subject_id']).values
cosine_sim = cosine_similarity(matrix_values)

In [119]:
# Convert to DataFrame for easier handling
cosine_sim_df = pd.DataFrame(cosine_sim, index=matrix_collaborative['hadm_id'], columns=matrix_collaborative['hadm_id'])

In [120]:
cosine_sim_df

hadm_id,20067171,20120850,20276010,20296910,20329019,20424437,20434050,20435341,20587396,20636315,...,29209508,29371059,29413133,29511412,29588131,29649567,29686645,29866935,29953111,29997422
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20067171,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
20120850,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.182574,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
20276010,0.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
20296910,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
20329019,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.158114,0.0,0.353553,...,0.0,0.000000,0.204124,0.0,0.0,0.0,0.0,0.0,0.0,0.176777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29649567,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.000000
29686645,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
29866935,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.000000
29953111,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.000000


In [121]:
procedures_merged

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score,_id,seq_num,chartdate,icd_code,icd_version
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637368,1.0,2155-03-17,3734,9.0
2,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637369,2.0,2155-03-17,3727,9.0
3,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637183,1.0,2155-03-13,66,9.0
4,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637184,2.0,2155-03-13,3607,9.0
5,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637185,3.0,2155-03-13,45,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373cd,1.0,2155-03-06,7769,9.0
626,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373ce,2.0,2155-03-08,7769,9.0
627,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373cf,3.0,2155-03-11,9659,9.0
628,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373d0,4.0,2155-03-11,8659,9.0


In [122]:
k = 3  # Number of nearest neighbors
satisfaction_threshold = 0.5  # Minimum satisfaction score for recommendation
recommendations = []
for hadm_id in matrix_collaborative['hadm_id']:
    # Get top k similar patients (excluding self)
    sim_scores = cosine_sim_df[hadm_id].sort_values(ascending=False)[1:k+1]
    neighbor_hadm_ids = sim_scores.index
    
    # Get procedures and satisfaction scores from neighbors
    neighbor_data = procedures_merged[procedures_merged['hadm_id'].isin(neighbor_hadm_ids)]
    neighbor_procs = neighbor_data[neighbor_data['satisfaction_score'] > satisfaction_threshold][['icd_code','icd_version', 'satisfaction_score']]
    
    # Aggregate procedures by average satisfaction score
    proc_scores = neighbor_procs.groupby('icd_code')['satisfaction_score'].mean().reset_index()
    proc_scores = proc_scores.sort_values(by='satisfaction_score', ascending=False)
    
    # Filter out procedures already performed by the patient
    patient_procs = procedures_merged[procedures_merged['hadm_id'] == hadm_id]['icd_code'].unique()
    recommended_procs = proc_scores[~proc_scores['icd_code'].isin(patient_procs)]
    
    # Add top recommendations (e.g., top 3 procedures)
    top_procs = recommended_procs.head(3)[['icd_code', 'satisfaction_score']].to_dict('records')
    recommendations.append({
        'hadm_id': hadm_id,
        'subject_id': matrix_collaborative[matrix_collaborative['hadm_id'] == hadm_id]['subject_id'].iloc[0],
        'recommended_procedures': [proc['icd_code'] for proc in top_procs],
        'satisfaction_scores': [proc['satisfaction_score'] for proc in top_procs]
    })

In [123]:
recommendations

[{'hadm_id': 20067171,
  'subject_id': 13584937,
  'recommended_procedures': [3721, 3521, 3956],
  'satisfaction_scores': [0.8262359523305459,
   0.7852162049694945,
   0.7852162049694945]},
 {'hadm_id': 20120850,
  'subject_id': 15668092,
  'recommended_procedures': [40, 8853, 9920],
  'satisfaction_scores': [0.8778231572201762,
   0.8778231572201762,
   0.8778231572201762]},
 {'hadm_id': 20276010,
  'subject_id': 14969719,
  'recommended_procedures': [3893, 4516, 4523],
  'satisfaction_scores': [0.8744700669626941,
   0.8744700669626941,
   0.8744700669626941]},
 {'hadm_id': 20296910,
  'subject_id': 14629406,
  'recommended_procedures': ['0DJ08ZZ', 3727, 3734],
  'satisfaction_scores': [0.85046875, 0.8262359523305459, 0.8262359523305459]},
 {'hadm_id': 20329019,
  'subject_id': 11069955,
  'recommended_procedures': [4823, 5011, 3897],
  'satisfaction_scores': [0.8864695576961541,
   0.8864695576961541,
   0.8487720323769811]},
 {'hadm_id': 20424437,
  'subject_id': 16944689,
  'reco

# Rebuilding patient_data

In [148]:
patient_data = pd.merge(merged_data, diagnoses_grouped, on=['hadm_id', 'subject_id'], how='left')

In [149]:
df_nies['gender'] = df_nies['Gender'].map({1: 'M', 2: 'F'})
admission_type_map = {'EU OBSERVATION': 0.0, 'URGENT': 0.0, 'EW EMER.': 1.0}
patient_data['AdmTypeBinary'] = patient_data['admission_type'].map(admission_type_map).fillna(0.0)
# patient_data = patient_data.assign(condition_label=patient_data['condition_label'].apply(lambda x: x if isinstance(x, list) else []))
# patient_data = patient_data.explode('condition_label')
# nies_avg_scores = df_nies.groupby(['AdmTypeBinary', 'gender', 'condition_label'])['satisfaction_score'].mean().reset_index()
# patient_data = pd.merge(patient_data, nies_avg_scores, 
#                       left_on=['AdmTypeBinary', 'gender', 'condition_label'],
#                       right_on=['AdmTypeBinary', 'gender', 'condition_label'], how='left')
# patient_data = patient_data.groupby(['hadm_id', 'subject_id', 'admission_type', 'insurance', 'gender', 'anchor_age', 'AdmTypeBinary'])['satisfaction_score'].mean().reset_index()
# patient_data['satisfaction_score'] = patient_data['satisfaction_score'].fillna(0)

In [151]:
patient_data = patient_data.assign(condition_label=patient_data['condition_label'].apply(lambda x: x if isinstance(x, list) else []))
patient_data = patient_data.explode('condition_label')

In [153]:
patient_data.head()

Unnamed: 0,_id_x,subject_id,hadm_id,admittime,dischtime,admission_type,admission_location,insurance,language,marital_status,...,discharge_location,deathtime,_id_y,gender,anchor_age,anchor_year,anchor_year_group,dod,condition_label,AdmTypeBinary
0,67c4cb3a44a7421d696860bf,10046543,21402025,2155-03-15 21:22:00,2155-03-20 17:00:00,EU OBSERVATION,WALK-IN/SELF REFERRAL,Medicare,ENGLISH,SINGLE,...,,,6853ae6587fd170a1dfebea0,F,91,2155,2017 - 2019,NaT,,0.0
1,67c4cb3a44a7421d6968776b,10188106,28288574,2155-03-26 23:55:00,2155-05-28 19:30:00,URGENT,INTERNAL TRANSFER TO OR FROM PSYCH,Other,ENGLISH,SINGLE,...,ACUTE HOSPITAL,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,,0.0
2,67c4cb3a44a7421d6968776c,10188106,29174671,2155-03-22 17:26:00,2155-03-26 23:55:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,PSYCH FACILITY,,6853ae6587fd170a1dfebea1,M,39,2151,2011 - 2013,NaT,,1.0
3,67c4cb3a44a7421d69687aa1,10207914,21364683,2155-03-22 05:06:00,2155-03-28 18:00:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,HOME,,6853ae6587fd170a1dfebea2,M,22,2155,2008 - 2010,NaT,,1.0
4,67c4cb3a44a7421d696880ba,10247468,21915776,2155-03-13 04:10:00,2155-03-17 15:25:00,EW EMER.,EMERGENCY ROOM,Other,ENGLISH,SINGLE,...,HOME,,6853ae6587fd170a1dfebea3,M,23,2155,2008 - 2010,NaT,,1.0


In [133]:

# Step 7: Create patient features with simplified condition encoding
patient_features = patient_data[['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance']].drop_duplicates()
patient_features = pd.merge(patient_features, diagnoses_grouped[['hadm_id', 'subject_id', 'condition_label']], 
                          on=['hadm_id', 'subject_id'], how='left')

# Create binary columns for each condition
all_conditions = set(category_labels.values())
for condition in all_conditions:
    patient_features[f'cond_{condition}'] = patient_features['condition_label'].apply(
        lambda x: 1 if isinstance(x, list) and condition in x else 0
    )

# Drop the original condition_label column
patient_features = patient_features.drop(columns=['condition_label'])

# One-hot encode categorical features
patient_features = pd.get_dummies(patient_features, columns=['gender', 'admission_type', 'insurance'], 
                                prefix=['gender', 'adm_type', 'ins'])

In [132]:
patient_data

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score
0,20037816,12547294,EW EMER.,Medicare,F,22,1.0,0.853095
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236
2,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080
3,20165734,18879099,EU OBSERVATION,Medicaid,M,48,0.0,0.817131
4,20187769,11750274,EU OBSERVATION,Medicare,F,76,0.0,0.786521
...,...,...,...,...,...,...,...,...
278,29866935,15497616,EW EMER.,Medicare,M,71,1.0,0.868635
279,29880300,18845673,EW EMER.,Medicaid,M,42,1.0,0.858438
280,29880455,14677148,EU OBSERVATION,Other,F,59,0.0,0.768171
281,29953111,17079680,EW EMER.,Other,F,46,1.0,0.850469


In [134]:
procedures_merged['icd_code_proc'] = procedures_merged['icd_code'].fillna('No_Procedure')

In [135]:
patient_features = patient_data[['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance']].drop_duplicates()
patient_features = pd.merge(patient_features, diagnoses_grouped[['hadm_id', 'subject_id', 'condition_label']], 
                          on=['hadm_id', 'subject_id'], how='left')

# Create binary columns for each condition
all_conditions = set(category_labels.values())
for condition in all_conditions:
    patient_features[f'cond_{condition}'] = patient_features['condition_label'].apply(
        lambda x: 1 if isinstance(x, list) and condition in x else 0
    )

# Drop the original condition_label column
patient_features = patient_features.drop(columns=['condition_label'])

# One-hot encode categorical features
patient_features = pd.get_dummies(patient_features, columns=['gender', 'admission_type', 'insurance'], 
                                prefix=['gender', 'adm_type', 'ins'])

In [136]:
patient_features

Unnamed: 0,hadm_id,subject_id,anchor_age,cond_Digestive system condition,cond_Neurological condition,cond_Lung condition,cond_Heart condition,cond_Infection (other than COVID 19),cond_Other,cond_COVID 19,...,adm_type_DIRECT OBSERVATION,adm_type_ELECTIVE,adm_type_EU OBSERVATION,adm_type_EW EMER.,adm_type_OBSERVATION ADMIT,adm_type_SURGICAL SAME DAY ADMISSION,adm_type_URGENT,ins_Medicaid,ins_Medicare,ins_Other
0,20037816,12547294,22,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
1,20067171,13584937,70,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
2,20120850,15668092,71,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True
3,20165734,18879099,48,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,True,False,False
4,20187769,11750274,76,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,29866935,15497616,71,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
279,29880300,18845673,42,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
280,29880455,14677148,59,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True
281,29953111,17079680,46,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True


In [138]:
patient_data

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score
0,20037816,12547294,EW EMER.,Medicare,F,22,1.0,0.853095
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236
2,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080
3,20165734,18879099,EU OBSERVATION,Medicaid,M,48,0.0,0.817131
4,20187769,11750274,EU OBSERVATION,Medicare,F,76,0.0,0.786521
...,...,...,...,...,...,...,...,...
278,29866935,15497616,EW EMER.,Medicare,M,71,1.0,0.868635
279,29880300,18845673,EW EMER.,Medicaid,M,42,1.0,0.858438
280,29880455,14677148,EU OBSERVATION,Other,F,59,0.0,0.768171
281,29953111,17079680,EW EMER.,Other,F,46,1.0,0.850469


In [137]:
# Step 10: Filter recommendations by patient conditions
filtered_recommendations = []
for rec in recommendations:
    hadm_id = rec['hadm_id']
    patient_conditions = patient_features[patient_features['hadm_id'] == hadm_id][[col for col in patient_features.columns if col.startswith('cond_')]].iloc[0]
    patient_conditions = [col.replace('cond_', '') for col, val in patient_conditions.items() if val == 1]
    valid_procs = []
    valid_scores = []
    for proc, score in zip(rec['recommended_procedures'], rec['satisfaction_scores']):
        proc_patients = procedures_merged[procedures_merged['icd_code_proc'] == proc]['hadm_id']
        proc_conditions = patient_data[patient_data['hadm_id'].isin(proc_patients)]['condition_label'].str.split(', ').explode().unique()
        if any(cond in patient_conditions for cond in proc_conditions):
            valid_procs.append(proc)
            valid_scores.append(score)
    filtered_recommendations.append({
        'hadm_id': hadm_id,
        'subject_id': rec['subject_id'],
        'recommended_procedures': valid_procs[:3],
        'satisfaction_scores': valid_scores[:3]
    })

KeyError: 'condition_label'

# Re Writing

In [208]:
# Step 6: Create patient-procedure matrix
matrix_collaborative = pd.pivot_table(
    procedures_merged,
    values='satisfaction_score',
    index=['hadm_id', 'subject_id'],
    columns='icd_code',
    aggfunc='mean',
    fill_value=0
)

In [209]:
# Step 7: Create patient features with simplified condition encoding
patient_features = patient_data[['hadm_id', 'subject_id', 'gender', 'anchor_age', 'admission_type', 'insurance']].drop_duplicates()
patient_features = pd.merge(patient_features, diagnoses_grouped[['hadm_id', 'subject_id', 'condition_label']], 
                          on=['hadm_id', 'subject_id'], how='left')


In [210]:
# Create binary columns for each condition
all_conditions = set(category_labels.values())
for condition in all_conditions:
    patient_features[f'cond_{condition}'] = patient_features['condition_label'].apply(
        lambda x: 1 if isinstance(x, list) and condition in x else 0
    )

In [259]:
all_conditions

{'Adverse reaction or poising',
 'COVID 19',
 'Diabetes and related problems',
 'Digestive system condition',
 'Dont know or wasnt told',
 'Heart condition',
 'Infection (other than COVID 19)',
 'Injury and or accident',
 'Lung condition',
 'Mental health issue',
 'Neurological condition',
 'Orthopaedic condition',
 'Other',
 'Tests and or investigations',
 'Tumour or cancer'}

In [211]:
# Drop the original condition_label column
patient_features = patient_features.drop(columns=['condition_label'])


In [212]:
# One-hot encode categorical features
patient_features = pd.get_dummies(patient_features, columns=['gender', 'admission_type', 'insurance'], 
                                prefix=['gender', 'adm_type', 'ins'])

In [232]:
matrix_collaborative.head()

Unnamed: 0_level_0,icd_code,14,17,34,40,41,45,46,59,66,159,...,8E0W4CZ,B2011ZZ,B2101ZZ,B2111ZZ,B211YZZ,B2131ZZ,B214YZZ,B2181ZZ,B218YZZ,B410YZZ
hadm_id,subject_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20067171,13584937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20120850,15668092,0.0,0.0,0.0,0.0,0.89408,0.89408,0.0,0.0,0.89408,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20276010,14969719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20296910,14629406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20329019,11069955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [216]:
# Step 8: Compute cosine similarity
# matrix_values = matrix_collaborative.drop(columns=['hadm_id', 'subject_id']).values
# cosine_sim = cosine_similarity(matrix_values)
# cosine_sim_df = pd.DataFrame(cosine_sim, index=matrix_collaborative['hadm_id'], columns=matrix_collaborative['hadm_id'])


matrix_values = matrix_collaborative.values
cosine_sim = cosine_similarity(matrix_values)

# Use multi-index (hadm_id + subject_id) as index for the similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim, 
                              index=matrix_collaborative.index, 
                              columns=matrix_collaborative.index)

In [None]:
# matrix = matrix_collaborative.reset_index()

In [226]:
cosine_sim_df = pd.DataFrame(cosine_sim, 
                              index=matrix_collaborative.index, 
                              columns=matrix_collaborative.index)

In [233]:
matrix

icd_code,hadm_id,subject_id,14,17,34,40,41,45,46,59,...,8E0W4CZ,B2011ZZ,B2101ZZ,B2111ZZ,B211YZZ,B2131ZZ,B214YZZ,B2181ZZ,B218YZZ,B410YZZ
0,20067171,13584937,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20120850,15668092,0.0,0.0,0.0,0.0,0.89408,0.89408,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20276010,14969719,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20296910,14629406,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20329019,11069955,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,29649567,11992390,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,29686645,12701907,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,29866935,15497616,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,29953111,17079680,0.0,0.0,0.0,0.0,0.00000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [234]:
matrix['hadm_id']

0      20067171
1      20120850
2      20276010
3      20296910
4      20329019
         ...   
158    29649567
159    29686645
160    29866935
161    29953111
162    29997422
Name: hadm_id, Length: 163, dtype: int64

In [235]:

# Step 9: Find k-nearest neighbors and recommend procedures
k = 3
satisfaction_threshold = 0.5
recommendations = []

for hadm_id in matrix['hadm_id']:
    sim_scores = cosine_sim_df[hadm_id].sort_values(ascending=False)[1:k+1]
    neighbor_hadm_ids = sim_scores.index
    neighbor_data = procedures_merged[procedures_merged['hadm_id'].isin(neighbor_hadm_ids)]
    neighbor_procs = neighbor_data[neighbor_data['satisfaction_score'] > satisfaction_threshold][['icd_code_proc', 'satisfaction_score']]
    proc_scores = neighbor_procs.groupby('icd_code_proc')['satisfaction_score'].mean().reset_index()
    proc_scores = proc_scores.sort_values(by='satisfaction_score', ascending=False)
    patient_procs = procedures_merged[procedures_merged['hadm_id'] == hadm_id]['icd_code_proc'].unique()
    recommended_procs = proc_scores[~proc_scores['icd_code_proc'].isin(patient_procs)]
    top_procs = recommended_procs.head(3)[['icd_code_proc', 'satisfaction_score']].to_dict('records')
    recommendations.append({
        'hadm_id': hadm_id,
        'subject_id': matrix[matrix['hadm_id'] == hadm_id]['subject_id'].iloc[0],
        'recommended_procedures': [proc['icd_code_proc'] for proc in top_procs],
        'satisfaction_scores': [proc['satisfaction_score'] for proc in top_procs]
    })

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [244]:
print('matrix:')
print(matrix.head())
print('cosine_sim_df:')
print(cosine_sim_df.head())

matrix:
icd_code   hadm_id  subject_id   14   17   34   40       41       45   46  \
0         20067171    13584937  0.0  0.0  0.0  0.0  0.00000  0.00000  0.0   
1         20120850    15668092  0.0  0.0  0.0  0.0  0.89408  0.89408  0.0   
2         20276010    14969719  0.0  0.0  0.0  0.0  0.00000  0.00000  0.0   
3         20296910    14629406  0.0  0.0  0.0  0.0  0.00000  0.00000  0.0   
4         20329019    11069955  0.0  0.0  0.0  0.0  0.00000  0.00000  0.0   

icd_code   59  ...  8E0W4CZ  B2011ZZ  B2101ZZ  B2111ZZ  B211YZZ  B2131ZZ  \
0         0.0  ...      0.0      0.0      0.0      0.0      0.0      0.0   
1         0.0  ...      0.0      0.0      0.0      0.0      0.0      0.0   
2         0.0  ...      0.0      0.0      0.0      0.0      0.0      0.0   
3         0.0  ...      0.0      0.0      0.0      0.0      0.0      0.0   
4         0.0  ...      0.0      0.0      0.0      0.0      0.0      0.0   

icd_code  B214YZZ  B2181ZZ  B218YZZ  B410YZZ  
0             0.0      0.

In [247]:
cosine_sim_df = pd.DataFrame(
    cosine_similarity(matrix_values),
    index=matrix['hadm_id'],
    columns=matrix['hadm_id']
)

In [250]:
print(type(cosine_sim_df[20067171]))

<class 'pandas.core.series.Series'>


In [252]:
k = 3
satisfaction_threshold = 0.5
recommendations = []

for hadm_id in matrix['hadm_id']:
    sim_scores = cosine_sim_df[hadm_id].sort_values(ascending=False)[1:k+1]  # skip self
    neighbor_hadm_ids = sim_scores.index
    neighbor_data = procedures_merged[procedures_merged['hadm_id'].isin(neighbor_hadm_ids)]
    
    neighbor_procs = neighbor_data[
        neighbor_data['satisfaction_score'] > satisfaction_threshold
    ][['icd_code', 'satisfaction_score']]
    
    proc_scores = neighbor_procs.groupby('icd_code')['satisfaction_score'].mean().reset_index()
    proc_scores = proc_scores.sort_values(by='satisfaction_score', ascending=False)
    
    patient_procs = procedures_merged[
        procedures_merged['hadm_id'] == hadm_id
    ]['icd_code'].unique()
    
    recommended_procs = proc_scores[~proc_scores['icd_code'].isin(patient_procs)]
    
    top_procs = recommended_procs.head(3)[['icd_code', 'satisfaction_score']].to_dict('records')
    
    recommendations.append({
        'hadm_id': hadm_id,
        'subject_id': matrix.loc[matrix['hadm_id'] == hadm_id, 'subject_id'].values[0],
        'recommended_procedures': [proc['icd_code'] for proc in top_procs],
        'satisfaction_scores': [proc['satisfaction_score'] for proc in top_procs]
    })

recommendations_df = pd.DataFrame(recommendations)


In [253]:
recommendations_df

Unnamed: 0,hadm_id,subject_id,recommended_procedures,satisfaction_scores
0,20067171,13584937,"[3721, 3521, 3956]","[0.8262359523305459, 0.7852162049694945, 0.785..."
1,20120850,15668092,"[40, 8853, 9920]","[0.8778231572201762, 0.8778231572201762, 0.877..."
2,20276010,14969719,"[3893, 4516, 4523]","[0.8744700669626941, 0.8744700669626941, 0.874..."
3,20296910,14629406,"[0DJ08ZZ, 3727, 3734]","[0.85046875, 0.8262359523305459, 0.82623595233..."
4,20329019,11069955,"[4823, 5011, 3897]","[0.8864695576961541, 0.8864695576961541, 0.848..."
...,...,...,...,...
158,29649567,11992390,"[5A1D70Z, 5A2204Z, 0BH17EZ]","[0.8612804878048781, 0.8612804878048781, 0.802..."
159,29686645,12701907,"[3893, 4516, 4523]","[0.8744700669626941, 0.8744700669626941, 0.874..."
160,29866935,15497616,"[3727, 3734, 34]","[0.8262359523305459, 0.8262359523305459, 0.792..."
161,29953111,17079680,"[3893, 4516, 4523]","[0.8744700669626941, 0.8744700669626941, 0.874..."


In [303]:
patient_features

Unnamed: 0,hadm_id,subject_id,anchor_age,cond_Digestive system condition,cond_Neurological condition,cond_Lung condition,cond_Heart condition,cond_Infection (other than COVID 19),cond_Other,cond_COVID 19,...,adm_type_DIRECT OBSERVATION,adm_type_ELECTIVE,adm_type_EU OBSERVATION,adm_type_EW EMER.,adm_type_OBSERVATION ADMIT,adm_type_SURGICAL SAME DAY ADMISSION,adm_type_URGENT,ins_Medicaid,ins_Medicare,ins_Other
0,20037816,12547294,22,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
1,20067171,13584937,70,0,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
2,20120850,15668092,71,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True
3,20165734,18879099,48,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,True,False,False
4,20187769,11750274,76,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,29866935,15497616,71,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
279,29880300,18845673,42,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
280,29880455,14677148,59,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True
281,29953111,17079680,46,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True


In [304]:
# Step 10: Filter recommendations by patient conditions
filtered_recommendations = []
for rec in recommendations:
    hadm_id = rec['hadm_id']
    patient_conditions = patient_features[patient_features['hadm_id'] == hadm_id][[col for col in patient_features.columns if col.startswith('cond_')]].iloc[0]
    patient_conditions = [col.replace('cond_', '') for col, val in patient_conditions.items() if val == 1]
    valid_procs = []
    valid_scores = []
    for proc, score in zip(rec['recommended_procedures'], rec['satisfaction_scores']):
        proc_patients = procedures_merged[procedures_merged['icd_code'] == proc]['hadm_id']
        proc_conditions = patient_data[patient_data['hadm_id'].isin(proc_patients)]['condition_label'].str.split(', ').explode().unique()
        if any(cond in patient_conditions for cond in proc_conditions):
            valid_procs.append(proc)
            valid_scores.append(score)
    filtered_recommendations.append({
        'hadm_id': hadm_id,
        'subject_id': rec['subject_id'],
        'recommended_procedures': valid_procs[:3],
        'satisfaction_scores': valid_scores[:3]
    })


In [306]:
recommendations

[{'hadm_id': 20067171,
  'subject_id': 13584937,
  'recommended_procedures': [3721, 3521, 3956],
  'satisfaction_scores': [0.8262359523305459,
   0.7852162049694945,
   0.7852162049694945]},
 {'hadm_id': 20120850,
  'subject_id': 15668092,
  'recommended_procedures': [40, 8853, 9920],
  'satisfaction_scores': [0.8778231572201762,
   0.8778231572201762,
   0.8778231572201762]},
 {'hadm_id': 20276010,
  'subject_id': 14969719,
  'recommended_procedures': [3893, 4516, 4523],
  'satisfaction_scores': [0.8744700669626941,
   0.8744700669626941,
   0.8744700669626941]},
 {'hadm_id': 20296910,
  'subject_id': 14629406,
  'recommended_procedures': ['0DJ08ZZ', 3727, 3734],
  'satisfaction_scores': [0.85046875, 0.8262359523305459, 0.8262359523305459]},
 {'hadm_id': 20329019,
  'subject_id': 11069955,
  'recommended_procedures': [4823, 5011, 3897],
  'satisfaction_scores': [0.8864695576961541,
   0.8864695576961541,
   0.8487720323769811]},
 {'hadm_id': 20424437,
  'subject_id': 16944689,
  'reco

In [258]:
procedures_merged

Unnamed: 0,hadm_id,subject_id,admission_type,insurance,gender,anchor_age,AdmTypeBinary,satisfaction_score,_id,seq_num,chartdate,icd_code,icd_version
1,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637368,1.0,2155-03-17,3734,9.0
2,20067171,13584937,SURGICAL SAME DAY ADMISSION,Medicare,M,70,0.0,0.826236,684f6a3168f898b125637369,2.0,2155-03-17,3727,9.0
3,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637183,1.0,2155-03-13,66,9.0
4,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637184,2.0,2155-03-13,3607,9.0
5,20120850,15668092,EW EMER.,Other,M,71,1.0,0.894080,684f69e968f898b125637185,3.0,2155-03-13,45,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373cd,1.0,2155-03-06,7769,9.0
626,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373ce,2.0,2155-03-08,7769,9.0
627,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373cf,3.0,2155-03-11,9659,9.0
628,29997422,15060292,DIRECT EMER.,Other,M,32,0.0,0.815646,684f6a4068f898b1256373d0,4.0,2155-03-11,8659,9.0
