In [9]:
from pymongo import MongoClient
import pandas as pd
from datetime import datetime
from itertools import chain

client = MongoClient("mongodb://localhost:27017/")
db = client["recommender_system"] 
collection = db["admissions_discharges"]

In [10]:
start_date = datetime(2024, 3, 1)
end_date = datetime(2024, 3, 31, 23, 59, 59)
query = {
    "date": {
        "$gte": start_date,
        "$lte": end_date
    }
}
cursor = collection.find(query)
df = pd.DataFrame(list(cursor))

In [11]:
df.shape

(31, 19)

In [12]:
diagnoses_icds = []
emergency_ids = []
other_ids = []
for i in range(len(df)): 
    emergency_ids.append(df.iloc[i]['hadm_ids_emergency'])
    other_ids.append(df.iloc[i]['hadm_ids_other']) 

In [13]:
flattened = list(chain.from_iterable(emergency_ids))
unique_emergency_ids = list(set(flattened))
flattened = list(chain.from_iterable(other_ids))
unique_other_ids = list(set(flattened))

In [14]:
hadm_ids = []
hadm_ids.append(unique_emergency_ids)
hadm_ids.append(unique_other_ids) 
flattened = list(chain.from_iterable(hadm_ids)) 
unique_hadm_ids = list(set(flattened))

In [15]:
unique_hadm_ids

[25345025,
 20444674,
 21869570,
 27535364,
 29769736,
 25863179,
 25272331,
 23909902,
 26317840,
 21137424,
 29172242,
 21223956,
 22499348,
 24395288,
 23321627,
 22107164,
 29614622,
 27380769,
 29527588,
 22161445,
 24044584,
 28335657,
 25607722,
 27433515,
 20842540,
 28022830,
 23346223,
 28267056,
 27693106,
 28853300,
 27729462,
 27012662,
 23623224,
 28247094,
 26873913,
 20329019,
 26688057,
 21154875,
 28288574,
 27309636,
 20582983,
 27682375,
 28831305,
 27383372,
 24513106,
 26311763,
 21774933,
 21664854,
 29953111,
 21534300,
 25701469,
 24825440,
 28466275,
 25229925,
 20165734,
 27217002,
 26025579,
 21646444,
 29023853,
 28037230,
 24057964,
 21422707,
 22873205,
 24463990,
 22052982,
 21750390,
 20187769,
 23110267,
 27757692,
 21915776,
 28717697,
 20434050,
 27853440,
 23052417,
 20894341,
 21214338,
 29880455,
 20976257,
 24557197,
 20700814,
 23448206,
 22279314,
 23424661,
 20926615,
 26984601,
 20636315,
 27349660,
 27484316,
 27391643,
 25287839,
 29649567,

In [13]:
emergency_ids =  df.iloc[0]['hadm_ids_emergency']

In [14]:
for j in range(len(emergency_ids)):
    print(emergency_ids[j])

28335657
27034794
28466275
23020382
20943009
29866935


In [16]:

db_MIMIC = client["MIMIC"] 
collection_diagnoses_icd = db_MIMIC["diagnoses_icd"]
collection_procedures_icd = db_MIMIC["procedures_icd"]
collection_admissions = db_MIMIC["admissions"]
collection_patients = db_MIMIC["patients"]

In [86]:
# diagnoses_icds = []
# procedures_icds = []
for hadm_id in unique_hadm_ids: 
    print(hadm_id)
    cursor = collection_diagnoses_icd.find({"hadm_id": hadm_id})
    documents = list(cursor)
    print('diagnoses_icd',len(documents))
    for doc in documents:
        doc.pop("_id", None)  
    collection = db["diagnoses_icd"] 
    if documents: 
        result = collection.insert_many(documents)
        print(f"{len(result.inserted_ids)} records inserted.")

    
    cursor = collection_procedures_icd.find({"hadm_id": hadm_id})
    documents = list(cursor)
    print('procedures_icd',len(documents))
    for doc in documents:
        doc.pop("_id", None)  
    collection = db["procedures_icd"] 
    if documents: 
        result = collection.insert_many(documents)
        print(f"{len(result.inserted_ids)} records inserted.")
# # Convert to DataFrame
# df_diagnoses = pd.DataFrame(diagnoses_icds)
# df_procedures = pd.DataFrame(procedures_icds)  

25345025
diagnoses_icd 5
5 records inserted.
procedures_icd 1
1 records inserted.
20444674
diagnoses_icd 3
3 records inserted.
procedures_icd 0
21869570
diagnoses_icd 6
6 records inserted.
procedures_icd 6
6 records inserted.
27535364
diagnoses_icd 6
6 records inserted.
procedures_icd 0
29769736
diagnoses_icd 9
9 records inserted.
procedures_icd 0
25863179
diagnoses_icd 4
4 records inserted.
procedures_icd 2
2 records inserted.
25272331
diagnoses_icd 6
6 records inserted.
procedures_icd 1
1 records inserted.
23909902
diagnoses_icd 5
5 records inserted.
procedures_icd 0
26317840
diagnoses_icd 5
5 records inserted.
procedures_icd 0
21137424
diagnoses_icd 8
8 records inserted.
procedures_icd 5
5 records inserted.
29172242
diagnoses_icd 39
39 records inserted.
procedures_icd 19
19 records inserted.
21223956
diagnoses_icd 3
3 records inserted.
procedures_icd 0
22499348
diagnoses_icd 4
4 records inserted.
procedures_icd 2
2 records inserted.
24395288
diagnoses_icd 7
7 records inserted.
proce

In [17]:
24579787 in unique_hadm_ids

True

In [82]:
cursor = collection_diagnoses_icd.find({"hadm_id": 24579787})
documents = list(cursor)

In [18]:
admissions_cursor = collection_admissions.find({"hadm_id": {"$in": unique_hadm_ids}})
df_admissions = pd.DataFrame(list(admissions_cursor))

In [19]:
subject_ids = df_admissions['subject_id'].unique().tolist()

In [102]:
records = df_admissions.to_dict('records')  
# collection = db["admissions"]
if records: 
    collection.insert_many(records)

In [20]:
subject_ids

[10046543,
 10188106,
 10207914,
 10247468,
 10261732,
 10270108,
 10312052,
 10342727,
 10346869,
 10396938,
 10449873,
 10467410,
 10493420,
 10529115,
 10536786,
 10540284,
 10556566,
 10570079,
 10572666,
 10592833,
 10602678,
 10624561,
 10648095,
 10649183,
 10669823,
 10674504,
 10676442,
 10706500,
 10717970,
 10745565,
 10746247,
 10751258,
 10755791,
 10787196,
 10800472,
 10844079,
 10937510,
 10958253,
 11069955,
 11113889,
 11119871,
 11122975,
 11126859,
 11127819,
 11151861,
 11168491,
 11171757,
 11230090,
 11266795,
 11300135,
 11334817,
 11415869,
 11422670,
 11451795,
 11493762,
 11520986,
 11554694,
 11581862,
 11621470,
 11708854,
 11724731,
 11750274,
 11770961,
 11786902,
 11801099,
 11818101,
 11854989,
 11871004,
 11902324,
 11945540,
 11966397,
 11992390,
 12122921,
 12124962,
 12175862,
 12209542,
 12235568,
 12271405,
 12338292,
 12344190,
 12414946,
 12428722,
 12452729,
 12474949,
 12481952,
 12547294,
 12584804,
 12606543,
 12623657,
 12631532,
 12648612,

In [21]:
patients_cursor = collection_patients.find({"subject_id": {"$in": subject_ids}})
df_patients = pd.DataFrame(list(patients_cursor))

In [22]:
df_patients

Unnamed: 0,_id,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,67c5655a24c857f62da20469,10046543,F,91,2155,2017 - 2019,NaT
1,67c5655a24c857f62da214d7,10188106,M,39,2151,2011 - 2013,NaT
2,67c5655a24c857f62da2173b,10207914,M,22,2155,2008 - 2010,NaT
3,67c5655a24c857f62da21bb4,10247468,M,23,2155,2008 - 2010,NaT
4,67c5655a24c857f62da21d70,10261732,F,79,2155,2011 - 2013,2155-03-24
...,...,...,...,...,...,...,...
257,67c5656c24c857f62da65f01,19562159,F,81,2155,2017 - 2019,NaT
258,67c5656c24c857f62da667fa,19638442,F,47,2151,2008 - 2010,NaT
259,67c5656c24c857f62da66bdf,19672997,F,66,2155,2011 - 2013,2155-05-02
260,67c5656c24c857f62da685ba,19895627,F,53,2154,2011 - 2013,NaT


In [46]:
df_patients_cleaned = df_patients.where(pd.notnull(df_patients), None)
# Step 2: Convert timezone-aware datetime columns to naive datetime
for col in df_patients_cleaned.select_dtypes(include=["datetimetz"]).columns:
    df_patients_cleaned[col] = df_patients_cleaned[col].dt.tz_localize(None)

# Step 3: Convert to list of records
records = df_patients_cleaned.to_dict(orient='records')

# Step 4: Insert into MongoDB
collection = db["patients"]
if records:
    result = collection.insert_many(records)
    print(f"Inserted {len(result.inserted_ids)} records.")
else:
    print("No records to insert.")

ValueError: NaTType does not support utcoffset

In [25]:
df_patients['dod'] = df_patients['dod'].apply(lambda x: x.to_pydatetime() if pd.notnull(x) else None)

In [34]:
df_patients['dod'] = df_patients['dod'].where(~df_patients['dod'].isna(), None)
# df_patients['dod'] = df_patients['dod'].replace(pd.NaT, None)

In [39]:
df_patients['dod'].where(df_patients['dod'].notna(), None)

0            NaT
1            NaT
2            NaT
3            NaT
4     2155-03-24
         ...    
257          NaT
258          NaT
259   2155-05-02
260          NaT
261   2155-11-05
Name: dod, Length: 262, dtype: datetime64[ns]

In [42]:
df_patients.loc[:, 'dod'] = df_patients['dod'].where(df_patients['dod'].notna(), None)

In [44]:
df_patients.drop(columns=['dod'])

Unnamed: 0,_id,subject_id,gender,anchor_age,anchor_year,anchor_year_group
0,67c5655a24c857f62da20469,10046543,F,91,2155,2017 - 2019
1,67c5655a24c857f62da214d7,10188106,M,39,2151,2011 - 2013
2,67c5655a24c857f62da2173b,10207914,M,22,2155,2008 - 2010
3,67c5655a24c857f62da21bb4,10247468,M,23,2155,2008 - 2010
4,67c5655a24c857f62da21d70,10261732,F,79,2155,2011 - 2013
...,...,...,...,...,...,...
257,67c5656c24c857f62da65f01,19562159,F,81,2155,2017 - 2019
258,67c5656c24c857f62da667fa,19638442,F,47,2151,2008 - 2010
259,67c5656c24c857f62da66bdf,19672997,F,66,2155,2011 - 2013
260,67c5656c24c857f62da685ba,19895627,F,53,2154,2011 - 2013


In [47]:
records

[{'_id': ObjectId('67c5655a24c857f62da20469'),
  'subject_id': 10046543,
  'gender': 'F',
  'anchor_age': 91,
  'anchor_year': 2155,
  'anchor_year_group': '2017 - 2019',
  'dod': NaT},
 {'_id': ObjectId('67c5655a24c857f62da214d7'),
  'subject_id': 10188106,
  'gender': 'M',
  'anchor_age': 39,
  'anchor_year': 2151,
  'anchor_year_group': '2011 - 2013',
  'dod': NaT},
 {'_id': ObjectId('67c5655a24c857f62da2173b'),
  'subject_id': 10207914,
  'gender': 'M',
  'anchor_age': 22,
  'anchor_year': 2155,
  'anchor_year_group': '2008 - 2010',
  'dod': NaT},
 {'_id': ObjectId('67c5655a24c857f62da21bb4'),
  'subject_id': 10247468,
  'gender': 'M',
  'anchor_age': 23,
  'anchor_year': 2155,
  'anchor_year_group': '2008 - 2010',
  'dod': NaT},
 {'_id': ObjectId('67c5655a24c857f62da21d70'),
  'subject_id': 10261732,
  'gender': 'F',
  'anchor_age': 79,
  'anchor_year': 2155,
  'anchor_year_group': '2011 - 2013',
  'dod': Timestamp('2155-03-24 00:00:00')},
 {'_id': ObjectId('67c5655a24c857f62da21e

In [49]:
def clean_value(val):
    if isinstance(val, pd.Timestamp):
        return val.to_pydatetime() if pd.notnull(val) else None
    elif isinstance(val, pd.NaT.__class__):
        return None
    return val
df_clean = df_patients.applymap(clean_value)

  df_clean = df_patients.applymap(clean_value)


In [50]:
records = df_patients.to_dict('records')  
collection = db["patients"]
if records: 
    collection.insert_many(records)

ValueError: NaTType does not support utcoffset

In [23]:
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'recommender_system')

In [78]:
# Query all hadm_ids at once using $in
diagnoses_cursor = collection_diagnoses_icd.find({"hadm_id": {"$in": unique_hadm_ids}})
procedures_cursor = collection_procedures_icd.find({"hadm_id": {"$in": unique_hadm_ids}})

# Convert to lists and DataFrames
df_diagnoses = pd.DataFrame(list(diagnoses_cursor))
df_procedures = pd.DataFrame(list(procedures_cursor))


In [75]:
procedures_cursor = collection_procedures_icd.find({"hadm_id": {"$in": unique_hadm_ids}})
len(list(procedures_cursor))

510

In [79]:
records = df_diagnoses.to_dict('records')  
collection = db["procedures_icd"]
if records: 
    collection.insert_many(records)

In [80]:
records = df_procedures.to_dict('records')  
collection = db["diagnoses_icd"]
if records: 
    collection.insert_many(records)

In [50]:
df_procedures

Unnamed: 0,_id,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,67c5a8ce60999f4df0732382,10188106,28288574,1,2155-05-10,2309,9
1,67c5a8ce60999f4df0732383,10188106,28288574,2,2155-05-24,4836,9
2,67c5a8ce60999f4df0732384,10188106,28288574,3,2155-05-24,4516,9
3,67c5a8ce60999f4df0732863,10207914,21364683,1,2155-03-27,8103,9
4,67c5a8ce60999f4df0732864,10207914,21364683,2,2155-03-27,353,9
...,...,...,...,...,...,...,...
505,67c5a8f460999f4df07d17bb,19936109,24673093,2,2155-03-16,3772,9
506,67c5a8f460999f4df07d17bc,19936109,24673093,3,2155-03-16,9671,9
507,67c5a8f460999f4df07d17bd,19936109,24673093,4,2155-03-16,9604,9
508,67c5a8f460999f4df07d17be,19936109,24673093,5,2155-03-16,9962,9


In [18]:
df_diagnoses[df_diagnoses['hadm_id']==28335657]

Unnamed: 0,_id,subject_id,hadm_id,seq_num,icd_code,icd_version
0,67c4cbcb44a7421d69772f32,10556566,28335657,1,5609,9
1,67c4cbcb44a7421d69772f33,10556566,28335657,2,7885,9
2,67c4cbcb44a7421d69772f34,10556566,28335657,3,7840,9
3,67c4cbcb44a7421d69772f35,10556566,28335657,4,7821,9
4,67c4cbcb44a7421d69772f36,10556566,28335657,5,27651,9
5,67c4cbcb44a7421d69772f37,10556566,28335657,6,4019,9
6,67c4cbcb44a7421d69772f38,10556566,28335657,7,53081,9
7,67c4cbcb44a7421d69772f39,10556566,28335657,8,311,9
8,67c4cbcb44a7421d69772f3a,10556566,28335657,9,49390,9
9,67c4cbcb44a7421d69772f3b,10556566,28335657,10,V4572,9
