In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Timestamp columns that are rewritten as 'YYYY-MM-DD HH:MM:SS' strings
# (the Hive-compatible text format chosen for this export).
time_cols = ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime']

# Categorical/string columns
cat_cols = [
    'admission_type', 'admission_location', 'discharge_location',
    'insurance', 'language', 'religion', 'marital_status',
    'ethnicity', 'diagnosis'
]

# Boolean/flag columns
bool_cols = ['hospital_expire_flag', 'has_chartevents_data']

# Categorical columns whose missing values are replaced by 'Unknown'.
fill_unknown_cols = ['language', 'religion', 'marital_status']

# Timestamp columns whose missing values are replaced by a far-future sentinel.
fill_sentinel_cols = ['edregtime', 'edouttime']


def count_unverified_missing_deathtimes(df: pd.DataFrame) -> int:
    """Count rows whose missing deathtime is not explained by the expire flag.

    A row is counted when ``deathtime`` is missing while
    ``hospital_expire_flag`` is anything other than False. A missing flag is
    counted as well, because it cannot confirm the missing deathtime.
    """
    expired = df['hospital_expire_flag'].astype('boolean').fillna(True).astype(bool)
    return int((df['deathtime'].isna() & expired).sum())


def prepare_admissions(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the ADMISSIONS frame; the input is not modified.

    Steps, in order:
      * every column in ``time_cols`` is parsed and re-rendered as a
        '%Y-%m-%d %H:%M:%S' string (missing values stay missing);
      * ``cat_cols`` become pandas categoricals, ``bool_cols`` nullable booleans;
      * missing ``fill_unknown_cols`` values become the category 'Unknown';
      * missing ``fill_sentinel_cols`` values become '9999-12-31 00:00:00'.

    ``deathtime`` is intentionally left with its missing values.
    """
    df = df.copy()

    for col in time_cols:
        df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d %H:%M:%S')

    df[cat_cols] = df[cat_cols].astype('category')
    df[bool_cols] = df[bool_cols].astype('boolean')

    for col in fill_unknown_cols:
        # add_categories raises ValueError when the category already exists,
        # so only add 'Unknown' if the data does not already contain it.
        if 'Unknown' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories('Unknown')
        df[col] = df[col].fillna('Unknown')

    for col in fill_sentinel_cols:
        df[col] = df[col].fillna('9999-12-31 00:00:00')

    return df


# __name__ is "__main__" both when run as a script and inside a notebook
# cell, so the conversion still executes there; the guard only keeps the file
# I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # Read CSV from full path
    df = pd.read_csv(r'F:\ITI\Big Data\Project\mimic-iii-clinical-database-demo-1.4\ADMISSIONS.csv')
    df = prepare_admissions(df)

    # Only claim the deathtime NaNs are verified after actually checking them.
    unverified = count_unverified_missing_deathtimes(df)
    if unverified == 0:
        print("deathtime NaNs verified: All align with hospital_expire_flag == False")
    else:
        print(f"WARNING: {unverified} row(s) have no deathtime although "
              "hospital_expire_flag is not False")

    # Print remaining nulls
    print("\nNaN counts after handling:")
    print(df.isna().sum())

    # Convert to Arrow table and save as Parquet in same path
    table = pa.Table.from_pandas(df)
    pq.write_table(table, r'F:\ITI\Big Data\Project\mimic-iii-clinical-database-demo-1.4\ADMISSIONS.parquet')

    # Print schema
    print(table.schema)

deathtime NaNs verified: All align with hospital_expire_flag == False

NaN counts after handling:
row_id                   0
subject_id               0
hadm_id                  0
admittime                0
dischtime                0
deathtime               89
admission_type           0
admission_location       0
discharge_location       0
insurance                0
language                 0
religion                 0
marital_status           0
ethnicity                0
edregtime                0
edouttime                0
diagnosis                0
hospital_expire_flag     0
has_chartevents_data     0
dtype: int64
row_id: int64
subject_id: int64
hadm_id: int64
admittime: string
dischtime: string
deathtime: string
admission_type: dictionary<values=string, indices=int8, ordered=0>
admission_location: dictionary<values=string, indices=int8, ordered=0>
discharge_location: dictionary<values=string, indices=int8, ordered=0>
insurance: dictionary<values=string, indices=int8, ordered=0>
lang