In [2]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Input CSV and output Parquet locations.
LABEVENTS_CSV = r'F:\ITI\Big Data\Project\mimic-iii-clinical-database-demo-1.4\LABEVENTS.csv'
LABEVENTS_PARQUET = r'F:\ITI\Big Data\Project\mimic-iii-clinical-database-demo-1.4\LABEVENTS.parquet'

# Hive-compatible text layout used for charttime.
HIVE_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

# Sentinels substituted for missing or unparseable values.
MISSING_TIMESTAMP = '9999-12-31 00:00:00'
MISSING_NUMBER = -1
MISSING_LABEL = 'Unknown'


def clean_labevents(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a LABEVENTS frame with its gaps replaced by sentinels.

    Columns handled:

    * ``charttime`` -- parsed as a datetime and rendered as
      ``YYYY-MM-DD HH:MM:SS`` text; missing or unparseable entries become
      ``'9999-12-31 00:00:00'``.
    * ``valuenum`` -- coerced to numeric; missing or unparseable entries
      become ``-1``.
    * ``itemid``, ``hadm_id`` -- coerced to numeric, gaps filled with ``-1``,
      then stored with the ``Int64`` dtype.
    * ``value``, ``valueuom``, ``flag`` -- converted to categoricals with
      missing entries labelled ``'Unknown'``.

    Args:
        df: Frame containing at least the columns listed above. It is not
            modified.

    Returns:
        A new frame holding the cleaned columns; other columns are copied
        through unchanged.
    """
    cleaned = df.copy()

    # errors='coerce' turns unparseable timestamps into NaT, which strftime
    # renders as NaN so the sentinel fill below catches them as well.
    parsed = pd.to_datetime(cleaned['charttime'], errors='coerce')
    cleaned['charttime'] = (
        parsed.dt.strftime(HIVE_TIMESTAMP_FORMAT).fillna(MISSING_TIMESTAMP)
    )

    cleaned['valuenum'] = (
        pd.to_numeric(cleaned['valuenum'], errors='coerce').fillna(MISSING_NUMBER)
    )

    for col in ('itemid', 'hadm_id'):
        numeric = pd.to_numeric(cleaned[col], errors='coerce')
        cleaned[col] = numeric.fillna(MISSING_NUMBER).astype('Int64')

    for col in ('value', 'valueuom', 'flag'):
        labels = cleaned[col].astype('category')
        # add_categories raises ValueError if the label is already a category,
        # which happens when the data itself contains the text 'Unknown'.
        if MISSING_LABEL not in labels.cat.categories:
            labels = labels.cat.add_categories(MISSING_LABEL)
        cleaned[col] = labels.fillna(MISSING_LABEL)

    return cleaned


# Runs when the cell/script is executed directly; importing the module only
# defines the helper and constants above.
if __name__ == '__main__':
    # Load the CSV and fill its gaps
    df = clean_labevents(pd.read_csv(LABEVENTS_CSV))

    # Print remaining NaNs
    print("Remaining NaNs:")
    print(df.isna().sum())

    # Convert to Arrow Table and write as Parquet
    table = pa.Table.from_pandas(df)
    pq.write_table(table, LABEVENTS_PARQUET)

    # Show schema
    print(table.schema)

Remaining NaNs:
row_id        0
subject_id    0
hadm_id       0
itemid        0
charttime     0
value         0
valuenum      0
valueuom      0
flag          0
dtype: int64
row_id: int64
subject_id: int64
hadm_id: int64
itemid: int64
charttime: string
value: dictionary<values=string, indices=int16, ordered=0>
valuenum: double
valueuom: dictionary<values=string, indices=int8, ordered=0>
flag: dictionary<values=string, indices=int8, ordered=0>
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1394
