In [32]:
import pandas as pd
from google.colab import drive
import os
from pathlib import Path

# Mount Google Drive at /content/drive (Colab-only: `drive` comes from google.colab).
# The patient CSV folder read later in this notebook lives under this mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
import pandas as pd

# Numeric columns handled by clean_patient_csv, mapped to the number of decimals
# to keep. None means "round to a whole number and store as nullable Int64".
_NUMERIC_COLUMN_DECIMALS = {
    'glucose': None,
    'heart_rate': None,
    'calories': 1,
    'steps': None,
    'basal_rate': 3,
    'bolus_volume_delivered': 1,
    'carb_input': None,
}


def clean_patient_csv(filepath, sep=";"):
    """
    Load and clean a single patient CSV.

    Steps:
    - Parse the 'time' column, drop rows whose timestamp cannot be parsed,
      drop duplicate timestamps (first occurrence wins) and sort by time.
    - Coerce the known numeric columns to numbers (unparseable entries become
      missing values) and round them: glucose, heart_rate, steps and carb_input
      become nullable Int64; calories and bolus_volume_delivered keep 1 decimal;
      basal_rate keeps 3 decimals.
    - Replace negative readings in those columns by their absolute value
      (rows are kept, only the sign is corrected).
    - Flag, but do not drop, outliers: glucose outside 40-400 and heart rate
      outside 30-220. Flags are NA where the reading itself is missing.

    Columns that are absent from the file are skipped, and the matching flag
    column is then not created.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.
    sep : str, default ";"
        Field delimiter of the CSV.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, sorted by 'time' with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If the file has no 'time' column.
    """
    df = pd.read_csv(filepath, sep=sep)

    # 1. Parse datetime, discard unusable timestamps, de-duplicate and sort.
    #    Without the dropna, every unparseable timestamp becomes NaT and
    #    drop_duplicates would keep one arbitrary NaT row in the result.
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    df = (
        df.dropna(subset=['time'])
          .drop_duplicates(subset=['time'])
          .sort_values('time')
          .reset_index(drop=True)
    )

    # 2 + 3. Fix numeric dtypes, then correct impossible negative values.
    for col, decimals in _NUMERIC_COLUMN_DECIMALS.items():
        if col not in df.columns:
            continue
        # A single stray text entry makes read_csv load the column as object,
        # on which .round() fails; coerce first (bad entries -> NaN).
        values = pd.to_numeric(df[col], errors='coerce')
        if decimals is None:
            values = values.round().astype('Int64')
        else:
            values = values.round(decimals)
        # abs() only changes negative entries; missing values stay missing.
        df[col] = values.abs()

    # 4. Flag outliers (kept in the data for later analysis).
    if 'glucose' in df.columns:
        df['flag_glucose_outlier'] = (df['glucose'] < 40) | (df['glucose'] > 400)
    if 'heart_rate' in df.columns:
        df['flag_hr_outlier'] = (df['heart_rate'] < 30) | (df['heart_rate'] > 220)

    return df


In [41]:
from tabulate import tabulate
# Folder containing the patient CSVs (expects Google Drive to be mounted at /content/drive).
folder = Path("/content/drive/MyDrive/PythonHackathon_Numpyninja/HUPA-UC Diabetes Dataset")

# Clean and summarise every file whose name starts with "HUP".
# sorted() makes the report order stable; glob order is otherwise arbitrary.
for f in sorted(folder.glob("HUP*.csv")):
    print(f"\n=== Processing {f.name} ===")

    # Clean the file
    df_clean = clean_patient_csv(f)

    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only adds a stray "None" line to the output.
    df_clean.info()

    # The flag columns exist only when the matching source column was present
    # in the file, so guard the lookup instead of risking a KeyError.
    for flag_col in ('flag_glucose_outlier', 'flag_hr_outlier'):
        if flag_col in df_clean.columns:
            print(df_clean[flag_col].value_counts())

    print(tabulate(df_clean.head(), headers='keys', tablefmt='psql'))


=== Processing HUPA0027P.csv ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165306 entries, 0 to 165305
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   time                    165306 non-null  datetime64[ns]
 1   glucose                 165306 non-null  Int64         
 2   calories                165306 non-null  float64       
 3   heart_rate              165306 non-null  Int64         
 4   steps                   165306 non-null  Int64         
 5   basal_rate              165306 non-null  float64       
 6   bolus_volume_delivered  165306 non-null  float64       
 7   carb_input              165306 non-null  Int64         
 8   flag_glucose_outlier    165306 non-null  boolean       
 9   flag_hr_outlier         165306 non-null  boolean       
dtypes: Int64(4), boolean(2), datetime64[ns](1), float64(3)
memory usage: 11.4 MB
None
flag_glucose_outlier
False    165306