In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np


In [2]:
# Read the raw encounter table from the CSV in the working directory into `data`
data = pd.read_csv(filepath_or_buffer='diabetic_data.csv')

In [3]:
# 1. Impute or Remove Missing Values

# Columns removed outright because of their high missing percentage
cols_to_drop = ['weight', 'payer_code', 'medical_specialty']

data = (
    data
    # '?' is used as the missing-value placeholder; turn it into a real NaN
    .replace('?', np.nan)
    .drop(columns=cols_to_drop)
    # A row is kept only when all three diagnosis codes are present
    .dropna(subset=['diag_1', 'diag_2', 'diag_3'])
    .assign(
        # Missing race becomes its own 'Unknown' category
        race=lambda frame: frame['race'].fillna('Unknown'),
        # For the two test-result columns NaN is read as "test not performed",
        # recorded with the literal label 'None'
        max_glu_serum=lambda frame: frame['max_glu_serum'].fillna('None'),
        A1Cresult=lambda frame: frame['A1Cresult'].fillna('None'),
    )
)


In [4]:
# 2. Standardize Codes (NG code)

# Gender column standardization.
# Lookup keys are compared after strip + lower-casing. The lower-cased forms of
# the target codes ('m', 'f') are listed too, so running this cell a second
# time leaves the column unchanged instead of turning every 'M'/'F' into
# 'Unknown'.
gender_map = {'male': 'M', 'female': 'F', 'm': 'M', 'f': 'F'}
data['gender'] = (
    data['gender']
    .str.strip()
    .str.lower()
    .map(gender_map)
    # Anything that is not in the map (including missing values) -> 'Unknown'
    .fillna('Unknown')
)

# Readmitted column standardization.
# Only '<30' maps to 'True'; 'no' and '>30' both map to 'False'. The results
# are the string labels 'True'/'False', not booleans.
# 'true'/'false' are accepted as well so that a re-run of this cell keeps the
# labels it already produced instead of replacing them with NaN.
readmitted_map = {
    'no': 'False', '>30': 'False', '<30': 'True',
    'false': 'False', 'true': 'True',
}
data['readmitted'] = (
    data['readmitted']
    .str.strip()
    .str.lower()
    # No fallback here: a value absent from the map ends up as NaN
    .map(readmitted_map)
)



In [5]:
# 3. Detect and Flag Outliers

# Numeric columns screened for outliers
numeric_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
]

# IQR method, evaluated for all screened columns at once: a value is an
# outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its column.
# NOTE(review): if a column's IQR is 0 both limits equal the quartile value, so
# every value different from it is flagged - confirm this is intended.
screened = data[numeric_cols]
first_quartile = screened.quantile(0.25)
third_quartile = screened.quantile(0.75)
margin = 1.5 * (third_quartile - first_quartile)
lower_limits = first_quartile - margin
upper_limits = third_quartile + margin

# The limits are Series indexed by column name, so lt/gt compare each column
# against its own limit; a row is flagged when any screened column is outside
outlier_mask = (screened.lt(lower_limits) | screened.gt(upper_limits)).any(axis=1)

# Store the per-row result in the new column "flagged_outlier"
data['flagged_outlier'] = outlier_mask


In [6]:
# Inspect the column dtypes to make sure the numeric columns hold only numbers
data.info()
# Findings:
## age -> stored as a range in a string (object dtype); to be converted to int using the middle value of the range
## other fields are correct (int)

<class 'pandas.core.frame.DataFrame'>
Index: 100244 entries, 1 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              100244 non-null  int64 
 1   patient_nbr               100244 non-null  int64 
 2   race                      100244 non-null  object
 3   gender                    100244 non-null  object
 4   age                       100244 non-null  object
 5   admission_type_id         100244 non-null  int64 
 6   discharge_disposition_id  100244 non-null  int64 
 7   admission_source_id       100244 non-null  int64 
 8   time_in_hospital          100244 non-null  int64 
 9   num_lab_procedures        100244 non-null  int64 
 10  num_procedures            100244 non-null  int64 
 11  num_medications           100244 non-null  int64 
 12  number_outpatient         100244 non-null  int64 
 13  number_emergency          100244 non-null  int64 
 14  number_in

In [7]:
# Replace each age range with its middle value

# Pull out the two numbers on either side of the hyphen as integer columns
# (assumes every age entry is a string containing "<digits>-<digits>" - TODO confirm)
age_bounds = data['age'].str.extract(r'(\d+)-(\d+)').astype(int)

# Row-wise mean of the two bounds, then cast to int64 (the cast drops any
# fractional part)
data['age'] = age_bounds.mean(axis=1).astype('int64')

In [8]:
# Verify the converted age column: expect int64 dtype with no missing values
data['age'].info()

<class 'pandas.core.series.Series'>
Index: 100244 entries, 1 to 101765
Series name: age
Non-Null Count   Dtype
--------------   -----
100244 non-null  int64
dtypes: int64(1)
memory usage: 1.5 MB


In [9]:
# Final Output

# Report the number of rows and how many of them carry the outlier flag
print(
    f"Total Rows: {len(data)}",
    f"Total Outliers Flagged: {data['flagged_outlier'].sum()}",
    sep="\n",
)

# Write the cleaned table to disk, leaving the index out of the file
data.to_csv('diabetic_data_final.csv', index=False)

Total Rows: 100244
Total Outliers Flagged: 34500
