In [2]:
import pandas as pd

df = pd.read_csv("F:\health_checkup.csv")
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient_ID     100 non-null    object 
 1   Name           100 non-null    object 
 2   Age            100 non-null    int64  
 3   Gender         100 non-null    object 
 4   City           100 non-null    object 
 5   BMI            100 non-null    float64
 6   BloodPressure  100 non-null    object 
 7   Sugar          100 non-null    int64  
 8   Cholesterol    100 non-null    int64  
 9   HeartRate      100 non-null    int64  
 10  CheckupDate    100 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 8.7+ KB
None
  Patient_ID    Name  Age Gender       City   BMI BloodPressure  Sugar  \
0       P001   Kavya   56      M      Delhi  24.7        145/70    148   
1       P002  Vikram   53      F       Pune  34.1        114/95    101   
2       P003   Varun   45      M  H

  df = pd.read_csv("F:\health_checkup.csv")


In [3]:
# Check missing values
print(df.isnull().sum())

# Option 1: Fill with median or mean
df['BMI'].fillna(df['BMI'].median(), inplace=True)
df['Sugar'].fillna(df['Sugar'].mean(), inplace=True)

# Option 2: Drop rows with too many missing values
df.dropna(subset=['BMI', 'Sugar', 'Cholesterol'], inplace=True)

Patient_ID       0
Name             0
Age              0
Gender           0
City             0
BMI              0
BloodPressure    0
Sugar            0
Cholesterol      0
HeartRate        0
CheckupDate      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BMI'].fillna(df['BMI'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sugar'].fillna(df['Sugar'].mean(), inplace=True)


In [4]:
# Split BP into Systolic and Diastolic
df[['BP_Systolic', 'BP_Diastolic']] = df['BloodPressure'].str.split('/', expand=True)
df['BP_Systolic'] = df['BP_Systolic'].astype(int)
df['BP_Diastolic'] = df['BP_Diastolic'].astype(int)

# Drop old column if not needed
df.drop('BloodPressure', axis=1, inplace=True)

In [5]:
# Filter out unrealistic values
df = df[(df['BMI'] >= 15) & (df['BMI'] <= 40)]
df = df[(df['Sugar'] >= 60) & (df['Sugar'] <= 250)]
df = df[(df['Cholesterol'] >= 100) & (df['Cholesterol'] <= 300)]

In [6]:
df['CheckupDate'] = pd.to_datetime(df['CheckupDate'])

In [7]:
df.to_csv("health_checkup_cleaned.csv", index=False)

In [8]:
def risk(row):
    if row['BMI'] > 30 or row['Sugar'] > 140 or row['Cholesterol'] > 220:
        return "High"
    else:
        return "Normal"

df['RiskCategory'] = df.apply(risk, axis=1)

In [9]:
df.head()

Unnamed: 0,Patient_ID,Name,Age,Gender,City,BMI,Sugar,Cholesterol,HeartRate,CheckupDate,BP_Systolic,BP_Diastolic,RiskCategory
0,P001,Kavya,56,M,Delhi,24.7,148,161,92,2024-10-29,145,70,High
1,P002,Vikram,53,F,Pune,34.1,101,204,78,2024-08-08,114,95,High
2,P003,Varun,45,M,Hyderabad,19.3,133,264,74,2024-08-12,110,77,High
3,P004,Deepa,60,F,Kolkata,19.6,103,197,68,2024-08-17,122,90,Normal
4,P005,Priya,44,M,Bengaluru,21.9,152,233,90,2024-10-28,121,81,High


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Patient_ID    100 non-null    object        
 1   Name          100 non-null    object        
 2   Age           100 non-null    int64         
 3   Gender        100 non-null    object        
 4   City          100 non-null    object        
 5   BMI           100 non-null    float64       
 6   Sugar         100 non-null    int64         
 7   Cholesterol   100 non-null    int64         
 8   HeartRate     100 non-null    int64         
 9   CheckupDate   100 non-null    datetime64[ns]
 10  BP_Systolic   100 non-null    int32         
 11  BP_Diastolic  100 non-null    int32         
 12  RiskCategory  100 non-null    object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(4), object(5)
memory usage: 9.5+ KB
