In [11]:
# MODULE 1: DATA LOADING & BASIC PREPROCESSING


# 1) Import necessary packages
import pandas as pd
import numpy as np

# 2) Load the dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="health_activity_data.csv")

In [12]:
# 3) Basic overview: confirmation banner, frame dimensions, and column labels
dataset_shape = df.shape
column_names = df.columns.tolist()

print(" Dataset loaded successfully!")
print("Shape of dataset:", dataset_shape)
print("\nColumn Names:\n", column_names)

# 4) Preview both ends of the frame (default of five rows each)
previews = (
    ("\n First 5 rows (head):", df.head()),
    ("\n Last 5 rows (tail):", df.tail()),
)
for heading, preview in previews:
    print(heading)
    print(preview)

 Dataset loaded successfully!
Shape of dataset: (1000, 16)

Column Names:
 ['ID', 'Age', 'Gender', 'Height_cm', 'Weight_kg', 'BMI', 'Daily_Steps', 'Calories_Intake', 'Hours_of_Sleep', 'Heart_Rate', 'Blood_Pressure', 'Exercise_Hours_per_Week', 'Smoker', 'Alcohol_Consumption_per_Week', 'Diabetic', 'Heart_Disease']

 First 5 rows (head):
   ID  Age  Gender  Height_cm  Weight_kg    BMI  Daily_Steps  Calories_Intake  \
0   1   56    Male        164         81  30.72         5134             1796   
1   2   69    Male        156         82  20.86        12803             1650   
2   3   46  Female        158         65  30.93        16408             1756   
3   4   32    Male        197         87  31.19        18420             2359   
4   5   60    Male        157         63  29.37        17351             2556   

   Hours_of_Sleep  Heart_Rate Blood_Pressure  Exercise_Hours_per_Week Smoker  \
0             8.6         102         137/72                      8.1     No   
1             4.

In [3]:
# 5) Data types and summary information
print("\n Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

print("\n Statistical Summary (Numerical Columns):")
print(df.describe())

# 6) Checking for missing values
# Count of null entries in each column.
print("\n Missing Values per Column:")
print(df.isnull().sum())


 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            1000 non-null   int64  
 1   Age                           1000 non-null   int64  
 2   Gender                        1000 non-null   object 
 3   Height_cm                     1000 non-null   int64  
 4   Weight_kg                     1000 non-null   int64  
 5   BMI                           1000 non-null   float64
 6   Daily_Steps                   1000 non-null   int64  
 7   Calories_Intake               1000 non-null   int64  
 8   Hours_of_Sleep                1000 non-null   float64
 9   Heart_Rate                    1000 non-null   int64  
 10  Blood_Pressure                1000 non-null   object 
 11  Exercise_Hours_per_Week       1000 non-null   float64
 12  Smoker                        1000 non-n

In [13]:
# 7) Handle missing values
# Drop rows if any essential columns have missing values, then renumber the
# index so it is contiguous again.
critical_cols = [
    'Age', 'Height_cm', 'Weight_kg', 'BMI',
    'Daily_Steps', 'Calories_Intake', 'Hours_of_Sleep'
]
df = df.dropna(subset=critical_cols).reset_index(drop=True)
print("\nShape after dropping missing critical rows:", df.shape)

# 8) Basic cleaning of categorical columns
# Each column is normalised (string, lowercase, surrounding whitespace removed)
# BEFORE the mode is computed, so that variants such as "Male" / "male " are
# counted as the same category when choosing the fill value.
cat_cols = ['Gender', 'Smoker', 'Diabetic', 'Heart_Disease']
for col in cat_cols:
    # Skip columns that are absent instead of raising KeyError.
    if col not in df.columns:
        continue

    was_missing = df[col].isna()
    # astype(str) would turn NaN into the literal text "nan"; mask() restores
    # those positions to missing so they can be imputed properly below.
    normalized = df[col].astype(str).str.lower().str.strip().mask(was_missing)

    # mode() ignores missing values; it is empty when the whole column is
    # missing, in which case there is nothing sensible to fill with.
    modes = normalized.mode()
    if not modes.empty:
        normalized = normalized.fillna(modes.iloc[0])

    df[col] = normalized


Shape after dropping missing critical rows: (1000, 16)


In [14]:
# 9) Quick data insights
# Each entry pairs the printed label with the column whose mean is reported;
# means are rounded to two decimal places.
insight_columns = (
    ("\nAverage hours of sleep:", 'Hours_of_Sleep'),
    ("Average daily steps:", 'Daily_Steps'),
    ("Average calorie intake:", 'Calories_Intake'),
)
for label, column in insight_columns:
    print(label, round(df[column].mean(), 2))



Average hours of sleep: 6.91
Average daily steps: 10717.03
Average calorie intake: 2327.12


In [15]:
# 10) Saving cleaned dataset
# index=False keeps the positional row index out of the written file.
df.to_csv("health_lifestyle_cleaned.csv", index=False)
print("\n Cleaned dataset saved as 'health_lifestyle_cleaned.csv'")


# Show both ends of the frame as it stands after saving.
final_previews = (
    ("\n Final Head:", df.head()),
    ("\n Final Tail:", df.tail()),
)
for heading, preview in final_previews:
    print(heading)
    print(preview)


 Cleaned dataset saved as 'health_lifestyle_cleaned.csv'

 Final Head:
   ID  Age  Gender  Height_cm  Weight_kg    BMI  Daily_Steps  Calories_Intake  \
0   1   56    male        164         81  30.72         5134             1796   
1   2   69    male        156         82  20.86        12803             1650   
2   3   46  female        158         65  30.93        16408             1756   
3   4   32    male        197         87  31.19        18420             2359   
4   5   60    male        157         63  29.37        17351             2556   

   Hours_of_Sleep  Heart_Rate Blood_Pressure  Exercise_Hours_per_Week Smoker  \
0             8.6         102         137/72                      8.1     no   
1             4.5         103         129/65                      3.7     no   
2             4.3          74         127/68                      3.2    yes   
3             4.1         116         125/86                      8.5     no   
4             5.1         111         100