# Data Analysis of Health Metrics

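This notebook contains exploratory data analysis (EDA) and visualizations of health metrics (heart rate, blood oxygen, and activity level) of the kind collected from wearable devices. The dataset used here is simulated in the first code cell and saved to `../data/health_metrics.csv`. The goal is to understand the data distribution, identify trends, and detect any anomalies in the health metrics.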

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# Simulate health data: 1000 samples spaced one minute apart, starting 2023-10-01.
# The fixed seed makes the generated values reproducible from run to run.
np.random.seed(42)
data = {
    # 'min' is the minute frequency alias; the older 'T' spelling is deprecated
    # since pandas 2.2 and emits a FutureWarning there.
    'timestamp': pd.date_range(start='2023-10-01', periods=1000, freq='min'),
    # randint's upper bound is exclusive: heart_rate is 60..99, blood_oxygen is 90..99.
    'heart_rate': np.random.randint(60, 100, 1000),
    'blood_oxygen': np.random.randint(90, 100, 1000),
    'activity_level': np.random.choice(['low', 'moderate', 'high'], 1000)
}

df_sim = pd.DataFrame(data)

# Create the output directory first so a fresh checkout without ../data
# does not make to_csv fail.
Path('../data').mkdir(parents=True, exist_ok=True)
df_sim.to_csv('../data/health_metrics.csv', index=False)
print("Simulated data saved to ../data/health_metrics.csv")

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's white-grid theme to every figure created after this point.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the health metrics dataset
df = pd.read_csv('../data/health_metrics.csv')  # Update the path as necessary

import sys

# Make the project's source directory importable. The membership check keeps
# sys.path from gaining a duplicate entry every time this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')  # Adjust path if needed

from preprocessing import preprocess_health_data

# Only preprocess numeric columns; every other column stays as loaded from the CSV.
numeric_cols = ['heart_rate', 'blood_oxygen']
df[numeric_cols] = preprocess_health_data(df[numeric_cols])
df.head()

In [None]:
# Compute the dataset's summary statistics, then display them as the cell output
summary_stats = df.describe()
summary_stats

In [None]:
# Count null entries per column, then show only the columns that have at least one
missing_values = df.isna().sum()
missing_values.loc[missing_values > 0]

In [None]:
# Histogram of the heart_rate column (20 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['heart_rate'], bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Heart Rate',
    xlabel='Heart Rate (bpm)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Scatter of heart rate against blood oxygen, one colour per activity level
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='heart_rate',
    y='blood_oxygen',
    hue='activity_level',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Heart Rate vs Blood Oxygen Levels')
ax.set_xlabel('Heart Rate (bpm)')
ax.set_ylabel('Blood Oxygen Level (%)')
ax.legend(title='Activity Level')
plt.show()

In [None]:
# Time series analysis of heart rate
# Index the frame by timestamp. After the first run 'timestamp' is the index
# rather than a column, so the guard lets this cell be re-run without a KeyError.
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')

# Plotting the Series uses its index (the timestamps) for the x-axis.
plt.figure(figsize=(14, 7))
plt.plot(df['heart_rate'], label='Heart Rate', color='blue')
plt.title('Heart Rate Over Time')
plt.xlabel('Time')
plt.ylabel('Heart Rate (bpm)')
plt.legend()
plt.show()

## Conclusion

This notebook provides an initial exploration of the health metrics data. Further analysis can be conducted to derive insights and inform health recommendations.