In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run produces the same dataset
np.random.seed(42)

# Build the synthetic columns. The draws are kept in this exact order so the
# seeded random stream hands each column the same values on every run.
n = 500  # sample count

age = np.random.randint(18, 65, size=n)  # integers 18..64 (upper bound is exclusive)
gender = np.random.choice(['Male', 'Female'], size=n)
sleep_duration = np.clip(np.random.normal(7, 1.5, size=n), 4, 10)  # bounded to [4, 10]
exercise_frequency = np.random.randint(0, 7, size=n)  # integers 0..6
caffeine_intake = np.clip(np.random.normal(200, 50, size=n), 50, 400)  # bounded to [50, 400]
stress_level = np.random.randint(1, 11, size=n)  # integers 1..10
work_hours = np.clip(np.random.normal(8, 2, size=n), 4, 12)  # bounded to [4, 12]
smoker = np.random.choice(['Yes', 'No'], size=n)

# Assemble the table, one column per generated array
data = pd.DataFrame(dict(
    age=age,
    gender=gender,
    sleep_duration=sleep_duration,
    exercise_frequency=exercise_frequency,
    caffeine_intake=caffeine_intake,
    stress_level=stress_level,
    work_hours=work_hours,
    smoker=smoker,
))

# Blank out a random share of two numeric columns:
# 10% of sleep_duration, then 5% of caffeine_intake
for column, fraction in (('sleep_duration', 0.1), ('caffeine_intake', 0.05)):
    dropped_rows = data.sample(frac=fraction).index
    data.loc[dropped_rows, column] = np.nan

# Preview the top rows of the dataset
data.head()

Unnamed: 0,age,gender,sleep_duration,exercise_frequency,caffeine_intake,stress_level,work_hours,smoker
0,56,Male,5.950412,3,248.545037,9,9.169006,No
1,46,Male,7.32097,4,169.127669,4,8.899312,Yes
2,32,Male,,3,104.394842,6,6.465891,No
3,60,Male,6.668546,1,219.841933,2,8.423634,No
4,25,Male,7.92125,2,275.725878,6,6.359517,Yes


# Seatwork 2 Questions

1. Univariate Analysis: Plot the histogram of at least two numerical columns.
2. Bivariate Analysis: Visualize a scatter plot of a pair of numerical variables and interpret their correlation.  
3. Multivariate Analysis: Plot two pairplots: one that uses the column `gender` as hue and another that uses the column `smoker`. (Hint: `sns.pairplot(data.dropna(), hue='?????')`, that is, if you aim to use seaborn. You can use other packages.)
4. Categorical Data Analysis: Visualize two categorical variables (histogram)
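5. Missing Items: The `data` df already contains a few missing items in `sleep_duration` and `caffeine_intake` (introduced when the dataset was generated). The code below intentionally deletes additional entries (for the sake of demonstration) and stores the result in a new df, `data_with_missing`. Use the `fillna` and `mean` methods to impute the missing items in the columns `sleep_duration` and `caffeine_intake` of `data_with_missing`. 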
6. Create a short summary of what you have found in the EDA process relative to the characteristics of the sleep health lifestyle dataset.

In [10]:
import numpy as np

# Work on a copy so the `data` frame itself is left untouched.
data_with_missing = data.copy()
np.random.seed(42)  # for reproducibility (note: this reseeds NumPy's global generator)
missing_proportion = 0.1 # this means 10 %

# Number of rows to blank in each column. round() is used instead of int():
# a float product can land just below a whole number
# (0.29 * 100 == 28.999999999999996) and int() would truncate it to one row too few.
n_missing = round(len(data_with_missing) * missing_proportion)

# introduce missing values in 'sleep_duration'
sleep_duration_missing_indices = np.random.choice(data_with_missing.index,
                                                  size=n_missing,
                                                  replace=False)  # no row is picked twice
data_with_missing.loc[sleep_duration_missing_indices, 'sleep_duration'] = np.nan

# introduce missing values in 'caffeine_intake' (an independent draw of row labels)
caffeine_intake_missing_indices = np.random.choice(data_with_missing.index,
                                                   size=n_missing,
                                                   replace=False)
data_with_missing.loc[caffeine_intake_missing_indices, 'caffeine_intake'] = np.nan

# get info
# NOTE: entries that are already NaN in `data` stay NaN in the copy, so the
# non-null counts reported here can be lower than len * (1 - missing_proportion).
data_with_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 500 non-null    int64  
 1   gender              500 non-null    object 
 2   sleep_duration      450 non-null    float64
 3   exercise_frequency  500 non-null    int64  
 4   caffeine_intake     450 non-null    float64
 5   stress_level        500 non-null    int64  
 6   work_hours          500 non-null    float64
 7   smoker              500 non-null    object 
dtypes: float64(3), int64(3), object(2)
memory usage: 31.4+ KB
