In [None]:
# Note: the .gitignore file is hidden, so it does not appear in the JupyterLab file browser. To see it, run "mv .gitignore.txt gitignore.txt" in the terminal.
# https://stackoverflow.com/questions/52222461/how-to-edit-gitignore-in-jupyter-lab

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the four CSV files from the data/ directory, one DataFrame per file.
cycles, journals, sleeps, workouts = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/physiological_cycles.csv",
        "data/journal_entries.csv",
        "data/sleeps.csv",
        "data/workouts.csv",
    )
)

In [None]:
# Preview the first five rows of the cycles data
cycles.head(n=5)

In [None]:
# Preview the first five rows of the journal entries
journals.head(n=5)

In [None]:
# Preview the first five rows of the sleeps data
sleeps.head(n=5)

In [None]:
# Preview the first five rows of the workouts data
workouts.head(n=5)

Let's start by analysing each file individually, and then see whether the files can be combined as well. We could then look at external data on other factors that influence things like recovery and sleep, e.g. whether it was sunny that day. Look up which factors might influence these things and see if data on them is available. It turns out that cycles actually contains all of the data we need, so we focus on that file.

The cycles file contains information on recovery and sleep and the factors that impact these.

In [None]:
# Count the missing values in each column of the cycles data
cycles.isnull().sum()

We see there are 6 rows that contain NaNs in a number of columns. Further investigation shows that this occurs when a cycle's start time and end time are not recorded correctly, e.g. a cycle is ended just before midnight and another cycle is restarted that runs until midnight. This could be a problem, so we want to investigate the cycle lengths to see if any other anomalies occur.

In [None]:
# Show every row that has a missing value in at least one column
rows_with_missing = cycles.isna().any(axis="columns")
cycles.loc[rows_with_missing]

In [None]:
# Only a very small proportion of the rows contain NaNs, and these are not
# values that could be inferred in other ways, so those rows are removed.
# .copy() makes cycles_cleaned an independent DataFrame, so the column
# assignments made to it in later cells do not trigger pandas'
# SettingWithCopyWarning.
cycles_cleaned = cycles.dropna().copy()
cycles_cleaned

In [None]:
# Parse the cycle start/end timestamps (day-first) and derive a date-only
# column from each of them.
for boundary in ('start', 'end'):
    time_col = f'Cycle {boundary} time'
    cycles_cleaned[time_col] = pd.to_datetime(cycles_cleaned[time_col], dayfirst=True)
    cycles_cleaned[f'{boundary} date'] = cycles_cleaned[time_col].dt.date

# Cycle length is the timedelta between the end and the start of the cycle
cycles_cleaned['Cycle length'] = cycles_cleaned['Cycle end time'] - cycles_cleaned['Cycle start time']

In [None]:
# Preview the cleaned cycles with the new date and length columns
cycles_cleaned.head(n=5)

In [None]:
# Summary statistics of the cycle lengths, to spot unusually short cycles (e.g. naps)
cycle_lengths = cycles_cleaned['Cycle length']
cycle_lengths.describe()

In [None]:
# Histogram of cycle length, with the timedelta converted to hours
cycle_hours = cycles_cleaned['Cycle length'].dt.total_seconds() / 3600
fig, ax = plt.subplots()
ax.hist(cycle_hours, bins=10, edgecolor='k')
ax.set_xlabel('Cycle Length (hours)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Cycle Length')
ax.grid(True)
plt.show()

From this data, and also from my understanding of how Whoop is intended to work, it seems that cycles are intended to be around 24 hours long. Therefore I will remove any cycles with a length of less than 20 or greater than 30 hours. Otherwise, cycles of unusual lengths could skew the data, e.g. a very short cycle could have a skewed recovery rate.

In [None]:
# Keep only the cycles whose length is between 20 and 30 hours (inclusive).
cycle_hours = cycles_cleaned['Cycle length'].dt.total_seconds() / 3600
# .copy() makes filtered_df an independent DataFrame rather than a slice of
# cycles_cleaned, so columns can be added to it in later cells without
# triggering pandas' SettingWithCopyWarning.
filtered_df = cycles_cleaned[cycle_hours.between(20, 30)].copy()
filtered_df

In [None]:
# Re-plot the cycle length histogram (in hours) to check that the filter was applied
filtered_cycle_hours = filtered_df['Cycle length'].dt.total_seconds() / 3600
fig, ax = plt.subplots()
ax.hist(filtered_cycle_hours, bins=10, edgecolor='k')
ax.set_xlabel('Cycle Length (hours)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Cycle Length')
ax.grid(True)
plt.show()

We also want to do the same thing for sleep length.

In [None]:
# Convert the sleep timestamps to datetime objects if they are not already.
# dayfirst=True matches how the cycle start/end timestamps of this data are
# parsed; without it an ambiguous date such as 03/04 could be read month-first
# and produce a wrong sleep length.
filtered_df['Sleep onset'] = pd.to_datetime(filtered_df['Sleep onset'], dayfirst=True)
filtered_df['Wake onset'] = pd.to_datetime(filtered_df['Wake onset'], dayfirst=True)

# Calculate the sleep length in hours (timedelta -> seconds -> hours)
filtered_df['Sleep Length'] = (filtered_df['Wake onset'] - filtered_df['Sleep onset']).dt.total_seconds() / 3600
filtered_df.head()

In [None]:
# Histogram of the sleep length in hours
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(filtered_df['Sleep Length'], bins=20, edgecolor='k', alpha=0.7)
ax.set_xlabel('Sleep Length (hours)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Sleep Length')
ax.grid(True)
plt.show()


In [None]:
# Anything under 4 hours is more likely a nap, so keep only sleeps of at least 4 hours
is_full_sleep = filtered_df['Sleep Length'].ge(4)
filtered_df = filtered_df[is_full_sleep]

Remove any columns that are not important

In [None]:
# List the column names of the filtered cycles data
filtered_df.columns

In [None]:
# Drop the columns that are not needed for the analysis
columns_to_drop = ['Cycle timezone']
filtered_df = filtered_df.drop(columns_to_drop, axis='columns')

Next we can check for outliers

In [None]:
import matplotlib.pyplot as plt

# Variables whose distributions are inspected for outliers
variables_to_plot = [
    'Recovery score %',
    'Resting heart rate (bpm)',
    'Heart rate variability (ms)',
    'Skin temp (celsius)',
    'Blood oxygen %',
    'Day Strain',
    'Energy burned (cal)',
    'Max HR (bpm)',
    'Average HR (bpm)',
    'Sleep performance %',
    'Respiratory rate (rpm)',
    'Asleep duration (min)',
    'In bed duration (min)',
    'Light sleep duration (min)',
    'Deep (SWS) duration (min)',
    'REM duration (min)',
    'Awake duration (min)',
    'Sleep need (min)',
    'Sleep debt (min)',
    'Sleep efficiency %',
]

# One subplot row per variable, stacked vertically
n_variables = len(variables_to_plot)
fig, axes = plt.subplots(nrows=n_variables, figsize=(8, 4 * n_variables))

# Draw the box plot of each variable on its own axis
for ax, variable in zip(axes, variables_to_plot):
    ax.boxplot(filtered_df[variable])
    ax.set_title(f"Box Plot of {variable}")
    ax.set_ylabel(variable)

# Adjust the layout to prevent overlapping titles, then render the figure
plt.tight_layout()
plt.show()


It appears that there are a number of recovery scores that are quite low, so let's investigate that.

In [None]:
# Show the cycles with a recovery score below 20%
filtered_df.loc[filtered_df['Recovery score %'].lt(20)]

I know (from my own experience) that a recovery score is often extremely low after drinking alcohol. Therefore it could be useful to include the journal data here. It is important to ensure that the cycles match across the different datasets.

In [None]:
# Drop the journal columns that are not needed for the analysis
columns_to_drop = ['Cycle timezone', 'Notes']
journals = journals.drop(columns_to_drop, axis='columns')

In [None]:
# Preview the journal entries after dropping the unused columns
journals.head(n=5)

In [None]:
# Reshape the journal entries to wide format: one row per
# (cycle start, cycle end) pair and one column per journal question,
# holding that question's 'Answered yes' value.
journals = (
    journals
    .pivot(
        index=['Cycle start time', 'Cycle end time'],
        columns='Question text',
        values='Answered yes',
    )
    .reset_index()
)
journals

In [None]:
# Count how many cycles have no matching journal entry, i.e. how many rows an
# inner merge drops compared with a left merge. Comparing the two row counts
# (len(filtered_df) - len(journals)) is not reliable: journals can contain
# cycles that are not in filtered_df, so the keys are matched explicitly.
merge_keys = ['Cycle start time', 'Cycle end time']
# Parse the journal timestamps on a local copy of the key columns so the
# comparison uses datetimes on both sides (to_datetime is a no-op on values
# that are already datetimes).
journal_keys = pd.DataFrame(
    {key: pd.to_datetime(journals[key], dayfirst=True) for key in merge_keys}
).drop_duplicates()
# indicator=True adds a '_merge' column; 'left_only' marks cycles without a journal row
match_status = filtered_df[merge_keys].merge(
    journal_keys, on=merge_keys, how='left', indicator=True
)['_merge']
rows_lost = int((match_status == 'left_only').sum())
print("By using an inner merge as opposed to a left merge we lose", rows_lost, "of the", len(filtered_df), "rows.")

We want to combine the cycles data with the journals data. By doing this we will reduce the size of our data but I think in this situation it is acceptable because the journal questions are crucial in understanding the cycles data. However it would obviously be much better if we had more data.

In [None]:
# Parse the journal cycle timestamps (day-first) into datetimes
for time_col in ('Cycle start time', 'Cycle end time'):
    journals[time_col] = pd.to_datetime(journals[time_col], dayfirst=True)

In [None]:
# Inner-join the cycles with the journal answers on the cycle start/end
# timestamps; only cycles present in both frames are kept. Missing journal
# answers are left as NaN (they are not filled).
merged_df = pd.merge(
    filtered_df,
    journals,
    how='inner',
    on=['Cycle start time', 'Cycle end time'],
)

In [None]:
# Display the merged cycles + journal data
merged_df

Look at outliers in the new data

In [None]:
# Variables whose distributions are inspected for outliers in the merged data
variables_to_plot = [
    'Recovery score %',
    'Resting heart rate (bpm)',
    'Heart rate variability (ms)',
    'Skin temp (celsius)',
    'Blood oxygen %',
    'Day Strain',
    'Energy burned (cal)',
    'Max HR (bpm)',
    'Average HR (bpm)',
    'Sleep performance %',
    'Respiratory rate (rpm)',
    'Asleep duration (min)',
    'In bed duration (min)',
    'Light sleep duration (min)',
    'Deep (SWS) duration (min)',
    'REM duration (min)',
    'Awake duration (min)',
    'Sleep need (min)',
    'Sleep debt (min)',
    'Sleep efficiency %',
]

# One subplot row per variable, stacked vertically
n_variables = len(variables_to_plot)
fig, axes = plt.subplots(nrows=n_variables, figsize=(8, 4 * n_variables))

# Draw the box plot of each variable on its own axis
for ax, variable in zip(axes, variables_to_plot):
    ax.boxplot(merged_df[variable])
    ax.set_title(f"Box Plot of {variable}")
    ax.set_ylabel(variable)

# Adjust the layout to prevent overlapping titles, then render the figure
plt.tight_layout()
plt.show()

In [None]:
# Show the merged cycles with a recovery score below 40%
merged_df.loc[merged_df['Recovery score %'].lt(40)]

We will look into this more but it does seem like some factors such as alcohol can decrease recovery score so I am not going to remove these outliers for now.

In [None]:
# Show the merged cycles with a resting heart rate above 55 bpm
merged_df.loc[merged_df['Resting heart rate (bpm)'].gt(55)]


Again, I am happy that these outliers could be related to factors such as alcohol consumption.

In [None]:
# Write the cleaned, merged data to CSV (the DataFrame index is written as the first column)
output_path = "./data/cleaned_cyles.csv"
merged_df.to_csv(output_path)