In [None]:
# note the gitignore file is a hidden file and so cannot be seen in jupyterlab. To see it you can run "mv .gitignore.txt gitignore.txt" in the terminal
# https://stackoverflow.com/questions/52222461/how-to-edit-gitignore-in-jupyter-lab

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the four CSV exports; each name is bound to the DataFrame read from the matching file.
cycles, journals, sleeps, workouts = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/physiological_cycles.csv",
        "data/journal_entries.csv",
        "data/sleeps.csv",
        "data/workouts.csv",
    )
]

In [None]:
# Preview the first five rows of the cycles table
cycles.head(n=5)

In [None]:
# Preview the first five rows of the journal entries table
journals.head(n=5)

In [None]:
# Preview the first five rows of the sleeps table
sleeps.head(n=5)

In [None]:
# Preview the first five rows of the workouts table
workouts.head(n=5)

Let's start by analysing each file individually, and then we could potentially see if we could combine the files as well. Then potentially look at external data on other factors that influence things like recovery and sleep, e.g. was it sunny that day. Look up what factors might influence these things and see if you can find data on them. It turns out cycles actually contains all of the data we need, so focus on that file.

The cycles file contains information on recovery and sleep and the factors that impact these.

In [None]:
# Count the missing (NaN) values in each column of cycles
missing_per_column = cycles.isna().sum()
missing_per_column

We see there are 6 rows that contain NaNs in a number of columns. Through further investigation we see that this occurs when a cycle's start time and end time are not recorded correctly, e.g. a cycle is ended just before midnight and another cycle is then started that runs until midnight. This could be a problem, so we want to investigate the cycle lengths to see if any other anomalies occur.

In [None]:
# Display every row of cycles that has at least one NaN in any column
has_missing = cycles.isna().any(axis="columns")
cycles[has_missing]

In [None]:
# A very small proportion of the rows contain NaNs and these are not values that I could infer in other ways, so I will remove them.
# .copy() gives an independent frame, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning (raised on pandas versions without copy-on-write).
cycles_cleaned = cycles.dropna().copy()
cycles_cleaned

In [None]:
# Parse the timestamp columns and derive the per-cycle date and duration columns.
# The columns are built with .assign() and the name is rebound, instead of writing
# column by column into the dropna() result; this avoids pandas' SettingWithCopyWarning
# (on versions without copy-on-write) while producing the same columns in the same order.
cycle_start = pd.to_datetime(cycles_cleaned['Cycle start time'], dayfirst=True)
cycle_end = pd.to_datetime(cycles_cleaned['Cycle end time'], dayfirst=True)
cycles_cleaned = cycles_cleaned.assign(**{
    # converted date columns (replace the originals in place)
    'Cycle start time': cycle_start,
    'Cycle end time': cycle_end,
    # new date columns
    'start date': cycle_start.dt.date,
    'end date': cycle_end.dt.date,
    # new cycle length column (end time minus start time)
    'Cycle length': cycle_end - cycle_start,
})

In [None]:
# Preview the first five rows of the cleaned cycles table
cycles_cleaned.head(n=5)

In [None]:
# Summary statistics of the cycle lengths, to check whether any naps etc. show up
cycle_lengths = cycles_cleaned['Cycle length']
cycle_lengths.describe()

In [None]:
# Histogram of the cycle lengths, with the timedeltas converted to hours
cycle_length_hours = cycles_cleaned['Cycle length'].dt.total_seconds() / 3600
fig, ax = plt.subplots()
ax.hist(cycle_length_hours, bins=10, edgecolor='k')
ax.set_xlabel('Cycle Length (hours)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Cycle Length')
ax.grid(True)
plt.show()

From this data, and also from my understanding of how WHOOP is intended to work, it seems that cycles are intended to be around 24 hours long. Therefore I will remove any cycles with a length of less than 20 or greater than 30 hours. Otherwise cycles of unusual lengths could skew the data, e.g. a very short cycle could have a skewed recovery rate.

In [None]:
# Keep only the cycles whose length is between 20 and 30 hours (both bounds inclusive)
cycle_hours = cycles_cleaned['Cycle length'].dt.total_seconds() / 3600
within_expected_length = cycle_hours.between(20, 30)
filtered_df = cycles_cleaned[within_expected_length]
filtered_df

In [None]:
# Re-plot the cycle length histogram (in hours) to check that it reflects the filtering we applied
filtered_length_hours = filtered_df['Cycle length'].dt.total_seconds() / 3600
fig, ax = plt.subplots()
ax.hist(filtered_length_hours, bins=10, edgecolor='k')
ax.set_xlabel('Cycle Length (hours)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Cycle Length')
ax.grid(True)
plt.show()

In [None]:
# check for outliers (use plots?)

In [None]:
# check dates and other data types are in correct form

In [None]:
# perform scaling

In [None]:
# perform feature engineering (maybe wait until after initial exploration to do this). Could also wait until we have considered external data to do this.

In [None]:
# check if sagemaker is synced