### Let's do some visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned cycle data. Forward slashes keep the relative path portable
# across operating systems (a backslash path only resolves on Windows) and
# match the weather-data load further down.
df = pd.read_csv("data/cleaned_cyles.csv")

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

# Explore your data to understand its structure and distribution. Look at basic statistics, data visualizations, and summary tables.

In [None]:
# Convert the cycle timestamp and date columns to pandas datetimes so the
# .dt accessor and date comparisons used later in the notebook work on them.
for datetime_column in ('Cycle start time', 'Cycle end time', 'start date', 'end date'):
    df[datetime_column] = pd.to_datetime(df[datetime_column])

In [None]:
# Display the frame: a bare last expression is rendered as the cell output.
df

In [None]:
# Investigate recovery score

In [None]:
# Summary statistics for the recovery score column.
df.loc[:, 'Recovery score %'].describe()

In [None]:
# Investigate if recovery score has any trends over time when grouped by week and month

In [None]:

# Weekly recovery trend.
#
# Each cycle is assigned to the Monday that starts its week, and the
# mean / min / max recovery score is computed per week. Grouping on the Monday
# date itself, rather than on a '%Y-%W' label that is re-parsed afterwards,
# matters for two reasons:
#   * '%Y-%W' + '-0' parsed with '%w' yields the Sunday that ENDS the week
#     (%w == 0 is Sunday, while %W weeks start on Monday), so the value was
#     not a week start date;
#   * '%Y-%W' splits a week that spans New Year into two labels that map to
#     the same date, producing two points at one x position.
cycle_start = df['Cycle start time']
week_start = (
    cycle_start.dt.normalize()                             # drop the time of day
    - pd.to_timedelta(cycle_start.dt.dayofweek, unit='D')  # dayofweek: Monday == 0
).rename('Week_Start_Date')

weekly_summary = (
    df.groupby(week_start)
    .agg(
        Average_Recovery_Score=('Recovery score %', 'mean'),
        Min_Recovery_Score=('Recovery score %', 'min'),
        Max_Recovery_Score=('Recovery score %', 'max'),
    )
    .reset_index()
)

# One line per statistic: (summary column, legend label, line colour).
fig, ax = plt.subplots(figsize=(12, 6))
for stat_column, legend_label, line_colour in (
    ('Average_Recovery_Score', 'Average', 'blue'),
    ('Min_Recovery_Score', 'Minimum', 'red'),
    ('Max_Recovery_Score', 'Maximum', 'green'),
):
    ax.plot(
        weekly_summary['Week_Start_Date'],
        weekly_summary[stat_column],
        label=legend_label,
        color=line_colour,
        marker='o',
        linestyle='-',
    )

ax.set_title('Recovery Scores by Week')
ax.set_xlabel('Week Start Date')
ax.set_ylabel('Recovery Score %')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
ax.legend()
fig.tight_layout()

# Show the plot
plt.show()


In [None]:

# Monthly recovery trend: bucket every cycle by the 'YYYY-MM' of its start
# time and compute the mean / min / max recovery score per bucket.
month_key = df['Cycle start time'].dt.strftime('%Y-%m')
monthly_summary = (
    df.groupby(month_key)
    .agg(
        Average_Recovery_Score=('Recovery score %', 'mean'),
        Min_Recovery_Score=('Recovery score %', 'min'),
        Max_Recovery_Score=('Recovery score %', 'max'),
    )
    .reset_index()
)

# Turn the 'YYYY-MM' labels back into datetimes for the x axis.
monthly_summary['Month_Start_Date'] = pd.to_datetime(monthly_summary['Cycle start time'])

# One line per statistic: (summary column, legend label, line colour).
plt.figure(figsize=(12, 6))
for stat_column, legend_label, line_colour in (
    ('Average_Recovery_Score', 'Average', 'blue'),
    ('Min_Recovery_Score', 'Minimum', 'red'),
    ('Max_Recovery_Score', 'Maximum', 'green'),
):
    plt.plot(
        monthly_summary['Month_Start_Date'],
        monthly_summary[stat_column],
        label=legend_label,
        color=line_colour,
        marker='o',
        linestyle='-',
    )

plt.title('Recovery Scores by Month')
plt.xlabel('Month Start Date')
plt.ylabel('Recovery Score %')
plt.grid(True)
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# Render the figure
plt.show()


# Create visualizations (e.g., line charts, histograms) to gain insights into trends and patterns in your physiological data over time.

In [None]:
# Create histograms to visualize the distribution of recovery scores and sleep scores. This can help you identify whether the data follows a normal distribution or has specific patterns.

In [None]:
# Distribution of recovery scores across all cycles.
recovery_scores = df['Recovery score %']

# 20 bins, drawn as outlined, slightly transparent bars.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(recovery_scores, bins=20, edgecolor='black', alpha=0.7)
ax.set_title('Recovery Score Distribution')
ax.set_xlabel('Recovery Score %')
ax.set_ylabel('Frequency')
ax.grid(True)
fig.tight_layout()

# Render the histogram
plt.show()

The recovery score distribution appears to be left-skewed (it has a longer tail on the left side), so it is worth considering whether this skewness has any implications for your analysis or goals. Here are a few things to think about:

1. Understand the Skewness:

First, make sure you understand why the recovery scores are left-skewed. Is there a specific reason for this distribution? It might be due to the nature of your activities, sleep patterns, or other factors.

2. Consider the Impact:

Left-skewness can imply that the majority of your recovery scores are relatively high, with a few lower scores dragging the distribution to the left. Consider whether these lower scores are outliers or if they represent meaningful patterns in your data.

3. Statistical Techniques:

Depending on your analysis goals, you might need to account for the skewness in your data. For instance:

- If you're performing statistical tests that assume normality (e.g., t-tests or ANOVA), you could consider transforming the data to bring it closer to a normal distribution. For left-skewed data, use a power transformation (e.g., squaring the values) or reflect the values before applying a logarithmic transformation; a plain logarithm compresses the right tail and would make left skew worse.
- When fitting regression models, some models (e.g., linear regression) assume normally distributed residuals. In such cases, you might need to address the skewness in your response variable.
- Alternatively, you could use non-parametric tests or models that don't assume a specific distribution.

In [None]:
# generate a heatmap to visualize the correlation matrix between recovery scores, sleep scores, and other relevant variables

In [None]:
# Recovery, strain and sleep metrics to include in the correlation analysis.
correlation_columns = [
    'Recovery score %',
    'Resting heart rate (bpm)',
    'Heart rate variability (ms)',
    'Skin temp (celsius)',
    'Blood oxygen %',
    'Day Strain',
    'Energy burned (cal)',
    'Max HR (bpm)',
    'Average HR (bpm)',
    'Sleep performance %',
    'Respiratory rate (rpm)',
    'Asleep duration (min)',
    'In bed duration (min)',
    'Light sleep duration (min)',
    'Deep (SWS) duration (min)',
    'REM duration (min)',
    'Awake duration (min)',
    'Sleep need (min)',
    'Sleep debt (min)',
    'Sleep efficiency %',
]
selected_columns = df[correlation_columns]

In [None]:
# Pairwise Pearson correlation between the selected columns (pandas' default
# method, spelled out here for the reader).
correlation_matrix = selected_columns.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix: each cell shows its coefficient
# to two decimals on the 'coolwarm' palette.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

**TODO:** Discuss the findings of this correlation matrix, e.g., which variables have strong relationships.

# perform feature engineering (consider current data and external data)

In [None]:

# Share of total sleep spent in deep (SWS) and REM stages, as percentages.
# A zero 'Asleep duration (min)' is mapped to NaN first so the result is a
# missing value instead of the inf/NaN produced by dividing by zero, which
# would otherwise distort later statistics.
asleep_minutes = df['Asleep duration (min)'].replace(0, np.nan)
df['Percentage Deep Sleep'] = df['Deep (SWS) duration (min)'] / asleep_minutes * 100
df['Percentage REM Sleep'] = df['REM duration (min)'] / asleep_minutes * 100


In [None]:
# Smoothed recovery trend: order the cycles chronologically, then average
# 'Recovery score %' over a trailing window of the last 7 rows.
df = df.sort_values('start date')

# min_periods=1 lets the first rows average whatever history is available
# instead of producing NaN until seven rows have accumulated.
recovery_rolling = df['Recovery score %'].rolling(7, min_periods=1)
df['7-Day Rolling Average'] = recovery_rolling.mean()

In [None]:
# Smoothed training load: trailing average of 'Day Strain' over the last
# 7 rows in chronological order. min_periods=1 averages whatever history is
# available for the first rows instead of producing NaN.
#
# The series is sorted by 'start date' here so this cell does not depend on an
# earlier cell having sorted df; the assignment aligns on the index, so df's
# own row order is left untouched.
strain_by_date = df.sort_values(by='start date')['Day Strain']
df['Strain 7-Day Moving Average'] = strain_by_date.rolling(window=7, min_periods=1).mean()

In [None]:
# Load the daily weather observations that are merged onto the cycles below.
weather_data = pd.read_csv(filepath_or_buffer="data/daily_weather_data.csv")

In [None]:
# Parse the weather dates; dayfirst=True makes pandas read ambiguous dates
# with the day before the month.
weather_data = weather_data.assign(
    date=pd.to_datetime(weather_data['date'], dayfirst=True)
)

In [None]:
# Keep only the date plus the 'rain', 'sun', 'maxtp' and 'mintp' columns.
weather_columns = ['date', 'rain', 'sun', 'maxtp', 'mintp']
weather_data = weather_data.loc[:, weather_columns]

In [None]:
# Restrict the weather records to the window 2022-12-30 .. 2023-05-19.
start_date = '2022-12-30'
end_date = '2023-05-19'
in_window = weather_data['date'].between(start_date, end_date)  # inclusive on both ends
weather_data = weather_data[in_window]
weather_data

In [None]:
# Attach weather to each cycle by matching the cycle's 'end date' to the
# weather 'date'. The inner join keeps only cycles with a matching weather row.
merged_df = pd.merge(
    df,
    weather_data,
    how='inner',
    left_on='end date',
    right_on='date',
)

In [None]:
# Preview the first eight rows of the merged cycle + weather data.
merged_df.head(n=8)

In [None]:
# Time series analysis techniques can help you understand how your physiological parameters change over time. You can use techniques like autocorrelation and decomposition to identify patterns and trends.

# Recovery Analysis

In [None]:
# perform scaling

In [None]:
# Conduct statistical tests (e.g., t-tests, correlation analysis) to determine relationships between different variables.

In [None]:
# Perform time series analysis, regression analysis, or machine learning models to predict future recovery or sleep patterns

# Sleep Analysis

In [None]:
# Conduct statistical tests (e.g., t-tests, correlation analysis) to determine relationships between different variables.

In [None]:
# Perform time series analysis, regression analysis, or machine learning models to predict future recovery or sleep patterns