In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functions import clean_cycles, clean_sleep_length, add_journal_data, create_features, external_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sagemaker
import boto3
from sagemaker import Session
import io # The io module allows for dealing with various types of I/O (text I/O, binary I/O and raw I/O). 
import sagemaker.amazon.common as smac # sagemaker common libary
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput


In [None]:
# Load every raw input table from the local data folder in one pass.
cycles, journals, sleeps, workouts, weather_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/physiological_cycles.csv",
        "data/journal_entries.csv",
        "data/sleeps.csv",
        "data/workouts.csv",
        "data/daily_weather_data.csv",
    )
)

In [None]:
# Build the modelling table step by step with the helpers imported from
# functions.py. Each step takes the previous step's result; the journal and
# weather tables are passed in at the steps that need them.
# NOTE(review): the helpers are not visible in this notebook, so what each one
# actually changes (columns added/dropped, rows filtered) should be confirmed
# in functions.py.
df1 = clean_cycles(cycles)
df2 = clean_sleep_length(df1)
df3 = add_journal_data(df2, journals)
df4 = create_features(df3)
final_data = external_data(df4, weather_data)

In [None]:
#final_data.to_csv("final_data.csv")

In [None]:
# Convert the yes/no journal answers to numeric 0/1 flags.
# Every column uses the same rule, so the rule lives in one helper that is
# applied in a loop instead of seventeen copy-pasted lambdas.
BOOLEAN_JOURNAL_COLUMNS = [
    'Experience bloating?',
    'Experiencing COVID-19 symptoms',
    'Experiencing a fever',
    'Feel energized throughout the day?',
    'Felt nervous or anxious',
    'Felt recovered',
    'Felt you had control over your life',
    'Have any alcoholic drinks?',
    'Have any caffeine? ',  # the trailing space is part of the column name
    'Hydrated sufficiently',
    'Read (non-screened device) while in bed?',
    'See artificial light upon waking up?',
    'See direct sunlight upon waking up?',
    'Spend time outdoors?',
    'Take creatine?',
    'Travel on a plane?',
    'Viewed a screen device in bed',
]


def _flag_to_int(value):
    """Map one journal answer to 0 (a "false" answer) or 1 (anything else).

    Treated as 0:
      * the text 'FALSE' in any letter case, with or without surrounding spaces;
      * a real boolean False;
      * a numeric 0, so re-running this cell on already-converted data does
        not flip existing 0 flags to 1.
    Everything else, including missing values (NaN/None), maps to 1, exactly
    as the original ``0 if x == 'FALSE' else 1`` rule did.
    """
    if isinstance(value, str):
        return 0 if value.strip().upper() == 'FALSE' else 1
    # Non-string values: False and 0 compare equal to 0; NaN and None do not.
    return 0 if value == 0 else 1


for column in BOOLEAN_JOURNAL_COLUMNS:
    final_data[column] = final_data[column].apply(_flag_to_int)

### create training and testing data sets

In [None]:
# Target is the recovery score; features are all remaining columns except the
# target itself and the timestamp / date / cycle-length columns listed here.
non_feature_columns = [
    'Recovery score %',
    'Cycle start time',
    'Cycle end time',
    'Sleep onset',
    'Wake onset',
    'start date',
    'end date',
    'date',
    'Cycle length',
]
y = final_data['Recovery score %']
X = final_data.drop(columns=non_feature_columns)

In [None]:
# Move from pandas objects to plain float32 NumPy arrays.
X = np.asarray(X).astype(np.float32)
y = np.asarray(y).astype(np.float32)

In [None]:
# Turn the target into a single-column 2-D array (one row per sample).
y = np.reshape(y, (-1, 1))

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize features and target before training.
# Each scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split are used for fitting.
scaler_x = StandardScaler().fit(X_train)
X_train = scaler_x.transform(X_train)
X_test = scaler_x.transform(X_test)

scaler_y = StandardScaler().fit(y_train)
y_train = scaler_y.transform(y_train)
y_test = scaler_y.transform(y_test)

### set up data in sagemaker for linear learner

In [None]:
# Boto3 is the AWS SDK for Python; it is used further down to talk to S3.

# Create one SageMaker session and reuse it for everything that follows.
sagemaker_session = sagemaker.Session()

# S3 location for this project's data: the session's default bucket plus a
# project sub-folder (prefix).
bucket = sagemaker_session.default_bucket()
prefix = 'whoop_project'

# Execution role of the notebook instance; handed to the estimators later.
role = sagemaker.get_execution_role()

In [None]:
# Linear Learner is fed RecordIO data here, so the NumPy training arrays are
# serialized into that format in an in-memory byte buffer.
buf = io.BytesIO()

# Features go in as a 2-D array, labels as a flat 1-D vector.
smac.write_numpy_to_dense_tensor(buf, X_train, np.ravel(y_train))

# Writing leaves the stream position at the end of the data; rewind to the
# start so a later read (the S3 upload) sees the whole payload.
buf.seek(0)

In [None]:
# Upload the RecordIO training data to S3.

# Key refers to the name of the file
key = 'liner-learner-whoop-train-data'

# S3 keys always use '/', so build the key explicitly instead of with
# os.path.join (which uses the local OS separator). The same key is reused
# for the URI below, so the upload target and s3_train_data cannot drift apart.
train_object_key = '{}/train/{}'.format(prefix, key)

# Rewind first: after a previous upload the buffer position is at the end,
# and re-running this cell would otherwise upload an empty object.
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)

# Training data location in S3, passed to the estimator later.
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

In [None]:
# S3 folder that will receive the Linear Learner training output.
output_uri_template = 's3://{}/{}/linear-learner-output'
output_location = output_uri_template.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

### set up linear learner

In [None]:
# Look up the training container image of the built-in Linear Learner
# algorithm for the current region. Only the algorithm name is needed.
# sagemaker.image_uris.retrieve is the SDK v2 replacement for the deprecated
# get_image_uri helper and is what the XGBoost section of this notebook
# already uses. version='1' matches get_image_uri's default repo_version.
container = sagemaker.image_uris.retrieve(
    framework='linear-learner',
    region=boto3.Session().region_name,
    version='1',
)

In [None]:
# Configure the training job: container image, execution role, number and
# type of training instances, S3 output path and the SageMaker session.
# instance_count / instance_type are the SDK v2 names (formerly
# train_instance_count / train_instance_type), matching the XGBoost estimator
# further down.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session,
)

# Hyperparameters.
# feature_dim is taken from the training matrix, so it cannot go stale when
# the feature set changes.
# num_models = 32 trains 32 model variants in parallel and keeps the best one
# (built-in parameter optimization).
linear.set_hyperparameters(
    feature_dim=X_train.shape[1],
    predictor_type='regressor',
    mini_batch_size=5,
    epochs=5,
    num_models=32,
    loss='absolute_loss',
)

# Train on the RecordIO data uploaded to S3.
linear.fit({'train': s3_train_data})

# Training progress can be followed in the CloudWatch logs.

In [None]:
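# Tip: to save money, the Estimator in the previous cell can be created with use_spot_instances=True (this parameter was called train_use_spot_instances in SageMaker SDK v1)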

### Now let's deploy and test the model

In [None]:
# Host the trained Linear Learner model on a single-instance real-time
# endpoint so it can serve predictions.
linear_regressor = linear.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:

# Configure how requests and responses are encoded for the endpoint.
# - The serializer turns the input data into the bytes sent to the endpoint;
#   CSVSerializer sends it as text/csv.
# - The deserializer turns the endpoint's response back into Python objects;
#   JSONDeserializer parses the JSON response.
# In SageMaker SDK v2 the predictor's content_type is a read-only property
# derived from the serializer, so it is not assigned here (assigning it
# raises AttributeError).
linear_regressor.serializer = CSVSerializer()
linear_regressor.deserializer = JSONDeserializer()

In [None]:
# making prediction on the test data
# X_test is the standardized test split; the whole array is sent to the
# deployed endpoint in a single predict call.

result_linear_learner = linear_regressor.predict(X_test)

In [None]:
result_linear_learner # display the raw endpoint response, which is in JSON format

In [None]:
# The response holds one record per test row under 'predictions'; the
# predicted value of each record is stored under 'score'.
prediction_records = result_linear_learner['predictions']
predictions = np.array([record['score'] for record in prediction_records])

In [None]:
# Sanity check: number of predictions returned by the endpoint.
predictions.shape

In [None]:
# Undo the target standardization so predictions and ground truth are back
# in the original recovery-score units.
predictions_column = np.reshape(predictions, (-1, 1))
y_predict_orig = scaler_y.inverse_transform(predictions_column)
y_test_orig = scaler_y.inverse_transform(np.reshape(y_test, (-1, 1)))

In [None]:

# Regression metrics for the Linear Learner, in original target units.
MSE = mean_squared_error(y_test_orig, y_predict_orig)
# RMSE is reported rounded to three decimals.
RMSE = float(format(np.sqrt(MSE), '.3f'))
MAE = mean_absolute_error(y_test_orig, y_predict_orig)
r2 = r2_score(y_test_orig, y_predict_orig)

print('RMSE =', RMSE, '\nMSE =', MSE, '\nMAE =', MAE, '\nR2 =', r2)

In [None]:
# Delete the end-point
# The Linear Learner evaluation is finished, so the hosted endpoint is removed here.

linear_regressor.delete_endpoint()

### set up data for xgboost

In [None]:
# The SageMaker built-in XGBoost algorithm expects CSV training data with the
# target in the first column followed by the feature columns.
# Start from the feature matrix (columns 0..n-1), then put the target in front.
train_data = pd.DataFrame(X_train)
train_data.insert(0, 'Target', y_train[:, 0])

In [None]:
# Display the assembled frame as a visual check: 'Target' first, then the feature columns.
train_data

In [None]:
# Write the training frame to a local CSV without header row or index column,
# so the file contains only the values (target first, then features).
train_data.to_csv('xgboost-whoop-train-data.csv', sep=",", header = False, index = False)

In [None]:
# Upload the XGBoost training CSV to S3.
# bucket, prefix and role are reused from the Linear Learner setup above.
# S3 keys always use '/', so the key is built explicitly instead of with
# os.path.join (which uses the local OS separator).
boto3.Session().resource("s3").Bucket(bucket).Object(
    "{}/{}".format(prefix, "xgboost-whoop-train-data.csv")
).upload_file("xgboost-whoop-train-data.csv")

# SageMaker session used by the XGBoost estimator and image lookup below.
sm_sess = sagemaker.Session()

In [None]:
# Container image of the SageMaker built-in XGBoost algorithm for this region.
# NOTE(review): the "latest" tag and the 'reg:linear' objective below are kept
# as they were; 'reg:linear' is a deprecated alias in newer XGBoost releases,
# so confirm both before switching to a pinned container version.
container = sagemaker.image_uris.retrieve("xgboost", sm_sess.boto_region_name, "latest")

# Point at the exact object uploaded above (including the .csv extension).
# Without the extension the URI only works through S3 prefix matching and
# would also pick up any other object whose key starts with the same text.
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}/xgboost-whoop-train-data.csv".format(bucket, prefix), content_type="csv"
)

# One training instance; the training output is written under xgboost-output.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/xgboost-output".format(bucket, prefix),
    sagemaker_session=sm_sess,
)
xgb.set_hyperparameters(
    max_depth=10,
    objective='reg:linear',
    colsample_bytree=0.3,
    alpha=10,
    eta=0.1,
    num_round=100,
)

# Train on the CSV data in S3.
xgb.fit({"train": s3_input_train})

In [None]:
# Host the trained XGBoost model on a single-instance endpoint; requests are
# sent as CSV.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
)


### Deploy the xgboost model to perform inference 


In [None]:
# making prediction on the test data
# X_test is the standardized test split. No deserializer was set when the
# endpoint was deployed, so the response is parsed in a later cell with
# bytes_2_array.

result_XgBoost = xgb_predictor.predict(X_test)

In [None]:
def bytes_2_array(x):
    """Convert an XGBoost endpoint response into a column vector of predictions.

    The payload is decoded as text and split on commas and newlines, rather
    than slicing the ``b'...'`` wrapper off ``str(x)``. This way
    comma-separated and newline-separated responses both parse, and ``str``
    input works as well.

    Parameters
    ----------
    x : bytes, bytearray or str
        Response payload holding the predicted values as text,
        e.g. ``b'0.12,0.34,0.56'``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_predictions, 1)``; shape ``(0, 1)``
        when the payload contains no values.

    Raises
    ------
    ValueError
        If a non-empty token cannot be parsed as a float.
    """
    if isinstance(x, (bytes, bytearray)):
        text = bytes(x).decode('utf-8')
    else:
        text = str(x)

    # Treat newlines as separators too, then drop empty tokens (for example
    # the one produced by a trailing newline).
    tokens = (token.strip() for token in text.replace('\n', ',').split(','))
    values = [float(token) for token in tokens if token]

    # One prediction per row.
    return np.array(values, dtype='float32').reshape(-1, 1)

In [None]:
# Parse the raw endpoint response into a float32 column vector of predictions.
predicted_values = bytes_2_array(result_XgBoost)

In [None]:
# Make sure the ground truth is a single-column 2-D array, like the predictions.
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Regression metrics for XGBoost.
# The model was trained on the standardized target, so both its predictions
# and y_test are in standardized units. Convert both back to the original
# recovery-score units first, so RMSE / MSE / MAE are directly comparable
# with the Linear Learner metrics computed earlier. R2 does not change under
# this rescaling.
y_test_xgb_orig = scaler_y.inverse_transform(np.asarray(y_test).reshape(-1, 1))
y_pred_xgb_orig = scaler_y.inverse_transform(np.asarray(predicted_values).reshape(-1, 1))

MSE = mean_squared_error(y_test_xgb_orig, y_pred_xgb_orig)
# RMSE is reported rounded to three decimals.
RMSE = float(format(np.sqrt(MSE), '.3f'))
MAE = mean_absolute_error(y_test_xgb_orig, y_pred_xgb_orig)
r2 = r2_score(y_test_xgb_orig, y_pred_xgb_orig)

print('RMSE =', RMSE, '\nMSE =', MSE, '\nMAE =', MAE, '\nR2 =', r2)

In [None]:
# The XGBoost evaluation is finished, so the hosted endpoint is removed here.
xgb_predictor.delete_endpoint()