In [None]:
from datetime import datetime
import json
import keyring
import requests

import fitbit
import gather_keys_oauth2 as Oauth2
import pandas as pd 
from pprint import pprint
import matplotlib.pyplot as plt

## Process Flow
1. Set up authorization for using the FitBit API.
2. Pull data from FitBit into a list of dictionaries ('activity_data') for a particular endpoint.
3. Prepare an endpoint data frame for analysis. 
4. Analyze.

### Alternative - use data in CSV file instead of connecting to FitBit
If you are analyzing FitBit data stored in a CSV file:
1. Skip Steps 1, 2 and 3 in the process flow above 
2. Go to "3 Alternative" set of functions to read the CSV data and create the endpoint data frame structure. 
3. Continue with Step 4 to start your analysis. 

An example CSV file is in the repository: example_sleep.csv 

# 1 - Set up authorization for using the FitBit API

First we have to set up authorization for the FitBit API. These instructions show you how to set up Fitbit so that you can connect to their api. 

https://towardsdatascience.com/collect-your-own-fitbit-data-with-python-ff145fa10873

At the moment, the first chunk of the code in this notebook is copied directly from this post.


When the directions mention secrets and keys, you'll notice that the code in this notebook stores the key and secret using the keyring library. This library helps you manage your keys and IDs (so that if you share your code, you don't share your credentials!).

Here's a great link on how/why to use the keyring library.

https://alexwlchan.net/2016/11/you-should-use-keyring/

One last thing to note is that while we import the fitbit library, we're really only using it for authentication. In other words, we stop following the instruction after step two. Why is this? The fitbit python library calls the fitbit api in units of one day. And the fitbit api limits a single user's calls to 150 per hour, which means that if we used this library, we'd be limited to grabbing only 5 months of data at a time. 

Instead, we're going to create some functions that interact directly with the fitbit API so that we can grab a range of days' worth of data at a time.

To be clear, sometimes you might want to get a single day's worth of data (and there's some code that does exactly that at the bottom of this notebook), but for this analysis, I'm more interested in trends across days than within days.

In [None]:
# The client id and secret are read from the system keyring, so neither value
# appears in the notebook itself.
CLIENT_ID = keyring.get_password("fitbit", "key")
CLIENT_SECRET = keyring.get_password("fitbit", "secret")

# Run the browser-based OAuth2 authorization, then read both tokens from the
# token mapping held by the authorized session.
server = Oauth2.OAuth2Server(CLIENT_ID, CLIENT_SECRET)
server.browser_authorize()
oauth_token = server.fitbit.client.session.token
ACCESS_TOKEN = str(oauth_token['access_token'])
REFRESH_TOKEN = str(oauth_token['refresh_token'])

# Client from the fitbit library; the direct API calls in this notebook only
# need ACCESS_TOKEN.
auth2_client = fitbit.Fitbit(
    CLIENT_ID,
    CLIENT_SECRET,
    oauth2=True,
    access_token=ACCESS_TOKEN,
    refresh_token=REFRESH_TOKEN,
)

Now we're authorized to pull data from the Fitbit API. If you've never interacted with an API before, that won't keep you from moving forward with this analysis. Here are the only pieces of information you'll need to understand for this analysis:

1. APIs are tools that entities provide to allow your program to connect directly to their data. It's how we request data instead of using a user interface. In Fitbit's case, the API provides more complete and detailed access to your data than is available in the UI download interface.

2. APIs let you 'get', 'post', 'delete', and 'patch'(edit) data. We'll only 'get' data, using the python 'requests' library.

3. Well-designed APIs use consistent URL formats to structure API calls. This takes the form of a URL. Getting, posting (etc.) data involves:
    1. using the correct verb from the requests library (get, post, etc.)
    2. structuring the text of the URL to meet the pattern that the API in question uses.


Here's an example URL from the Fitbit api
 "https://api.fitbit.com/1.2/user/-/sleep/date/2018-04-02/2018-04-08.json"
 
This breaks down into the following pattern:

"https://api.fitbit.com/1.2/user/-/" + endpoint + "/date/" + start_date + "/" + end_date + ".json"

We'll use this to build a generic function that takes the endpoint name, start_date, and end_date.

Things that other APIs might be able to do that the Fitbit API cannot:
    
    -Handle queries about the data: for example, we can't ask the Fitbit API questions like "How many days in the past month have I slept less than 6 hours?"

# 2 - Pull data from FitBit into a list of dictionaries ('activity_data')

In [None]:
# getEndpointData is a generic function that lets us retrieve data from any fitbit api endpoint we want
def getEndpointData(endpoint, start_date, end_date, timeout=30):
    """Fetch one date range of data for a Fitbit API endpoint.

    Builds the URL
    "https://api.fitbit.com/1.2/user/-/<endpoint>/date/<start_date>/<end_date>.json"
    and sends a GET request authorized with the notebook-level ACCESS_TOKEN.

    Parameters
    ----------
    endpoint : str
        Path fragment placed between "user/-/" and "/date/", e.g. "sleep" or
        "activities/minutesVeryActive".
    start_date, end_date : str
        Inserted into the URL unchanged. The notebook always passes
        YYYY-MM-DD strings (e.g. '2018-04-28'); the format is not validated
        here, so a malformed value surfaces as an API error.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The decoded JSON body when the response status is 200. Otherwise the
    string "ERROR", after printing the response body or the network error.
    """
    # TODO: add defensive checks that start_date and end_date are well formed.
    url = "https://api.fitbit.com/1.2/user/-/" + endpoint + "/date/" + start_date + "/" + end_date + ".json"
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        results = requests.get(
            url=url,
            headers={'Authorization': 'Bearer ' + ACCESS_TOKEN},
            timeout=timeout,
        )
    except requests.exceptions.RequestException as error:
        # Report network failures through the same sentinel as HTTP errors so
        # callers that test for "ERROR" can stop cleanly.
        print("Request to {} failed: {}".format(url, error))
        return "ERROR"

    if results.status_code == 200:
        return json.loads(results.text)

    print(results.text)
    return "ERROR"
    

Let's also build a function that makes our initial dates go backwards in time. This gives us an easy way to loop back through all of the data we have stored in fitbit for a given endpoint.

In [None]:
# By default the window moves back 30 days per call (roughly one month at a
# time); pass `days` to use a different step.
# NOTE: this import rebinds the name `datetime` from the class imported in the
# first cell to the module, which is what the `datetime.datetime` /
# `datetime.timedelta` calls below and in later cells rely on.
import datetime

def makeDatesEarlier(start_date, end_date, days=30):
    """Shift a (start_date, end_date) window back in time by `days` days.

    Parameters
    ----------
    start_date, end_date : str
        Dates in YYYY-MM-DD format.
    days : int, optional
        Number of days to move both dates back (default 30).

    Returns
    -------
    tuple of (str, str)
        The shifted (start_date, end_date), again formatted as YYYY-MM-DD.

    Raises
    ------
    ValueError
        If either date does not match the YYYY-MM-DD format.
    """
    date_format = "%Y-%m-%d"
    step = datetime.timedelta(days=days)
    earlier_start = datetime.datetime.strptime(start_date, date_format) - step
    earlier_end = datetime.datetime.strptime(end_date, date_format) - step
    return earlier_start.strftime(date_format), earlier_end.strftime(date_format)

Now let's try this out with some sleep data!

In [None]:
endpoint = "sleep"

end_date = "2018-04-28"
start_date = "2018-03-29"

activity_data = getEndpointData(endpoint, start_date, end_date)

# 3 - Prepare an endpoint data frame for analysis. 

In [None]:
def processSleepResults(activity_data, sleep_summaries, sleep_time_events_detail):
    """Split one sleep API response into per-log summaries and stage detail.

    For every sleep log under activity_data['sleep'], the ['levels']['data']
    list is moved into sleep_time_events_detail, ['levels']['shortData'] is
    discarded when present, and what remains of the log is appended to
    sleep_summaries. The log dictionaries inside activity_data are modified
    in place, and both accumulator lists are extended in place.

    Parameters
    ----------
    activity_data : dict
        Decoded response from the sleep endpoint.
    sleep_summaries : list
        Accumulator for the trimmed sleep logs.
    sleep_time_events_detail : list
        Accumulator for each log's ['levels']['data'] entries.

    Returns
    -------
    tuple
        (sleep_summaries, sleep_time_events_detail, status), where status is
        "stop" when the response has no sleep logs (missing or empty 'sleep'
        entry) and "continue" otherwise.

    Raises
    ------
    KeyError
        If a sleep log has no 'levels' entry or no ['levels']['data'] entry.
    """
    sleep_events = activity_data.get('sleep')
    if not sleep_events:
        # sleep endpoint no longer returns results
        print("no more sleep data!")
        return sleep_summaries, sleep_time_events_detail, "stop"

    for sleep_event in sleep_events:
        levels = sleep_event['levels']
        sleep_time_events_detail.append(levels.pop('data'))
        # Not every log carries shortData (the original author's note: naps
        # have none), so its absence is expected rather than an error.
        levels.pop('shortData', None)
        sleep_summaries.append(sleep_event)
    return sleep_summaries, sleep_time_events_detail, "continue"

In [None]:
endpoint = "sleep"
sleep_summaries = []
sleep_time_events_detail = []

# The starting window runs from 2018-04-20 to 2018-05-19: the start date is one
# month back from the end date, plus one day. makeDatesEarlier then moves both
# dates further back after every successful request.
end_date = "2018-05-19"
start_date = "2018-04-20"

status = "continue"

# Keep requesting earlier windows until processSleepResults reports "stop"
# or a request comes back as "ERROR".
while status == "continue":
    activity_data = getEndpointData(endpoint, start_date, end_date)
    if activity_data == "ERROR":
        break
    sleep_summaries, sleep_time_events_detail, status = processSleepResults(
        activity_data, sleep_summaries, sleep_time_events_detail
    )
    start_date, end_date = makeDatesEarlier(start_date, end_date)
    print("start date: {}, end date: {}".format(start_date, end_date))

In [None]:
pprint((sleep_summaries))

In [None]:
# Flatten every sleep log in sleep_summaries into one row (a dict) of sleep_list.
#
# 'end_date' is the calendar date of the log's endTime moved back one day; the
# original note here is that the end of sleep "will always be the day after the
# activity time". Each row also carries minutesAsleep, efficiency, the bed and
# wake-up times, and the minutes/count pair of each stage found under
# ['levels']['summary'].

sleep_list = []

#open question: naps?

# make note later about disappointing 30-day-average bs

SLEEP_STAGES = ('deep', 'light', 'rem', 'wake')

for sleep_summary in sleep_summaries:
    end_time = sleep_summary['endTime']
    end_date, wakeup_time = end_time.split("T")
    wakeup_time = datetime.datetime.strptime(wakeup_time, "%H:%M:%S.%f").time()
    start_time = sleep_summary['startTime']
    start_date, bed_time = start_time.split("T")
    bed_time = datetime.datetime.strptime(bed_time, "%H:%M:%S.%f").time()
    # convert end_date to a date instead of string, shifted back by one day
    end_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=1)).date()
    minutes_asleep = sleep_summary['minutesAsleep']
    sleep_efficiency = sleep_summary['efficiency']

    try:
        stage_summary = sleep_summary['levels']['summary']
        stage_columns = {}
        for stage in SLEEP_STAGES:
            stage_columns[stage + '_minutes'] = stage_summary[stage]['minutes']
            stage_columns[stage + '_count'] = stage_summary[stage]['count']
    except (KeyError, TypeError):
        # we're skipping nap data: logs without the full deep/light/rem/wake
        # summary are left out. Only missing or malformed stage entries are
        # skipped; any other error now surfaces instead of silently dropping rows.
        continue

    # Key order matches the original row layout. 'start_data' (sic) is kept
    # because the CSV alternative uses the same column name.
    sleep_row = {'end_date': end_date, 'wakeup_time': wakeup_time, 'minutes_asleep': minutes_asleep}
    sleep_row.update(stage_columns)
    sleep_row.update({'efficiency': sleep_efficiency, 'start_data': start_date, 'bed_time': bed_time})
    sleep_list.append(sleep_row)

In [None]:
sleep_df = pd.DataFrame(sleep_list)
print(sleep_df)

# 3 (CSV alternative) - Read FitBit data in from a CSV file
To avoid running this portion inadvertently, toggle the 'use_csv' boolean.

In [None]:
use_csv = False

from tkinter import filedialog
import csv

def csv_to_list(csv_filename):
    '''Return the rows of the given CSV file as a list of lists of strings.

    Every row, including any header row, becomes one inner list. An empty
    file gives an empty list.
    '''
    # newline='' is what the csv module documentation asks for: it lets the
    # reader handle line endings itself, so a "\r\n" inside a quoted field is
    # preserved instead of being rewritten by the text layer.
    with open(csv_filename, newline='') as f:
        return list(csv.reader(f))


if not use_csv:
    pass
else:
    # Build sleep_list and sleep_df from a CSV file instead of the API
    # Improve this by adding endpoint as a parameter so this function can be repeatable for any endpoint

    sleep_data = csv_to_list(filedialog.askopenfilename(title='Select FitBit CSV', filetypes=[("CSV","*.csv"),("All files","*.*")]))

    # Cleanup: drop the first column (an index) from every row. Blank lines come
    # back from csv.reader as empty lists and are ignored.
    sleep_data = [row[1:] for row in sleep_data if row]

    # Convert sleep_data to sleep_list list of dictionaries to match 'fitbit messing around' data frame
    sleep_summaries = sleep_data[1:]  # Don't include first list which is column headers

    # For reference, this is the list of columns with example formats
    # (positions are fixed; the header row itself is not consulted):
    #   0 bed_time: 1900-01-01 23:17:00
    #   1 deep_count: 4
    #   2 deep_minutes: 58
    #   3 efficiency: 98
    #   4 end_date: 2017-06-09 00:00:00
    #   5 light_count: 30
    #   6 light_minutes: 258
    #   7 minutes_asleep: 509
    #   8 rem_count: 8
    #   9 rem_minutes: 193
    #   10 start_data: 2017-06-09
    #   11 wake_count: 24
    #   12 wake_minutes: 39
    #   13 wakeup_time: 1900-01-01 08:25:30
    csv_columns = (
        'bed_time', 'deep_count', 'deep_minutes', 'efficiency', 'end_date',
        'light_count', 'light_minutes', 'minutes_asleep', 'rem_count',
        'rem_minutes', 'start_data', 'wake_count', 'wake_minutes', 'wakeup_time',
    )
    numeric_columns = [
        'deep_count', 'deep_minutes', 'efficiency', 'light_count', 'light_minutes',
        'minutes_asleep', 'rem_count', 'rem_minutes', 'wake_count', 'wake_minutes',
    ]

    # Indexing by position keeps the original strictness: a row with fewer than
    # 14 data columns raises IndexError rather than being loaded partially.
    sleep_list = [
        {column: sleep_summary[position] for position, column in enumerate(csv_columns)}
        for sleep_summary in sleep_summaries
    ]

    sleep_df = pd.DataFrame(sleep_list)

    # csv.reader returns every value as text. The count/minute/efficiency columns
    # must be numeric for the histograms and comparisons in the analysis section.
    # The date and time columns are left as the strings found in the file.
    if not sleep_df.empty:
        sleep_df[numeric_columns] = sleep_df[numeric_columns].apply(pd.to_numeric)
    

In [None]:
print(sleep_df)

# 4 - Analyze

In [None]:
sleep_df.to_csv(path_or_buf = "melissa_sleep.csv")

In [None]:
plt.hist(sleep_df['efficiency'])
plt.show()

In [None]:
plt.hist(sleep_df['deep_minutes'])
plt.show()

In [None]:
plt.hist(sleep_df['wake_count'])
plt.show()

In [None]:
sleep_df[sleep_df['efficiency'] < 92]

In [None]:
plt.hist(sleep_df['wake_minutes'])
plt.show()

Our question: How is sleep impacted by activity level?

veryActiveMinutes and fairlyActiveMinutes are how we're measuring activity level.

Do I get more deep sleep the more very and fairly active minutes I have?

In [None]:
plt.hist(sleep_df['minutes_asleep'])
plt.show()

In [None]:
sleep_df['minutes_asleep'].mean()

# End Analysis for Sleep Endpoint
# ------------------------------------------------

# Begin Data Build and Analysis (steps 2-4) for Activity Endpoint(s)

In [None]:
def convertActiveMinuteData(data, endpoint):
    """Turn one activity time-series response into a two-column data frame.

    The response is expected to hold its records under the endpoint name with
    "/" replaced by "-" (e.g. "activities/minutesVeryActive" is read from
    data["activities-minutesVeryActive"]). Each record supplies 'dateTime'
    (YYYY-MM-DD) and 'value'.

    Parameters
    ----------
    data : dict
        Decoded API response for `endpoint`.
    endpoint : str
        Endpoint in "<group>/<measure>" form. The part after the separator
        becomes the value column's name, so an endpoint without a "/" or "-"
        raises IndexError.

    Returns
    -------
    pandas.DataFrame
        Columns 'end_date' (datetime parsed from 'dateTime') and the measure
        name (int(value)). An empty series still produces both columns, so a
        later merge on 'end_date' has its key available.

    Raises
    ------
    ValueError
        If a 'dateTime' is not YYYY-MM-DD or a 'value' is not an integer string.
    """
    response_key = endpoint.replace(r"/", "-")
    column_name = response_key.split("-")[1]
    records = [
        {
            'end_date': datetime.datetime.strptime(line['dateTime'], "%Y-%m-%d"),
            column_name: int(line['value']),
        }
        for line in data[response_key]
    ]
    return pd.DataFrame(records, columns=['end_date', column_name])

In [None]:
# list of activities endpoints for reference

endpoints = ["activities/calories", "activities/caloriesBMR", "activities/steps", "activities/distance", 
             "activities/floors", "activities/elevation", "activities/minutesSedentary", 
             "activities/minutesLightlyActive", "activities/minutesFairlyActive", "activities/minutesVeryActive",
            "activities/activityCalories"]



In [None]:
import datetime

start_date = '2017-06-10' 
end_date = '2018-05-19'


def _pullActivityMinutes(activity_endpoint):
    """Fetch one activities endpoint for start_date..end_date.

    Returns the raw API response together with the data frame built from it.
    """
    raw_response = getEndpointData(activity_endpoint, start_date, end_date)
    return raw_response, convertActiveMinuteData(raw_response, activity_endpoint)


# Raw response plus date/minutes frame for each activity level. The *List
# names are kept from the original cell even though each one holds a DataFrame.
veryActiveData, veryActiveList = _pullActivityMinutes("activities/minutesVeryActive")
fairlyActiveData, fairlyActiveList = _pullActivityMinutes("activities/minutesFairlyActive")
sedentaryActiveData, sedentaryActiveList = _pullActivityMinutes("activities/minutesSedentary")
lightlyActiveData, lightlyActiveList = _pullActivityMinutes("activities/minutesLightlyActive")

# `endpoint` ends up naming the last endpoint requested, as before.
endpoint = "activities/minutesLightlyActive"

In [None]:
# activity score. 
# dailyminutes (1440) = sedentaryMinutes + all active minutes

# of the waking time, how much was active, and how much was sedentary?

# use average number of asleep minutes - 407 to subtract from sedentaryMinutes and dailyMinutes

# what percentage of time was I active (out of awake minutes)?
# all_active_minutes / (1440-407)

# all_active_minutes / 1033

def calculatePercent(input_list, target):
    """Return `target` as a share of the total of `input_list`.

    Despite the name, the result is a fraction (e.g. 0.25), not a value
    multiplied by 100.

    Parameters
    ----------
    input_list : iterable of numbers
        The parts that make up the whole.
    target : number
        The part whose share is wanted.

    Returns
    -------
    float
        target / sum(input_list), or NaN when the total is zero (for example
        a day with no active minutes at all), where the share is undefined.
    """
    total = sum(input_list)
    if total == 0:
        return float('nan')
    return target / total




In [None]:
# The activity frames key on datetimes (convertActiveMinuteData parses
# 'dateTime' with strptime), while sleep_df['end_date'] holds date objects on
# the API path or strings on the CSV path. Convert the sleep key to datetime
# first so all five frames join on the same type; sleep_df itself is untouched.
sleep_keyed = sleep_df.assign(end_date=pd.to_datetime(sleep_df['end_date']))

# Inner joins: only days present in every frame survive.
activity_data = (
    veryActiveList
    .merge(fairlyActiveList, on='end_date')
    .merge(sedentaryActiveList, on='end_date')
    .merge(lightlyActiveList, on='end_date')
    .merge(sleep_keyed, on='end_date')
    .fillna(0)
)

print(activity_data)

# Investigate data further using Tableau

In [None]:
# Trying to see if very_active_minutes taken as a percent of total active time are 
# able to predict deep sleep

# Share of all active minutes (lightly + fairly + very) that were very active.
# Computed column-wise: a day with zero active minutes yields NaN instead of
# raising ZeroDivisionError, and is simply not drawn in the scatter plot.
total_active_minutes = (
    activity_data['minutesLightlyActive']
    + activity_data['minutesVeryActive']
    + activity_data['minutesFairlyActive']
)
activity_data['very_active_percent'] = activity_data['minutesVeryActive'] / total_active_minutes

plt.scatter(activity_data['very_active_percent'], activity_data['deep_minutes'])
plt.xlabel('very active share of active minutes')
plt.ylabel('deep sleep minutes')
plt.show()

It would seem that very_active_minutes are not a good predictor of the number of deep sleep minutes.

Do very active minutes decrease the number of waking events during the night?



In [None]:

plt.scatter(activity_data['very_active_percent'], activity_data['wake_count'])
plt.show()

In [None]:
plt.scatter((activity_data['minutesVeryActive']+activity_data['minutesFairlyActive']),activity_data['deep_minutes']/activity_data['minutes_asleep'])
plt.show()



In [None]:
endpoint = "activities/heart"
end_date = "2018-06-08"
start_date = "2018-05-09"

heart_rate_data = getEndpointData(endpoint, start_date, end_date)

In [None]:
pprint(heart_rate_data)

In [None]:


pprint(heart_rate_data['activities-heart'][0])

In [None]:
import datetime

# Flatten each day's heart-rate record into one row: the resting heart rate,
# the date (as a date object and as the original string), and a
# "<zone name>_minutes" / "<zone name>_calories" pair for every heart-rate zone.
heart_list = []

for record in heart_rate_data['activities-heart']:
    day_values = record['value']
    heart_dict = {
        # NOTE(review): .get() assumes some days can lack restingHeartRate
        # (presumably when the tracker was not worn) - confirm against the API.
        # Such days become NaN in heart_df instead of aborting the loop.
        'resting_heart_rate': day_values.get('restingHeartRate'),
        'date_obj': datetime.datetime.strptime(record['dateTime'], "%Y-%m-%d").date(),
        'date_str': record['dateTime'],
    }
    for zone in day_values['heartRateZones']:
        # Same tolerance for zones that come back without minutes/caloriesOut.
        heart_dict[zone['name'] + "_minutes"] = zone.get('minutes')
        heart_dict[zone['name'] + "_calories"] = zone.get('caloriesOut')
    heart_list.append(heart_dict)

heart_df = pd.DataFrame(heart_list)

In [None]:
print(heart_df)

In [None]:
import matplotlib

plt.hist(heart_df['resting_heart_rate'], bins=7)
plt.show()

In [None]:
plt.hist(heart_df['Fat Burn_minutes'])
plt.show()

In [None]:
plt.hist(heart_df['Cardio_minutes'])
plt.show()

In [None]:
plt.scatter(heart_df['Cardio_minutes'], heart_df['resting_heart_rate'])
plt.show()

Below we have some code Melissa wrote to originally interact with the fitbit library (the one that runs into that 150 calls per hour limit.)  We're keeping the code here in case it's helpful as we're building out the other data sets.

In [None]:
#yesterday2 = str((datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d"))
#today = str(datetime.now().strftime("%Y%m%d"))

#yesterday2 = ((datetime.now() - timedelta(days=1)))
#yesterday3 = (yesterday2 - timedelta(days=1))
#print(yesterday3)

current_day = "2018-05-18"


'''
These functions use the intra-day endpoint. 

CAUTION: Plan your calls wisely, or you will exceed 150 API calls per hour.

'''

#take a starting date and a number of additional days as an input
# day needs to be in YYYY-MM-DD format
def pullFitBitData(start_date, days, call_type):
    """Collect daily activity and heart-rate-zone frames, walking backwards in time.

    buildActivityData is called for start_date and then for each of the `days`
    preceding days, so days + 1 days are processed in total.

    Parameters
    ----------
    start_date : str
        Most recent day to pull, in YYYY-MM-DD format.
    days : int
        How many earlier days to pull in addition to start_date.
    call_type
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        (activity_df, heartRate_df) with the per-day frames stacked in the
        order processed (newest day first).
    """
    #insert date error checking laterz
    print("Processing day: {}".format(start_date))
    current_date = start_date
    first_activity_df, first_heartRate_df = buildActivityData(current_date)

    # Gather the per-day frames and concatenate once at the end; concatenating
    # inside the loop re-copies everything collected so far on every iteration.
    activity_frames = [first_activity_df]
    heartRate_frames = [first_heartRate_df]

    remaining_days = days
    while remaining_days > 0:
        current_date = (datetime.datetime.strptime(current_date, "%Y-%m-%d") - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        print("Processing day: {}".format(current_date))
        day_activity_df, day_heartRate_df = buildActivityData(current_date)
        activity_frames.append(day_activity_df)
        heartRate_frames.append(day_heartRate_df)
        remaining_days -= 1

    print("Ended processing on {}.".format(current_date))
    return pd.concat(activity_frames), pd.concat(heartRate_frames)
        

In [None]:
with open('sleep_time_events_detail.txt', 'w') as outfile:
    json.dump(sleep_time_events_detail, outfile)

In [None]:
fit_statsHR = auth2_client.intraday_time_series('activities/heart', base_date=current_day, detail_level='15min')
heartRateZones = (fit_statsHR['activities-heart'][0]['value']['heartRateZones'])

'''
column_names = heartRatePivot_df.columns.values
new_column_names = []
for name in column_names:
    new_name = name[1].replace(' ', '_')+'.'+name[0]
    new_column_names.append(new_name)
heartRatePivot_df.columns = new_column_names
print(heartRatePivot_df)
'''


def accumulateHeartData(current_day, heartRateZones):
    """Pivot one day's heart-rate zones into a single wide row.

    Parameters
    ----------
    current_day : str
        Label for the day; it becomes the index value of the returned row.
    heartRateZones : list of dict
        One record per zone. Each record needs a 'name'; every other key
        becomes a column per zone. Zone names must be unique.

    Returns
    -------
    pandas.DataFrame
        One row indexed by current_day, with a column for every
        (zone, field) pair named "<zone name with spaces as underscores>.<field>",
        e.g. "Fat_Burn.minutes".
    """
    heartRateZones_df = pd.DataFrame.from_records(heartRateZones)
    # Assigning the scalar labels every row, however many zones there are
    # (the previous hard-coded four-element list only covered four zones).
    heartRateZones_df['date'] = current_day
    heartRatePivot_df = heartRateZones_df.pivot(index='date', columns='name')
    # The pivot yields (field, zone name) column pairs; flatten them.
    heartRatePivot_df.columns = [
        zone_name.replace(' ', '_') + '.' + field
        for field, zone_name in heartRatePivot_df.columns.values
    ]
    return heartRatePivot_df

def buildActivityData(current_date):
    """Fetch one day's activity summary through the fitbit client.

    Calls auth2_client.activities for current_date, pretty-prints the whole
    response, and splits its 'summary' entry into two frames.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        activity_df: the summary without its 'distances' and 'heartRateZones'
        entries, as one row indexed by current_date.
        heartRate_df: the heart-rate zones as returned by accumulateHeartData.
    """
    response = auth2_client.activities(date=current_date)
    pprint(response)
    summary = response['summary']
    heartRate_df = accumulateHeartData(current_date, summary['heartRateZones'])
    # Both entries are removed from the summary before the one-row frame is built.
    for nested_key in ('distances', 'heartRateZones'):
        del summary[nested_key]
    activity_df = pd.DataFrame(summary, index=[current_date])
    return activity_df, heartRate_df

In [None]:
activity_df, heartRate_df = buildActivityData("2018-05-18")

In [None]:
print(activity_df)

In [None]:
print(heartRate_df)

In [None]:
fit_statsHR = auth2_client.intraday_time_series('activities/heart', base_date=current_day, detail_level='15min')
pprint(fit_statsHR)