# My 2022 Steps Analysis

This notebook shows the analysis of the steps I took in the last 5 months of 2022.

### Objectives of the analysis

The objectives of the analysis are to:
    1. Determine if steps taken vary with day of the week or not.
    2. Access whether steps taken vary with time of day or not.
    3. Investigate whether steps taken vary with district of work in that week or not, 
    4. Use the factors of steps taken to predict the steps for typical days in 2023.

In [1]:
# Importing the necesssary libraries
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

# Setting the styles,
# Note: add semi-colon after the last line of plot code to generate figure without the disturbing numbers
%matplotlib inline
sns.set_style('darkgrid')

In [113]:
# Loading the dataset
steps_22 = pd.read_csv('DetailedSteps_2023_01_01_0402.csv')

In [114]:
# Viewing Examining the steps_22 dataframe
steps_22.head(10)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds)
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180


In [115]:
# Getting the summary statistics of the dataframe
steps_22.describe()

Unnamed: 0,Steps,Calories,Distance(meters),ActiveTime(seconds)
count,2534.0,2534.0,2534.0,2534.0
mean,438.921863,18.013418,319.551302,292.68824
std,377.051585,14.135707,275.123989,247.862066
min,1.0,0.0,0.0,60.0
25%,139.0,6.0,101.0,60.0
50%,312.5,14.0,227.5,180.0
75%,646.0,26.75,468.75,420.0
max,1805.0,63.0,1321.0,900.0


In [116]:
# Adding necessary columns to the datafarme for more comprehensive analysis
# 1. Categorizing the steps into medium, high, and very high
step_bins = list(steps_22.Steps.describe())
step_bins
steps_22['steps_category'] = pd.cut(steps_22['Steps'], 
                                    bins=[step_bins[3], step_bins[4], step_bins[5], step_bins[6], step_bins[7]],
                                   labels= ['low', 'medium', 'high', 'very_high'])

In [117]:
# 2. Converting the active seconds into minutes
steps_22['active_minutes'] = round(steps_22['ActiveTime(seconds)'] / 60, 0)
steps_22.head(10)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds),steps_category,active_minutes
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300,high,5.0
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360,high,6.0
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120,low,2.0
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120,medium,2.0
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60,low,1.0
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60,medium,1.0
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600,very_high,10.0
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480,high,8.0
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180,medium,3.0
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180,medium,3.0


In [118]:
# 3. Creating Multiple columns from timestamp processing
# First, I convert the timestamp string to a datetime object
steps_22['StartTime'] = pd.to_datetime(steps_22['StartTime'], format='%Y-%m-%d %H:%M:%S')
steps_22['EndTime'] = pd.to_datetime(steps_22['EndTime'], format='%Y-%m-%d %H:%M:%S')
steps_22['weekday'] = steps_22['StartTime'].apply(lambda x: x.weekday())
steps_22['day_name'] = steps_22['StartTime'].dt.day_name()
steps_22['day_date'] = steps_22['StartTime'].dt.date
steps_22['month'] = steps_22['StartTime'].dt.month
steps_22['month_name'] = steps_22['StartTime'].dt.month_name()
steps_22['hour_of_day'] = steps_22['StartTime'].dt.hour

hour_bins = [0,4,8,12,16,20,24]
hour_labels = ['Late Night', 'Early Morning', 'Morning', 'Noon', 'Eve', 'Night']
steps_22['session'] = pd.cut(steps_22['hour_of_day'], 
                                    bins= hour_bins,
                                   labels= hour_labels, include_lowest=True)
steps_22['week_of_year'] = steps_22['StartTime'].dt.strftime('%U')

In [105]:
#steps_22.drop('work_location', inplace=True, axis=1)

In [121]:
steps_22.shape

(2534, 16)

In [122]:
steps_22.to_csv('processed_stage1.csv', index=False)

In [2]:
steps_22 = pd.read_csv('processed_stage1.csv')
steps_22.head(15)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds),steps_category,active_minutes,weekday,day_name,day_date,month,month_name,hour_of_day,session,week_of_year
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300,high,5.0,2,Wednesday,2022-08-03,8,August,20,Eve,31
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360,high,6.0,2,Wednesday,2022-08-03,8,August,20,Eve,31
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120,low,2.0,2,Wednesday,2022-08-03,8,August,22,Night,31
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120,medium,2.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60,low,1.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60,medium,1.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600,very_high,10.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480,high,8.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180,medium,3.0,3,Thursday,2022-08-04,8,August,9,Morning,31
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180,medium,3.0,3,Thursday,2022-08-04,8,August,9,Morning,31


In [9]:
# 4. Adding work location using apply to apply a custom function
#work_week = [31,32,40,42,43,46,48]
#def work_locate(week_of_year):
    #if week_of_year in work_week:
        #print('here')
        #return 'baruten'
    #else:
        #print('here now')
        #return 'ilorin'
#steps_22['work_location'] = steps_22['week_of_year'].map(work_locate)


In [10]:
steps_22.work_location.value_counts()

ilorin     1661
baruten     873
Name: work_location, dtype: int64

In [11]:
steps_22.to_csv('processed_stage2.csv', index=False)

In [None]:
#

In [47]:
# Code to pull episodes in a day together
#steps_22.resample('D', on='StartTime').sum()

Unnamed: 0_level_0,Steps,Calories,Distance(meters),ActiveTime(seconds),active_minutes,weekday
StartTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-03,990,32,701,780,13.0,6
2022-08-04,12814,530,9077,8640,144.0,90
2022-08-05,7046,284,4992,4440,74.0,72
2022-08-06,6280,250,4450,4020,67.0,50
2022-08-07,7755,313,5497,5220,87.0,96
...,...,...,...,...,...,...
2022-12-28,6918,269,5058,4200,70.0,22
2022-12-29,1730,78,1262,1320,22.0,30
2022-12-30,2189,94,1598,1500,25.0,32
2022-12-31,6892,268,5037,4380,73.0,75


42    188
39    181
47    170
46    157
45    155
41    150
36    134
38    130
32    125
49    122
44    119
40    116
48    115
43    111
50     99
34     82
51     75
37     74
52     72
33     61
31     61
35     36
01      1
Name: week_of_year, dtype: int64

ilorin     1661
baruten     873
Name: work_location, dtype: int64