# My 2022 Steps Analysis

This notebook shows the analysis of the steps I took in the last 5 months of 2022.

### Objectives of the analysis

The objectives of the analysis are to:
    1. Determine if steps taken vary with day of the week or not.
    2. Access whether steps taken vary with time of day or not.
    3. Investigate whether steps taken vary with district of work in that week or not, 
    4. Use the factors of steps taken to predict the steps for typical days in 2023.

In [1]:
# Importing the necesssary libraries
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

# Setting the styles,
# Note: add semi-colon after the last line of plot code to generate figure without the disturbing numbers
%matplotlib inline
sns.set_style('darkgrid')

In [113]:
# Loading the dataset
steps_22 = pd.read_csv('DetailedSteps_2023_01_01_0402.csv')

In [114]:
# Viewing Examining the steps_22 dataframe
steps_22.head(10)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds)
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180


In [115]:
# Getting the summary statistics of the dataframe
steps_22.describe()

Unnamed: 0,Steps,Calories,Distance(meters),ActiveTime(seconds)
count,2534.0,2534.0,2534.0,2534.0
mean,438.921863,18.013418,319.551302,292.68824
std,377.051585,14.135707,275.123989,247.862066
min,1.0,0.0,0.0,60.0
25%,139.0,6.0,101.0,60.0
50%,312.5,14.0,227.5,180.0
75%,646.0,26.75,468.75,420.0
max,1805.0,63.0,1321.0,900.0


## Cleaning the data

In [116]:
# Adding necessary columns to the datafarme for more comprehensive analysis
# 1. Categorizing the steps into medium, high, and very high
step_bins = list(steps_22.Steps.describe())
step_bins
steps_22['steps_category'] = pd.cut(steps_22['Steps'], 
                                    bins=[step_bins[3], step_bins[4], step_bins[5], step_bins[6], step_bins[7]],
                                   labels= ['low', 'medium', 'high', 'very_high'])

In [117]:
# 2. Converting the active seconds into minutes
steps_22['active_minutes'] = round(steps_22['ActiveTime(seconds)'] / 60, 0)
steps_22.head(10)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds),steps_category,active_minutes
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300,high,5.0
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360,high,6.0
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120,low,2.0
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120,medium,2.0
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60,low,1.0
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60,medium,1.0
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600,very_high,10.0
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480,high,8.0
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180,medium,3.0
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180,medium,3.0


In [22]:
# 3. Creating Multiple columns from timestamp processing
# First, I convert the timestamp string to a datetime object
steps_22['StartTime'] = pd.to_datetime(steps_22['StartTime'], format='%Y-%m-%d %H:%M:%S')
steps_22['EndTime'] = pd.to_datetime(steps_22['EndTime'], format='%Y-%m-%d %H:%M:%S')
steps_22['weekday'] = steps_22['StartTime'].apply(lambda x: x.weekday())
steps_22['day_name'] = steps_22['StartTime'].dt.day_name()
steps_22['day_date'] = steps_22['StartTime'].dt.date
steps_22['month'] = steps_22['StartTime'].dt.month
steps_22['month_name'] = steps_22['StartTime'].dt.month_name()
steps_22['hour_of_day'] = steps_22['StartTime'].dt.hour

hour_bins = [0,4,8,12,16,20,24]
hour_labels = ['Late Night', 'Early Morning', 'Morning', 'Noon', 'Eve', 'Night']
steps_22['session'] = pd.cut(steps_22['hour_of_day'], 
                                    bins= hour_bins,
                                   labels= hour_labels, include_lowest=True)
steps_22['week_of_year'] = steps_22['StartTime'].dt.strftime('%U')

In [105]:
#steps_22.drop('work_location', inplace=True, axis=1)

In [121]:
steps_22.shape

(2534, 16)

In [23]:
steps_22.to_csv('processed_stage1.csv', index=False)

In [24]:
steps_22 = pd.read_csv('processed_stage1.csv')
steps_22.head(15)

Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds),steps_category,active_minutes,weekday,day_name,day_date,month,month_name,hour_of_day,session,week_of_year,work_location
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300,high,5.0,2,Wednesday,2022-08-03,8,August,20,Eve,31,baruten
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360,high,6.0,2,Wednesday,2022-08-03,8,August,20,Eve,31,baruten
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120,low,2.0,2,Wednesday,2022-08-03,8,August,22,Night,31,baruten
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120,medium,2.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31,baruten
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60,low,1.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31,baruten
5,2022-08-04 08:00:00,2022-08-04 08:14:59,182,9,129,60,medium,1.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31,baruten
6,2022-08-04 08:15:00,2022-08-04 08:29:59,970,37,688,600,very_high,10.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31,baruten
7,2022-08-04 08:30:00,2022-08-04 08:44:59,639,25,453,480,high,8.0,3,Thursday,2022-08-04,8,August,8,Early Morning,31,baruten
8,2022-08-04 09:30:00,2022-08-04 09:44:59,187,9,132,180,medium,3.0,3,Thursday,2022-08-04,8,August,9,Morning,31,baruten
9,2022-08-04 09:45:00,2022-08-04 09:59:59,287,13,203,180,medium,3.0,3,Thursday,2022-08-04,8,August,9,Morning,31,baruten


In [9]:
# 4. Adding work location using apply to apply a custom function
#work_week = [31,32,40,42,43,46,48]
#def work_locate(week_of_year):
    #if week_of_year in work_week:
        #print('here')
        #return 'baruten'
    #else:
        #print('here now')
        #return 'ilorin'
#steps_22['work_location'] = steps_22['week_of_year'].map(work_locate)


In [25]:
steps_22.work_location.value_counts()

ilorin     1661
baruten     873
Name: work_location, dtype: int64

In [47]:
steps_22.to_csv('processed_stage2.csv', index=False)

In [50]:
new_step = pd.read_csv('processed_stage2.csv')
print(new_step.shape)
new_step.head()

(2534, 17)


Unnamed: 0,StartTime,EndTime,Steps,Calories,Distance(meters),ActiveTime(seconds),steps_category,active_minutes,weekday,day_name,day_date,month,month_name,hour_of_day,session,week_of_year,work_location
0,2022-08-03 20:30:00,2022-08-03 20:44:59,362,11,256,300,high,5.0,2,Wednesday,2022-08-03,8,August,20,Eve,31,baruten
1,2022-08-03 20:45:00,2022-08-03 20:59:59,501,15,355,360,high,6.0,2,Wednesday,2022-08-03,8,August,20,Eve,31,baruten
2,2022-08-03 22:00:00,2022-08-03 22:14:59,127,6,90,120,low,2.0,2,Wednesday,2022-08-03,8,August,22,Night,31,baruten
3,2022-08-04 07:15:00,2022-08-04 07:29:59,176,9,124,120,medium,2.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31,baruten
4,2022-08-04 07:45:00,2022-08-04 07:59:59,130,8,92,60,low,1.0,3,Thursday,2022-08-04,8,August,7,Early Morning,31,baruten


In [51]:
for i,v in enumerate(new_step.columns):
    print(i,v)

0 StartTime
1 EndTime
2 Steps
3 Calories
4 Distance(meters)
5 ActiveTime(seconds)
6 steps_category
7 active_minutes
8 weekday
9 day_name
10 day_date
11 month
12 month_name
13 hour_of_day
14 session
15 week_of_year
16 work_location


In [52]:
# Pulling episodes in a day together
in_columns = ['StartTime', 'EndTime', 'day_date', 'Steps', 'Calories',
              'Distance(meters)', 'ActiveTime(seconds)', 'active_minutes', 'day_name',
              'month_name', 'work_location']
step_episodes = steps_22[in_columns].copy()

In [58]:
# Setting the index to the date of the day
step_episodes.set_index('day_date', inplace=True)

In [61]:
# Converting the index to datetime object
step_episodes.index = pd.to_datetime(step_episodes.index)

In [65]:
# Summing the quantitative for each day
daily_steps = step_episodes.resample('D').sum()

In [68]:
# Saving the resulting dataframe to file
daily_steps.to_csv('total_daily_steps.csv')


In [77]:
# Creating the hour_spent column
daily['hour_spent'] = round(daily['active_minutes'] / 60, 0)

In [82]:
# Resaving the final dataframe
daily.to_csv('total_daily_steps.csv', index=False)

## Exploratory Data Analysis for the two files