In [641]:
#### Author : Maximiliano Lopez Salgado
#### First commit: 2023-05-05
#### Last commit: 2023-05-12
#### Description: This notebook is used to explore the bike data set

<center><h1>Data Wrangling</center></h1>

In [642]:
# import ML libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import math
import folium
import geopy
from geopy.geocoders import Nominatim
import time
from geopy.exc import GeocoderUnavailable
from geopy.extra.rate_limiter import RateLimiter
import branca.colormap as cm
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime

In [643]:
# import day dataset from csv file and store it in a dataframe named day
day = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/day.csv')

# import hour dataset from csv file and store it in a dataframe named hour
hour = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/hour.csv')

# print the first 5 rows of the day dataset
display(day.head())

# print the first 5 rows of the hour dataset
display(hour.head())

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [644]:
# Check the shape of datasets and column names for day and hour datasets 
print('Shape of day dataset: ', day.shape)
print('Shape of hour dataset: ', hour.shape)

Shape of day dataset:  (731, 16)
Shape of hour dataset:  (17379, 17)


In [645]:
# look out for missing values in day dataset
print('Missing values in day dataset: ', day.isnull().sum())

# look out for missing values in hour dataset
print('Missing values in hour dataset: ', hour.isnull().sum())

Missing values in day dataset:  instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64
Missing values in hour dataset:  instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64


In [646]:
# check the data types for day dataset
print('Data types for day dataset: ', day.dtypes)

# check the data types for hour dataset
print('Data types for hour dataset: ', hour.dtypes)

Data types for day dataset:  instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object
Data types for hour dataset:  instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object


In [647]:
# check the unique values for each column in day dataset
print('Unique values for each column in day dataset: ', day.nunique())

# check the unique values for each column in hour dataset
print('Unique values for each column in hour dataset: ', hour.nunique())

Unique values for each column in day dataset:  instant       731
dteday        731
season          4
yr              2
mnth           12
holiday         2
weekday         7
workingday      2
weathersit      3
temp          499
atemp         690
hum           595
windspeed     650
casual        606
registered    679
cnt           696
dtype: int64
Unique values for each column in hour dataset:  instant       17379
dteday          731
season            4
yr                2
mnth             12
hr               24
holiday           2
weekday           7
workingday        2
weathersit        4
temp             50
atemp            65
hum              89
windspeed        30
casual          322
registered      776
cnt             869
dtype: int64


In [648]:
#### Checking the descriptive statistics is now not necessary because the data is normalized.

In [649]:
# check the descriptive statistics for day dataset
# print('Descriptive statistics for day dataset: ', day.describe())

# check the descriptive statistics for hour dataset
# print('Descriptive statistics for hour dataset: ', hour.describe())

In [650]:
# rename the hour dataset to bike
bike = hour

In [651]:
# change the data type for dteday column to datetime
bike['dteday'] = pd.to_datetime(bike['dteday'])

# check the data types for the concatenated dataset
print('Data types for concatenated dataset: ', bike.dtypes)

Data types for concatenated dataset:  instant                int64
dteday        datetime64[ns]
season                 int64
yr                     int64
mnth                   int64
hr                     int64
holiday                int64
weekday                int64
workingday             int64
weathersit             int64
temp                 float64
atemp                float64
hum                  float64
windspeed            float64
casual                 int64
registered             int64
cnt                    int64
dtype: object


In [652]:
# change the name of the columns in the concatenated dataset to make them more readable
bike.rename(columns={'dteday':'date','yr':'year', 'mnth':'month', 'hr':'hour', 'weathersit':'weather', 'casual':'casual_user', 'registered':'registered_user', 'hum':'humidity', 'cnt':'count', 'atemp':'apparent_temp'}, inplace=True)

# check the column names for the concatenated dataset
print('Column names for concatenated dataset: ', bike.columns)

Column names for concatenated dataset:  Index(['instant', 'date', 'season', 'year', 'month', 'hour', 'holiday',
       'weekday', 'workingday', 'weather', 'temp', 'apparent_temp', 'humidity',
       'windspeed', 'casual_user', 'registered_user', 'count'],
      dtype='object')


In [653]:
# rename names and values in columns to make them look human readable
# change the values of holiday column to month names
bike.loc[bike['holiday'] == 0, 'holiday'] = 'No'
bike.loc[bike['holiday'] == 1, 'holiday'] = 'Yes'

# change the values of season column to month names

bike.loc[bike['season'] == 1, 'season'] = 'Winter'
bike.loc[bike['season'] == 2, 'season'] = 'Spring'
bike.loc[bike['season'] == 3, 'season'] = 'Summer'
bike.loc[bike['season'] == 4, 'season'] = 'Fall'

# change the values of workingday column to month names

bike.loc[bike['workingday'] == 0, 'workingday'] = 'No'
bike.loc[bike['workingday'] == 1, 'workingday'] = 'Yes'

# change the values of weather column to month names

bike.loc[bike['weather'] == 1, 'weather'] = 'Clear'
bike.loc[bike['weather'] == 2, 'weather'] = 'Mist-Cloudy'
bike.loc[bike['weather'] == 3, 'weather'] = 'Light-Rain'
bike.loc[bike['weather'] == 4, 'weather'] = 'Heavy-Rain'

# change the values of weekday column to month names

bike.loc[bike['weekday'] == 0, 'weekday'] = 'Monday'
bike.loc[bike['weekday'] == 1, 'weekday'] = 'Tuesday'
bike.loc[bike['weekday'] == 2, 'weekday'] = 'Wednesday'
bike.loc[bike['weekday'] == 3, 'weekday'] = 'Thursday'
bike.loc[bike['weekday'] == 4, 'weekday'] = 'Friday'
bike.loc[bike['weekday'] == 5, 'weekday'] = 'Saturday'
bike.loc[bike['weekday'] == 6, 'weekday'] = 'Sunday'

# temp column: normalize temperature in celsius. Formula: (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (from dataset description)
t_min = -8  # Minimum temperature value in Celsius
t_max = 39  # Maximum temperature value in Celsius

# Convert normalized temperature values back to Celsius
bike['temp'] = bike['temp'].apply(lambda x: (x*(t_max-t_min))+t_min)

# display the range of values for temp column
display(print('Range of values for temp column: ', bike['temp'].min(), bike['temp'].max()))

# atemp column: normalize feeling temperature in celsius. Formula: (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (from dataset description)
at_min=-16
at_max=+50

# Convert normalized feeling temperature values back to Celsius
bike['apparent_temp'] = bike['apparent_temp'].apply(lambda x: (x*(at_max-at_min))+at_min)

# display the range of values for atemp column
display(print('Range of values for atemp column: ', bike['apparent_temp'].min(), bike['apparent_temp'].max()))

# hum column: normalize humidity. Formula: (hum-hum_min)/(hum_max-hum_min), hum_min=0, hum_max=1 (from dataset description)
bike['humidity'] = bike['humidity'].apply(lambda x: x/100)

# windspeed column: Normalized wind speed. The values are divided to 67 (max) (from dataset description)
windspeed_min=0
windspeed_max=0.85

# Convert normalized windspeed values back to not normalized values (km/h) The values are divided to 67 (from dataset description)       
bike['windspeed'] = bike['windspeed'].apply(lambda x: (x * 67))

# display the range of values for windspeed column
display(print('Range of values for windspeed column: ', bike['windspeed'].min(), bike['windspeed'].max()))

# change the values of humidity column to percentage
bike['humidity'] = bike['humidity'].apply(lambda x: x*100)

# print the range of values for humidity column
display(print('Range of values for humidity column: ', bike['humidity'].min(), bike['humidity'].max()))

# change the values of month column to month names

bike.loc[bike['month'] == 1, 'month'] = 'January'
bike.loc[bike['month'] == 2, 'month'] = 'February'
bike.loc[bike['month'] == 3, 'month'] = 'March'
bike.loc[bike['month'] == 4, 'month'] = 'April'
bike.loc[bike['month'] == 5, 'month'] = 'May'
bike.loc[bike['month'] == 6, 'month'] = 'June'
bike.loc[bike['month'] == 7, 'month'] = 'July'
bike.loc[bike['month'] == 8, 'month'] = 'August'
bike.loc[bike['month'] == 9, 'month'] = 'September'
bike.loc[bike['month'] == 10, 'month'] = 'October'
bike.loc[bike['month'] == 11, 'month'] = 'November'
bike.loc[bike['month'] == 12, 'month'] = 'December'

bike.loc[bike['year'] == 0, 'year'] = '2011'
bike.loc[bike['year'] == 1, 'year'] = '2012'


# check the unique values for each column in bike dataset
display(print('Unique values for each column in bike dataset: ', bike.nunique()))

# print the first 5 rows of the bike dataset to check the changes made to the columns names and values
display(bike.head())

Range of values for temp column:  -7.06 39.0


None

Range of values for atemp column:  -16.0 50.0


None

Range of values for windspeed column:  0.0 56.996900000000004


None

Range of values for humidity column:  0.0 1.0


None

Unique values for each column in bike dataset:  instant            17379
date                 731
season                 4
year                   2
month                 12
hour                  24
holiday                2
weekday                7
workingday             2
weather                4
temp                  50
apparent_temp         65
humidity              89
windspeed             30
casual_user          322
registered_user      776
count                869
dtype: int64


None

Unnamed: 0,instant,date,season,year,month,hour,holiday,weekday,workingday,weather,temp,apparent_temp,humidity,windspeed,casual_user,registered_user,count
0,1,2011-01-01,Winter,2011,January,0,No,Sunday,No,Clear,3.28,3.0014,0.81,0.0,3,13,16
1,2,2011-01-01,Winter,2011,January,1,No,Sunday,No,Clear,2.34,1.9982,0.8,0.0,8,32,40
2,3,2011-01-01,Winter,2011,January,2,No,Sunday,No,Clear,2.34,1.9982,0.8,0.0,5,27,32
3,4,2011-01-01,Winter,2011,January,3,No,Sunday,No,Clear,3.28,3.0014,0.75,0.0,3,10,13
4,5,2011-01-01,Winter,2011,January,4,No,Sunday,No,Clear,3.28,3.0014,0.75,0.0,0,1,1


In [654]:
# check the descriptive statistics for the bike and hours dataset
display(bike.describe())
display(hour.describe())

Unnamed: 0,instant,hour,temp,apparent_temp,humidity,windspeed,casual_user,registered_user,count
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,11.546752,15.358397,15.401157,0.627229,12.73654,35.676218,153.786869,189.463088
std,5017.0295,6.914405,9.050138,11.342114,0.19293,8.196795,49.30503,151.357286,181.387599
min,1.0,0.0,-7.06,-16.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,6.0,7.98,5.9978,0.48,7.0015,4.0,34.0,40.0
50%,8690.0,12.0,15.5,15.9968,0.63,12.998,17.0,115.0,142.0
75%,13034.5,18.0,23.02,24.9992,0.78,16.9979,48.0,220.0,281.0
max,17379.0,23.0,39.0,50.0,1.0,56.9969,367.0,886.0,977.0


Unnamed: 0,instant,hour,temp,apparent_temp,humidity,windspeed,casual_user,registered_user,count
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,11.546752,15.358397,15.401157,0.627229,12.73654,35.676218,153.786869,189.463088
std,5017.0295,6.914405,9.050138,11.342114,0.19293,8.196795,49.30503,151.357286,181.387599
min,1.0,0.0,-7.06,-16.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,6.0,7.98,5.9978,0.48,7.0015,4.0,34.0,40.0
50%,8690.0,12.0,15.5,15.9968,0.63,12.998,17.0,115.0,142.0
75%,13034.5,18.0,23.02,24.9992,0.78,16.9979,48.0,220.0,281.0
max,17379.0,23.0,39.0,50.0,1.0,56.9969,367.0,886.0,977.0


In [655]:
# transform the data type for the columns in the bike dataset into category in order to make them usable for the future analysis
bike['year'] = bike['year'].astype('datetime64[ns]')
bike['month'] = bike['month'].astype('category')
bike['weekday'] = bike['weekday'].astype('category')
bike['season'] = bike['season'].astype('category')
bike['weather'] = bike['weather'].astype('category')
bike['holiday'] = bike['holiday'].astype('category')
bike['workingday'] = bike['workingday'].astype('category')

In [656]:
# create a new column name called day_period to classify the hour in the dataset‚
# create a function to classify the hour
def day_period(hour):
    if hour >= 0 and hour <= 6:
        return 'Dawn'
    elif hour > 6 and hour <= 12:
        return 'Morning'
    elif hour > 12 and hour <= 18:
        return 'Afternoon'
    else:
        return 'Night'

# apply the function to the column hour
bike['day_period'] = bike['hour'].apply(day_period)

In [657]:
# Create a new column name called warmness to classify the apparent_temperature in the dataset
# create a function to classify the apparent_temperature
def warmness(apparent_temp):
    if apparent_temp <= 12:
        return 'Cold'
    elif apparent_temp > 12 and apparent_temp <= 20:
        return 'Warm'
    elif apparent_temp > 20 and apparent_temp <= 30:
        return 'Hot'
    else:
        return 'Very Hot'
    
# apply the function to the column apparent_temp
bike['warmness'] = bike['apparent_temp'].apply(warmness)

# check the values of the column warmness
print('Values of the column warmness: ', bike['warmness'].unique())

Values of the column warmness:  ['Cold' 'Warm' 'Hot' 'Very Hot']


In [658]:
# Create a new column name called humidity_level to classify the humidity in the dataset
# create a function to classify the humidity
def humidity_level(humidity):
    if humidity <= 0.40:
        return 'Low'
    elif humidity > 0.40 and humidity <= 0.70:
        return 'Medium'
    else:
        return 'High'
    
# apply the function to the column humidity
bike['humidity_level'] = bike['humidity'].apply(humidity_level)

# check the values of the column humidity_level
print('Values of the column humidity_level: ', bike['humidity_level'].unique())

Values of the column humidity_level:  ['High' 'Medium' 'Low']


In [659]:
# create a function to classify the windspeed
def windspeed_level(windspeed):
    if windspeed <= 10:
        return 'Low'
    elif windspeed > 10 and windspeed <= 20:
        return 'Medium'
    else:
        return 'High'

# apply the function to the column windspeed
bike['windspeed_level'] = bike['windspeed'].apply(windspeed_level)

# check the values of the column windspeed_level
print('Values of the column windspeed_level: ', bike['windspeed_level'].unique())

Values of the column windspeed_level:  ['Low' 'Medium' 'High']


In [660]:
# export the bike dataset to a csv file to the datasets folder
bike.to_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/bike_clean.csv', index=False)

In [661]:
### Import and analize datasets with geographical information

In [662]:
# import the other bike datasets with the trip history data
bike_2011 = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/datasets/2011-capitalbikeshare-tripdata.csv')
bike_2012_q1 = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/datasets/2012Q1-capitalbikeshare-tripdata.csv')
bike_2012_q2 = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/datasets/2012Q2-capitalbikeshare-tripdata.csv')
bike_2012_q3 = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/datasets/2012Q3-capitalbikeshare-tripdata.csv')
bike_2012_q4 = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/datasets/2012Q4-capitalbikeshare-tripdata.csv')
# explore the first 5 rows of the bike_2011 dataset
display(bike_2011.head())

# explore the first 5 rows of the bike_2012_q1 dataset
display(bike_2012_q1.head())

# explore the first 5 rows of the bike_2012_q2 dataset
display(bike_2012_q2.head())

# explore the first 5 rows of the bike_2012_q3 dataset
display(bike_2012_q3.head())

# explore the first 5 rows of the bike_2012_q4 dataset
display(bike_2012_q4.head())

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,475,2012-01-01 00:04:00,2012-01-01 00:11:56,31245,7th & R St NW / Shaw Library,31109,7th & T St NW,W01412,Member
1,1162,2012-01-01 00:10:05,2012-01-01 00:29:28,31400,Georgia & New Hampshire Ave NW,31103,16th & Harvard St NW,W00524,Casual
2,1145,2012-01-01 00:10:23,2012-01-01 00:29:28,31400,Georgia & New Hampshire Ave NW,31103,16th & Harvard St NW,W00235,Member
3,485,2012-01-01 00:15:41,2012-01-01 00:23:46,31101,14th & V St NW,31602,Park Rd & Holmead Pl NW,W00864,Member
4,471,2012-01-01 00:15:42,2012-01-01 00:23:34,31102,11th & Kenyon St NW,31109,7th & T St NW,W00995,Member


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,192,2012-04-01 00:01:13,2012-04-01 00:04:26,31237,25th St & Pennsylvania Ave NW,31212,21st & M St NW,W00663,Member
1,978,2012-04-01 00:01:49,2012-04-01 00:18:08,31225,C & O Canal & Wisconsin Ave NW,31228,8th & H St NW,W00574,Member
2,843,2012-04-01 00:03:25,2012-04-01 00:17:28,31230,Metro Center / 12th & G St NW,31201,15th & P St NW,W00201,Member
3,408,2012-04-01 00:03:37,2012-04-01 00:10:26,31110,20th St & Florida Ave NW,31239,Rhode Island & Connecticut Ave NW,W01060,Casual
4,163,2012-04-01 00:04:13,2012-04-01 00:06:57,31104,Adams Mill & Columbia Rd NW,31112,Harvard St & Adams Mill Rd NW,W01071,Casual


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,420,2012-07-01 00:00:08,2012-07-01 00:07:09,31104,Adams Mill & Columbia Rd NW,31200,Massachusetts Ave & Dupont Circle NW,W01155,Member
1,268,2012-07-01 00:00:31,2012-07-01 00:05:00,31616,3rd & H St NE,31615,6th & H St NE,W00405,Member
2,494,2012-07-01 00:00:32,2012-07-01 00:08:46,31307,3000 Connecticut Ave NW / National Zoo,31113,Columbia Rd & Belmont St NW,W00942,Member
3,655,2012-07-01 00:01:34,2012-07-01 00:12:29,31007,Crystal City Metro / 18th & Bell St,31002,20th & Crystal Dr,W01121,Member
4,1681,2012-07-01 00:02:06,2012-07-01 00:30:08,31235,19th St & Constitution Ave NW,31222,New York Ave & 15th St NW,W01039,Casual


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,611,2012-10-01 00:02:06,2012-10-01 00:12:17,31230,Metro Center / 12th & G St NW,31212,21st & M St NW,W00311,Member
1,612,2012-10-01 00:02:52,2012-10-01 00:13:05,31624,North Capitol St & F St NW,31251,12th & L St NW,W00137,Member
2,734,2012-10-01 00:06:34,2012-10-01 00:18:48,31234,20th & O St NW / Dupont South,31223,Convention Center / 7th & M St NW,W20125,Member
3,219,2012-10-01 00:07:27,2012-10-01 00:11:06,31106,Calvert & Biltmore St NW,31104,Adams Mill & Columbia Rd NW,W01451,Member
4,924,2012-10-01 00:07:28,2012-10-01 00:22:53,31109,7th & T St NW,31615,6th & H St NE,W00441,Member


In [663]:
# check the shape of the bike_2011 dataset
display(print('Shape of the bike_2011 dataset: ', bike_2011.shape))

# check the shape of the bike_2012_q1 dataset
display(print('Shape of the bike_2012_q1 dataset: ', bike_2012_q1.shape))

# check the shape of the bike_2012_q2 dataset
display(print('Shape of the bike_2012_q2 dataset: ', bike_2012_q2.shape))

# check the shape of the bike_2012_q3 dataset
display(print('Shape of the bike_2012_q3 dataset: ', bike_2012_q3.shape))

# check the shape of the bike_2012_q4 dataset
display(print('Shape of the bike_2012_q4 dataset: ', bike_2012_q4.shape))

Shape of the bike_2011 dataset:  (1226767, 9)


None

Shape of the bike_2012_q1 dataset:  (361317, 9)


None

Shape of the bike_2012_q2 dataset:  (566832, 9)


None

Shape of the bike_2012_q3 dataset:  (629917, 9)


None

Shape of the bike_2012_q4 dataset:  (470845, 9)


None

In [664]:
# check the columns of the bike_2011 dataset
display(print('Columns of the bike_2011 dataset: ', bike_2011.columns))

# check the columns of the bike_2012_q1 dataset
display(print('Columns of the bike_2012_q1 dataset: ', bike_2012_q1.columns))

# check the columns of the bike_2012_q2 dataset
display(print('Columns of the bike_2012_q2 dataset: ', bike_2012_q2.columns))

# check the columns of the bike_2012_q3 dataset
display(print('Columns of the bike_2012_q3 dataset: ', bike_2012_q3.columns))

# check the columns of the bike_2012_q4 dataset
display(print('Columns of the bike_2012_q4 dataset: ', bike_2012_q4.columns))

Columns of the bike_2011 dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

Columns of the bike_2012_q1 dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

Columns of the bike_2012_q2 dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

Columns of the bike_2012_q3 dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

Columns of the bike_2012_q4 dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

In [665]:
# check the data types of the bike_2011 dataset
display(print('Data types of the bike_2011 dataset: ', bike_2011.dtypes))

# check the data types of the bike_2012_q1 dataset
display(print('Data types of the bike_2012_q1 dataset: ', bike_2012_q1.dtypes))

# check the data types of the bike_2012_q2 dataset
display(print('Data types of the bike_2012_q2 dataset: ', bike_2012_q2.dtypes))

# check the data types of the bike_2012_q3 dataset
display(print('Data types of the bike_2012_q3 dataset: ', bike_2012_q3.dtypes))

# check the data types of the bike_2012_q4 dataset
display(print('Data types of the bike_2012_q4 dataset: ', bike_2012_q4.dtypes))

Data types of the bike_2011 dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

Data types of the bike_2012_q1 dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

Data types of the bike_2012_q2 dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

Data types of the bike_2012_q3 dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

Data types of the bike_2012_q4 dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

In [666]:
# concatenate the bike_2011 and bike_2012_q1, bike_2012_q2, bike_2012_q3, bike_2012_q4 datasets
trip_history = pd.concat([bike_2011, bike_2012_q1, bike_2012_q2, bike_2012_q3, bike_2012_q4], axis=0)

# check the shape of the trip_history dataset
display(print('Shape of the trip_history dataset: ', trip_history.shape))

# check the columns of the trip_history dataset
display(print('Columns of the trip_history dataset: ', trip_history.columns))

# check the data types of the trip_history dataset
display(print('Data types of the trip_history dataset: ', trip_history.dtypes))

# check the first 5 rows of the trip_history dataset
display(trip_history.head())

Shape of the trip_history dataset:  (3255678, 9)


None

Columns of the trip_history dataset:  Index(['Duration', 'Start date', 'End date', 'Start station number',
       'Start station', 'End station number', 'End station', 'Bike number',
       'Member type'],
      dtype='object')


None

Data types of the trip_history dataset:  Duration                 int64
Start date              object
End date                object
Start station number     int64
Start station           object
End station number       int64
End station             object
Bike number             object
Member type             object
dtype: object


None

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual


In [667]:
# check if trip_history is a valid df object
display(type(trip_history))


pandas.core.frame.DataFrame

In [668]:
# check the missing values of the trip_history dataset
display(trip_history.isnull().sum())

Duration                0
Start date              0
End date                0
Start station number    0
Start station           0
End station number      0
End station             0
Bike number             0
Member type             0
dtype: int64

In [669]:
# count unique values of column start station and end station
display(trip_history['Start station'].nunique())
display(trip_history['End station'].nunique())

194

194

### Create a new df with the count values of the start and end stations

#### Start stations

In [670]:
# count the unique values of each start stations using groupby
display(trip_history.groupby('Start station').size())

# create a new df with the unique values of each start stations and the count of each start station
start_station = pd.DataFrame(trip_history.groupby('Start station').size().reset_index(name='counts'))

# check the first 5 rows of the start_station dataset
display(start_station.head())

Start station
10th & E St NW                    6280
10th & Monroe St NE               5423
10th & U St NW                   38149
10th St & Constitution Ave NW    33682
11th & F St NW                      38
                                 ...  
Wilson Blvd & Franklin Rd         3372
Wilson Blvd & N Edgewood St       2725
Wilson Blvd & N Quincy St         1831
Wilson Blvd & N Uhle St           7239
Wisconsin Ave & Newark St NW     15747
Length: 194, dtype: int64

Unnamed: 0,Start station,counts
0,10th & E St NW,6280
1,10th & Monroe St NE,5423
2,10th & U St NW,38149
3,10th St & Constitution Ave NW,33682
4,11th & F St NW,38


In [671]:
# check that the shape of the start_station dataset is correct
display(start_station.shape)

# check the missing values of the start_station dataset
display(start_station.isnull().sum())

# check the data types of the start_station dataset
display(start_station.dtypes)

# check the unique values of the start_station dataset
display(start_station.nunique())

(194, 2)

Start station    0
counts           0
dtype: int64

Start station    object
counts            int64
dtype: object

Start station    194
counts           193
dtype: int64

In [672]:
# add two new columns to the start_station dataset called 'Start station latitude' and 'Start station longitude'
start_station['Start station latitude'] = ''
start_station['Start station longitude'] = ''

# check the first 5 rows of the start_station dataset
display(start_station.head())


Unnamed: 0,Start station,counts,Start station latitude,Start station longitude
0,10th & E St NW,6280,,
1,10th & Monroe St NE,5423,,
2,10th & U St NW,38149,,
3,10th St & Constitution Ave NW,33682,,
4,11th & F St NW,38,,


In [673]:
# use the geopy library to get the latitude and longitude values of each start station
geolocator = Nominatim(user_agent="bike_sharing")
timeout = 5  # Increase timeout to 5 seconds
retry_attempts = 3  # Retry geocoding up to 3 times

for i in range(len(start_station)):
    for attempt in range(retry_attempts):
        try:
            location = geolocator.geocode(start_station['Start station'][i], timeout=timeout)
            if location is not None:
                start_station.at[i, 'Start station latitude'] = location.latitude
                start_station.at[i, 'Start station longitude'] = location.longitude
            break
        except GeocoderUnavailable:
            print(f"Geocoding attempt {attempt + 1} failed. Retrying in 1 second...")
            time.sleep(1)
    else:
        print(f"Geocoding failed after {retry_attempts} attempts. Skipping...")
        continue

# check the first 5 rows of the start_station dataset
display(start_station.head())

# check the missing values of the start_station dataset
display(start_station.isnull().sum())


Unnamed: 0,Start station,counts,Start station latitude,Start station longitude
0,10th & E St NW,6280,38.896136,-77.022843
1,10th & Monroe St NE,5423,38.932514,-76.992889
2,10th & U St NW,38149,38.917007,-77.024112
3,10th St & Constitution Ave NW,33682,38.891931,-77.02599
4,11th & F St NW,38,38.897206,-77.022971


Start station              0
counts                     0
Start station latitude     0
Start station longitude    0
dtype: int64

In [674]:
missing_stations = start_station[start_station['Start station latitude'].isnull() | start_station['Start station longitude'].isnull()]['Start station'].tolist()

print(missing_stations)




[]


In [675]:
# export the start_station dataset to a csv file to fill in the missing values manually
# start_station.to_csv('/Users/maximilianolopezsalgado/data_projects/bike_sharing/start_station.csv', index=False) --- commented out to avoid overwriting the csv file

# import the start_station dataset with the missing values filled in manually
start_stations_long_lat = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/start_station.csv', sep=';')

# check if there are missing values
display(start_stations_long_lat.isnull().sum())

# check the first 5 rows of the start_stations_long_lat dataset
display(start_stations_long_lat.head())

Start station              0
counts                     0
Start station latitude     0
Start station longitude    0
dtype: int64

Unnamed: 0,Start station,counts,Start station latitude,Start station longitude
0,10th & E St NW,6280,38.896136,-77.022843
1,10th & Monroe St NE,5423,38.932514,-76.992889
2,10th & U St NW,38149,38.917007,-77.024112
3,10th St & Constitution Ave NW,33682,38.891931,-77.02599
4,11th & F St NW,38,38.897206,-77.022971


In [676]:
# checking the data types of the latitude and longitude columns of the start_station dataset
display(start_stations_long_lat['Start station latitude'].dtypes)
display(start_stations_long_lat['Start station longitude'].dtypes)

dtype('float64')

dtype('float64')

In [677]:
# Create a heatmap layer using the start station coordinates and counts
heat_data = start_stations_long_lat[['Start station latitude', 'Start station longitude', 'counts']].values.tolist()
start_station_map = folium.Map(location=[38.9072, -77.0369], zoom_start=12)
HeatMap(heat_data).add_to(start_station_map)

# Add markers to the map with the count number of each start station
for lat, lng, label, count in zip(start_stations_long_lat['Start station latitude'], start_stations_long_lat['Start station longitude'], start_stations_long_lat['Start station'], start_stations_long_lat['counts']):
    if isinstance(lat, float) and isinstance(lng, float):
        folium.Marker(
            [lat, lng],
            popup=label,
            icon=folium.Icon(color='blue', icon='bicycle', prefix='fa', icon_size=(1,1)),
            tooltip=f"{count} trips"
        ).add_to(start_station_map)

# Display the map
start_station_map

### End Stations

In [678]:
# create a new df of end stations and their count for the years 2011 and 2012
end_station_count = trip_history['End station'].value_counts().reset_index()
end_station_count.columns = ['Station', 'Count']
display(end_station_count)

Unnamed: 0,Station,Count
0,Massachusetts Ave & Dupont Circle NW,118707
1,15th & P St NW,83457
2,Columbus Circle / Union Station,79681
3,17th & Corcoran St NW,67382
4,14th & V St NW,59658
...,...,...
189,Branch & Pennsylvania Ave SE,160
190,Fairfax Village,121
191,21st St & Constitution Ave NW,88
192,11th & M St NW,87


### Merge start_stations_long_lat and end_station datasets

In [679]:
# create a for loop that adds the longitude value on a new column called 'End station longitude' to the end_station_long df when the Station Name matches in both df respectively
end_station_long = []
for i in end_station_count['Station']:
    if i in start_stations_long_lat['Start station'].values:
        end_station_long.append(start_stations_long_lat.loc[start_stations_long_lat['Start station'] == i, 'Start station longitude'].iloc[0])

# count the values on this list
display(len(end_station_count))

# create a for loop that adds the longitude value on a new column called 'End station latitude' to the end_station_lat df when the Station Name matches in both df respectively
end_station_lat = []
for i in end_station_count['Station']:
    if i in start_stations_long_lat['Start station'].values:
        end_station_lat.append(start_stations_long_lat.loc[start_stations_long_lat['Start station'] == i, 'Start station latitude'].iloc[0])
# count the values on this list
display(len(end_station_count))

194

194

In [680]:
display(len(end_station_lat))
display(len(end_station_long))

192

192

In [689]:
# Iterate over station names in end_station_count
for station in end_station_count['Station']:
    if station in start_stations_long_lat['Start station'].values:
        # Retrieve latitude and longitude values if station name matches
        lat = start_stations_long_lat.loc[start_stations_long_lat['Start station'] == station, 'Start station latitude'].iloc[0]
        long = start_stations_long_lat.loc[start_stations_long_lat['Start station'] == station, 'Start station longitude'].iloc[0]
    else:
        # Assign NaN for missing latitude and longitude values
        lat = np.nan
        long = np.nan

# Append latitude and longitude values to end_station_lat and end_station_long lists
end_station_lat.append(lat)
end_station_long.append(long)

In [690]:
display(len(end_station_lat))
display(len(end_station_long))

194

194

In [683]:
# Check for missing values in the index and lists
print("Missing values in index:", end_station_count.index.isnull().sum())
print("Missing values in longitude list:", pd.isnull(end_station_long).sum())
print("Missing values in latitude list:", pd.isnull(end_station_lat).sum())


Missing values in index: 0
Missing values in longitude list: 0
Missing values in latitude list: 0


In [697]:
end_station_long_lat = end_station_count 

In [692]:
# Assign the latitude and longitude lists to the end_station_count DataFrame
end_station_count['End station latitude'] = end_station_lat
end_station_count['End station longitude'] = end_station_long

In [693]:
# Check the lengths of the lists and DataFrame
print("Length of end_station_count DataFrame:", len(end_station_count))
print("Length of end_station_lat list:", len(end_station_lat))
print("Length of end_station_long list:", len(end_station_long))

Length of end_station_count DataFrame: 194
Length of end_station_lat list: 194
Length of end_station_long list: 194


In [698]:
# export the missing_stations_df to a csv file to the datasets folder in order to add the coordinates manually
# end_station_long_lat.to_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/end_stations_long_lat.csv', index=False)

# once you have exported this df, is important to comment the code above to avoid overwriting the file

In [699]:
# Find missing station names in start_stations_long_lat
missing_stations = set(end_station_count['Station']) - set(start_stations_long_lat['Start station'])

# Print the missing station names
print("Missing Station Names:")
for station in missing_stations:
    print(station)

Missing Station Names:
Lincoln Park / 13th & East Capitol St NE 
Utah St & 11th St N 


In [701]:
# import the end_station_long_lat dataset with the missing values filled in manually
end_stations_long_lat = pd.read_csv('/Users/maximilianolopezsalgado/data_projects/capital_bike_sharing/datasets/end_stations_long_lat.csv', sep=',')

# check if there are missing values
display(end_stations_long_lat.isnull().sum())

# check the first 5 rows of the end_stations_long_lat dataset
display(end_stations_long_lat.head())


Station                  0
Count                    0
End station latitude     0
End station longitude    0
dtype: int64

Unnamed: 0,Station,Count,End station latitude,End station longitude
0,Massachusetts Ave & Dupont Circle NW,118707,38.9101,-77.0444
1,15th & P St NW,83457,38.909627,-77.044409
2,Columbus Circle / Union Station,79681,38.897027,-77.004915
3,17th & Corcoran St NW,67382,38.912083,-77.038646
4,14th & V St NW,59658,38.918116,-77.035802


In [707]:
# Create a heatmap layer using the end station coordinates and counts
heat_data = end_stations_long_lat[['End station latitude', 'End station longitude', 'Count']].values.tolist()
end_station_map = folium.Map(location=[38.9072, -77.0369], zoom_start=12)
HeatMap(heat_data).add_to(end_station_map)

# Add markers to the map with the count number of each end station
for lat, lng, label, count in zip(end_stations_long_lat['End station latitude'], end_stations_long_lat['End station longitude'], end_stations_long_lat['Station'], end_stations_long_lat['Count']):
    if isinstance(lat, float) and isinstance(lng, float):
        folium.Marker(
            [lat, lng],
            popup=label,
            icon=folium.Icon(color='blue', icon='bicycle', prefix='fa', icon_size=(1,1)),
            tooltip=f"{count} trips"
        ).add_to(end_station_map)

# Display the map
end_station_map