In [1]:
import pandas as pd
pd.set_option("display.max_columns", 150)
import sqlite3
import glob
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#importing all the files.
csv_files = glob.glob('../data/2019/*.csv')

#Creating the empty dataframes so that the files can be added to.

citibike_2019 = pd.DataFrame()


# Iterating over the 2019 files so that they are in a dataframe.
for csv_file in csv_files:
    df = pd.read_csv(csv_file, low_memory = False)
    citibike_2019 = pd.concat([citibike_2019, df])


In [3]:
#removing spaces out of column names.
citibike_2019 = citibike_2019.rename(columns = {'starttime': 'started_at', 'stoptime' : 'ended_at','start station id': 'start_station_id', 'start station name': 'start_station_name', 'start station latitude':'start_lat', 'start station longitude':'start_lng', 'end station id':'end_station_id', 'end station name':'end_station_name', 'end station latitude':'end_lat','end station longitude':'end_lng', 'usertype':'user_type', 'tripduration' : 'trip_duration'})

#removing columns that are not also available in 2021 df
citibike_2019 = citibike_2019.drop(columns = ['birth year', 'gender', 'bikeid'])

In [4]:
#A function that changed the dtype of these columns to datetime.

def convert_datetime(df, columns):
    for column in columns:
        df[column] = pd.to_datetime(df[column])

columns = ['started_at', 'ended_at']

convert_datetime(citibike_2019, columns)

In [5]:
# A function that will make new columns for the date, time, hour and day of the week each trip began and ended.

def datetime_date_time(df, columns):
    for column in columns:
        df[f'{column}_date'] = pd.to_datetime(df[column]).dt.date
        df[f'{column}_time'] = pd.to_datetime(df[column]).dt.time
        df[f'{column}_hour'] = pd.to_datetime(df[column]).dt.hour
        df[f'{column}_day_of_week'] = pd.to_datetime(df[column]).dt.weekday
        df[f'{column}_day_of_year']= pd.to_datetime(df[column]).dt.dayofyear
        
columns = ['started_at', 'ended_at']

datetime_date_time(citibike_2019, columns)

In [6]:
#using the convert_datetime function to convert the date column to a datetime dtype.

columns = ['started_at_date', 'ended_at_date']

convert_datetime(citibike_2019, columns)

In [7]:
#A function that will create a column for the month of the trip.

def datetime_month(df, columns):
    for column in columns:
        df[f'{column}_month'] = pd.DatetimeIndex(df[column]).month
        
columns = ['started_at', 'ended_at']

datetime_month(citibike_2019, columns)

In [8]:
#https://stackoverflow.com/questions/29688899/pandas-checking-if-a-date-is-a-holiday-and-assigning-boolean-value

#A datframe that will define if a date was a holiday.

dr = pd.date_range(start='2019-01-01', end='2019-12-31')
df = pd.DataFrame()
df['date'] = dr

cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

df['holiday'] = df['date'].isin(holidays)


In [9]:
citibike_2019['start_station_id'].value_counts().head(20)

519.0     156575
497.0     121781
3255.0    119958
402.0     113138
285.0     113012
435.0     110305
426.0     105636
499.0     103167
358.0     101413
514.0      99300
459.0      96991
490.0      95138
293.0      93809
477.0      90092
3641.0     89806
379.0      86406
523.0      86096
3711.0     85203
151.0      85076
465.0      84659
Name: start_station_id, dtype: int64

In [10]:
citibike_2019['end_station_id'].value_counts().head(20)

519.0     155536
497.0     125466
402.0     123146
3255.0    121631
426.0     116003
285.0     114178
435.0     110523
514.0     103748
358.0     101823
499.0      99931
459.0      99754
293.0      97613
490.0      94567
477.0      89814
3641.0     88210
379.0      86251
523.0      86076
3263.0     85364
151.0      85294
491.0      85079
Name: end_station_id, dtype: int64

In [11]:
citibike_2019['trip_duration'].describe()

count    2.055170e+07
mean     9.782455e+02
std      1.055304e+04
min      6.100000e+01
25%      3.620000e+02
50%      6.150000e+02
75%      1.079000e+03
max      3.812666e+06
Name: trip_duration, dtype: float64

In [12]:
citibike_2019

Unnamed: 0,trip_duration,started_at,ended_at,start_station_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,user_type,started_at_date,started_at_time,started_at_hour,started_at_day_of_week,started_at_day_of_year,ended_at_date,ended_at_time,ended_at_hour,ended_at_day_of_week,ended_at_day_of_year,started_at_month,ended_at_month
0,393,2019-08-01 00:00:01.468,2019-08-01 00:06:35.378,531.0,Forsyth St & Broome St,40.718939,-73.992663,408.0,Market St & Cherry St,40.710762,-73.994004,Subscriber,2019-08-01,00:00:01.468000,0,3,213,2019-08-01,00:06:35.378000,0,3,213,8,8
1,627,2019-08-01 00:00:01.929,2019-08-01 00:10:29.784,274.0,Lafayette Ave & Fort Greene Pl,40.686919,-73.976682,3409.0,Bergen St & Smith St,40.686744,-73.990632,Subscriber,2019-08-01,00:00:01.929000,0,3,213,2019-08-01,00:10:29.784000,0,3,213,8,8
2,1132,2019-08-01 00:00:04.048,2019-08-01 00:18:56.165,2000.0,Front St & Washington St,40.702551,-73.989402,3388.0,President St & Henry St,40.682800,-73.999904,Subscriber,2019-08-01,00:00:04.048000,0,3,213,2019-08-01,00:18:56.165000,0,3,213,8,8
3,1780,2019-08-01 00:00:04.163,2019-08-01 00:29:44.794,479.0,9 Ave & W 45 St,40.760193,-73.991255,473.0,Rivington St & Chrystie St,40.721101,-73.991925,Subscriber,2019-08-01,00:00:04.163000,0,3,213,2019-08-01,00:29:44.794000,0,3,213,8,8
4,1517,2019-08-01 00:00:05.458,2019-08-01 00:25:23.455,3312.0,1 Ave & E 94 St,40.781721,-73.945940,3312.0,1 Ave & E 94 St,40.781721,-73.945940,Subscriber,2019-08-01,00:00:05.458000,0,3,213,2019-08-01,00:25:23.455000,0,3,213,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955205,93,2019-12-31 23:58:18.016,2019-12-31 23:59:51.636,3141.0,1 Ave & E 68 St,40.765005,-73.958185,3142.0,1 Ave & E 62 St,40.761227,-73.960940,Subscriber,2019-12-31,23:58:18.016000,23,1,365,2019-12-31,23:59:51.636000,23,1,365,12,12
955206,786,2019-12-31 23:58:59.896,2020-01-01 00:12:06.346,490.0,8 Ave & W 33 St,40.751551,-73.993934,513.0,W 56 St & 10 Ave,40.768254,-73.988639,Subscriber,2019-12-31,23:58:59.896000,23,1,365,2020-01-01,00:12:06.346000,0,2,1,12,1
955207,351,2019-12-31 23:59:03.695,2020-01-01 00:04:54.873,3349.0,Grand Army Plaza & Plaza St West,40.672968,-73.970880,3368.0,5 Ave & 3 St,40.672815,-73.983524,Subscriber,2019-12-31,23:59:03.695000,23,1,365,2020-01-01,00:04:54.873000,0,2,1,12,1
955208,1571,2019-12-31 23:59:21.361,2020-01-01 00:25:32.942,252.0,MacDougal St & Washington Sq,40.732264,-73.998522,366.0,Clinton Ave & Myrtle Ave,40.693261,-73.968896,Subscriber,2019-12-31,23:59:21.361000,23,1,365,2020-01-01,00:25:32.942000,0,2,1,12,1


**Using the weather**

In [13]:
#importing the weather file.
cols = ['PRCP', 'SNOW', 'TMAX', 'TMIN', 'DATE', 'RHAV', 'AWND', 'RHMN', 'RHMX']
weather = pd.read_csv('../data/USW00094728.csv',usecols = cols, low_memory = False)

In [14]:
#changing all the column titles to lowercase.
weather.columns = map(str.lower, weather.columns)

#changing the DATE column to a datetime datatype.
weather['date'] = pd.to_datetime(weather['date'])

#choosing the years I want data in.
weather = weather[weather['date'].dt.year.isin([2019])].reset_index(drop = True)

In [15]:
day_trips = citibike_2019.groupby('started_at_date').size().reset_index(name = 'num_of_trips')
day_trips = day_trips.rename(columns = {'started_at_date' : 'date'})

In [16]:
day_trips = day_trips.merge(df, left_on = 'date', right_on = 'date', how = 'left')
day_trips = day_trips.merge(weather, left_on = 'date', right_on = 'date', how = 'left')

In [17]:
#Changing the temperature to fahrenheit.
#the original temperature was the celsius temp x 10. 
def to_fahrenheit(df, columns):
    for column in columns:
        df[f'{column}_fahrenheit'] = (df[column]/10)*(9/5)+32
columns = ['tmax', 'tmin']

to_fahrenheit(day_trips, columns)

In [18]:
#Droping the original temp columns and renaming the farenheit temp columns.

day_trips = day_trips.drop(columns = ['tmax','tmin'])
day_trips = day_trips.rename(columns = {'tmax_fahrenheit' : 'tmax', 'tmin_fahrenheit' : 'tmin'})

day_trips['tavg'] = (day_trips['tmax'] + day_trips['tmin'])/2

In [19]:
median_trips = round(citibike_2019.groupby('started_at_date')['trip_duration'].median(), 2) #df that has the date and medians trip duration.
day_trips = day_trips.merge(median_trips, left_on = 'date', right_on = 'started_at_date', how = 'left') #merging the df to day_trips.
day_trips = day_trips.rename(columns = {'trip_duration' : 'median_trip_duration'}) #changing the name of the column.

In [20]:
mean_trips = round(citibike_2019.groupby('started_at_date')['trip_duration'].mean(), 2)
day_trips = day_trips.merge(mean_trips, left_on = 'date', right_on = 'started_at_date', how = 'left') #merging the df to day_trips.
day_trips = day_trips.rename(columns = {'trip_duration' : 'mean_trip_duration'}) #changing the name of the column.

In [21]:
#EXTRAAA
# day_trips = day_trips.merge(mean_trips, left_on = 'date', right_on = 'started_at_date', how = 'left') #merging the df to day_trips.
# day_trips = day_trips.rename(columns = {'trip_duration' : 'mean_trip_duration'}) #changing the name of the column.

In [22]:
#Adding a day of the week column.
day_trips['day_of_week'] = pd.to_datetime(day_trips['date']).dt.weekday

In [23]:
day_trips

Unnamed: 0,date,num_of_trips,holiday,prcp,snow,awnd,rhav,rhmn,rhmx,tmax,tmin,tavg,median_trip_duration,mean_trip_duration,day_of_week
0,2019-01-01,21962,True,15,0.0,,69.0,47.0,97.0,57.92,39.02,48.47,663.0,1175.93,1
1,2019-01-02,37797,False,0,0.0,,56.0,48.0,68.0,39.92,35.06,37.49,548.0,780.93,2
2,2019-01-03,41676,False,0,0.0,,63.0,43.0,83.0,44.06,37.04,40.55,533.0,769.52,3
3,2019-01-04,43922,False,0,0.0,,63.0,51.0,74.0,46.94,35.06,41.00,535.0,778.15,4
4,2019-01-05,17432,False,127,0.0,,91.0,71.0,97.0,46.94,41.00,43.97,476.0,811.47,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2019-12-27,31808,False,0,0.0,20.0,73.0,66.0,83.0,53.96,46.04,50.00,560.0,984.35,4
361,2019-12-28,28520,False,0,0.0,18.0,61.0,40.0,89.0,51.08,42.98,47.03,617.0,1045.72,5
362,2019-12-29,17968,False,64,0.0,22.0,71.0,55.0,97.0,44.06,39.02,41.54,545.0,1120.37,6
363,2019-12-30,12138,False,188,0.0,57.0,96.0,89.0,100.0,41.00,37.04,39.02,475.0,716.91,0


In [None]:
# #saving df to csv
# day_trips.to_csv('../data/day_trips_2019')

***Answering data questions***

*What time are the bikes most frequently used during the day?*

In [None]:
#The hour the most amount of trips occur.
def most_hour (df, year):
    hour_count = df.groupby('started_at_hour')['started_at_hour'].count().sort_values(ascending=False)
    most_popular = hour_count.index[0]
    number_of_rides = hour_count.iloc[0]
    print(f"The hour with the most amount of trips in {year} was {most_popular} with {number_of_rides} rides.")

most_hour(citibike_2019, 2019)

*What time are the bikes used the least during the day?*

In [None]:
#The hour the least amount of trips occur.
def least_hour (df, year):
    hour_count = df.groupby('started_at_hour')['started_at_hour'].count().sort_values(ascending=True)
    least_popular = hour_count.index[0]
    number_of_rides = hour_count.iloc[0]
    print(f"The hour with the least amount of trips in {year} was {least_popular} with {number_of_rides} rides.")

least_hour(citibike_2019, 2019)

*Which stations are the most frequent for starting & ending a trip?*

In [None]:
#The station where the most trips begin
def popular_starting_station (df, year):
    station_count = df.groupby('start_station_name')['start_station_name'].count().sort_values(ascending=False)
    most_popular = station_count.index[0]
    number_of_rides = station_count.iloc[0]
    print(f"The station where most trips began in {year} was {most_popular} with {number_of_rides} rides.")

popular_starting_station(citibike_2019, 2019)

In [None]:
#The station where the most trips ended.
def popular_ending_station (df, year):
    station_count = df.groupby('end_station_name')['end_station_name'].count().sort_values(ascending=False)
    most_popular = station_count.index[0]
    number_of_rides = station_count.iloc[0]
    print(f"The station where most trips ended in {year} was {most_popular} with {number_of_rides} rides.")

popular_ending_station(citibike_2019, 2019)

**What is the average trip length? Does it change depending on day or time of day?**

In [None]:
#The average bike trip.
def avg_bike_trip (df, year):
    for column in columns:
        avg_trip_len = round(df[column].mean())
        print(f"The average length of a bike trip in {year} is {avg_trip_len} seconds")
        
columns = ['trip_duration']
avg_bike_trip(citibike_2019, 2019)

In [None]:
#The hour with the longest bike trip.
def avg_bike_trip_by_hour (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_hour')['trip_duration'].mean().sort_values(ascending = False))
        hour_with_longest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the hour that has the longest bike trip on average is hour {hour_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_hour(citibike_2019, 2019)

In [None]:
#The hour with the shortest bike trip.
def avg_bike_trip_by_hour_least (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_hour')['trip_duration'].mean().sort_values(ascending = True))
        hour_with_shortest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the hour that has the shortest bike trip on average is hour {hour_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_hour_least(citibike_2019, 2019)

In [None]:
#The day of year with the longest trips.
def avg_bike_trip_by_day_of_year (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_day_of_year')['trip_duration'].mean().sort_values(ascending = False))
        day_of_year_with_longest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the day of year that has the longest bike trip on average is day {day_of_year_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_day_of_year(citibike_2019, 2019)

In [None]:
#The day of year with the shortest trips.
def avg_bike_trip_by_day_of_year_shortest (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_day_of_year')['trip_duration'].mean().sort_values(ascending = True))
        day_of_year_with_shortest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the day of year that has the shortest bike trip on average is day {day_of_year_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_day_of_year_shortest(citibike_2019, 2019)