In [1]:
import pandas as pd
pd.set_option("display.max_columns", 150)
import sqlite3
import glob
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Collect the paths of all the 2019 trip files.
csv_files = glob.glob('../data/2019/*.csv')

# Read every file first and concatenate once at the end. Calling pd.concat
# inside the loop re-copies the whole accumulated frame for each new file.
trip_frames = [pd.read_csv(csv_file, low_memory=False) for csv_file in csv_files]

# pd.concat raises ValueError on an empty list, so keep the previous behaviour
# of ending up with an empty dataframe when no file matches the pattern.
citibike_2019 = pd.concat(trip_frames) if trip_frames else pd.DataFrame()

# Importing the weather file.
weather = pd.read_csv('../data/USW00094728.csv', low_memory=False)

In [3]:
# Sanity check: (rows, columns) of the combined 2019 trips dataframe.
print(citibike_2019.shape)



(20551697, 15)


In [4]:
# Give the raw columns space-free names, then drop the columns that are not
# also available in the 2021 df.
citibike_2019 = (
    citibike_2019
    .rename(columns={
        'starttime': 'started_at',
        'stoptime': 'ended_at',
        'start station id': 'start_station_id',
        'start station name': 'start_station_name',
        'start station latitude': 'start_lat',
        'start station longitude': 'start_lng',
        'end station id': 'end_station_id',
        'end station name': 'end_station_name',
        'end station latitude': 'end_lat',
        'end station longitude': 'end_lng',
        'usertype': 'user_type',
        'tripduration': 'trip_duration',
    })
    .drop(columns=['birth year', 'gender', 'bikeid'])
)

In [5]:
# Helper that casts the given columns of a dataframe to a datetime dtype.

def convert_datetime(df, columns):
    """Overwrite each named column of ``df`` with its datetime-parsed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    columns : iterable of str
        Names of the columns to run through ``pd.to_datetime``.

    Returns
    -------
    None
    """
    for name in columns:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed

# Trip start/end timestamp columns to parse.
# NOTE: `columns` is a notebook-level name that later cells reassign.
columns = ['started_at', 'ended_at']

convert_datetime(citibike_2019, columns)

In [6]:
# A function that will make new columns for the date, time, hour, day of the
# week and day of the year each trip began and ended.

def datetime_date_time(df, columns):
    """Add date/time component columns for each timestamp column, in place.

    For every ``column`` in ``columns`` the following columns are written to
    ``df``: ``{column}_date``, ``{column}_time``, ``{column}_hour``,
    ``{column}_day_of_week`` (from ``.dt.weekday``) and
    ``{column}_day_of_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    columns : iterable of str
        Names of the timestamp columns to decompose.

    Returns
    -------
    None
    """
    for column in columns:
        # Parse once per column; the original re-ran pd.to_datetime for each
        # of the five derived fields.
        stamps = pd.to_datetime(df[column]).dt
        df[f'{column}_date'] = stamps.date
        df[f'{column}_time'] = stamps.time
        df[f'{column}_hour'] = stamps.hour
        df[f'{column}_day_of_week'] = stamps.weekday
        df[f'{column}_day_of_year'] = stamps.dayofyear
        
# Derive the date/time/hour/weekday/day-of-year columns for both trip timestamps.
columns = ['started_at', 'ended_at']

datetime_date_time(citibike_2019, columns)

In [7]:
# Using the convert_datetime function to convert the derived date columns to a
# datetime dtype. They were built with `.dt.date`, so they are cast back here
# before being used as a merge key against a datetime `date` column.

columns = ['started_at_date', 'ended_at_date']

convert_datetime(citibike_2019, columns)

In [8]:
# Helper that adds a month-number column for each timestamp column.

def datetime_month(df, columns):
    """Write a ``{column}_month`` column to ``df`` for every given column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    columns : iterable of str
        Names of the timestamp columns whose month is extracted via
        ``pd.DatetimeIndex(...).month``.

    Returns
    -------
    None
    """
    for source in columns:
        month_numbers = pd.DatetimeIndex(df[source]).month
        df[f'{source}_month'] = month_numbers
        
# Add the month columns for both trip timestamps.
columns = ['started_at', 'ended_at']

datetime_month(citibike_2019, columns)

In [9]:
#https://stackoverflow.com/questions/29688899/pandas-checking-if-a-date-is-a-holiday-and-assigning-boolean-value

# A dataframe with one row per calendar day of 2019 and a boolean flag telling
# whether that day is in the US federal holiday calendar.

dr = pd.date_range(start='2019-01-01', end='2019-12-31')

# Holidays that fall inside the date range built above.
cal = calendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

df = pd.DataFrame({'date': dr})
df['holiday'] = df['date'].isin(holidays)
print(df)

          date  holiday
0   2019-01-01     True
1   2019-01-02    False
2   2019-01-03    False
3   2019-01-04    False
4   2019-01-05    False
..         ...      ...
360 2019-12-27    False
361 2019-12-28    False
362 2019-12-29    False
363 2019-12-30    False
364 2019-12-31    False

[365 rows x 2 columns]


In [10]:
# Attach the holiday flag to every trip by matching the trip's start date to
# the holiday dataframe, then discard the duplicated `date` key column.

citibike_2019 = (
    citibike_2019
    .merge(df, left_on='started_at_date', right_on='date', how='left')
    .drop(columns=['date'])
)

In [11]:
## Number of rides that were started on each holiday.
def holiday_rides_count(df):
    """Count the rides per start date, restricted to rows flagged as holidays.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with a ``holiday`` flag column and a ``started_at_date``
        column.

    Returns
    -------
    pandas.Series
        Ride counts indexed by ``started_at_date``, sorted from the busiest
        date to the quietest.
    """
    is_holiday = df['holiday'] == True
    rides_per_date = df[is_holiday].groupby('started_at_date')['started_at_date'].count()
    return rides_per_date.sort_values(ascending=False)
    
# Rides per holiday for 2019, busiest holiday first.
holiday_rides_2019 = holiday_rides_count(citibike_2019)

print("2019 Holiday Rides Count:")
print(holiday_rides_2019)


2019 Holiday Rides Count:
started_at_date
2019-10-14    77630
2019-05-27    66595
2019-11-11    64883
2019-07-04    52712
2019-09-02    37792
2019-02-18    25827
2019-01-01    21962
2019-11-28    16173
2019-12-25    13082
2019-01-21    10291
Name: started_at_date, dtype: int64


**Preparing the weather data**

In [12]:
# Lowercase every column title.
weather.columns = [title.lower() for title in weather.columns]

# Parse the date column into a datetime dtype.
weather['date'] = pd.to_datetime(weather['date'])

# Keep only the rows from the years of interest.
weather = weather.loc[weather['date'].dt.year.isin([2019])]

In [13]:
# Count how many NaNs each column holds.
nan_counts = weather.isna().sum()

# Names of the columns that have more than 500 NaNs.
too_many_nans = nan_counts.loc[nan_counts > 500].index

# Drop those columns.
weather = weather.drop(columns=too_many_nans)

In [23]:
# Number of trips started on each date, with the date key renamed to `date`.
date_count_2019 = (
    citibike_2019
    .groupby('started_at_date')
    .size()
    .reset_index(name='num_of_trips')
    .rename(columns={'started_at_date': 'date'})
)

In [29]:
# Add the columns of `df` to the daily counts by matching on `date`.
# NOTE(review): `df` is expected to be the holiday dataframe (date, holiday) - confirm it has not been reassigned.
date_count_2019 = date_count_2019.merge(df, on='date', how='left')

***Answering data questions***

*What time are the bikes most frequently used during the day?*

In [None]:
# The start hour in which the most trips occur.
def most_hour(df, year):
    """Print the ``started_at_hour`` value with the highest trip count.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with a ``started_at_hour`` column.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None
    """
    trips_per_hour = df.groupby('started_at_hour')['started_at_hour'].count()
    ranked = trips_per_hour.sort_values(ascending=False)
    most_popular, number_of_rides = ranked.index[0], ranked.iloc[0]
    print(f"The hour with the most amount of trips in {year} was {most_popular} with {number_of_rides} rides.")

# Report the busiest start hour for the 2019 trips.
most_hour(citibike_2019, 2019)

*What time are the bikes used the least during the day?*

In [None]:
# The start hour in which the fewest trips occur.
def least_hour(df, year):
    """Print the ``started_at_hour`` value with the lowest trip count.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with a ``started_at_hour`` column.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None
    """
    trips_per_hour = df.groupby('started_at_hour')['started_at_hour'].count()
    ranked = trips_per_hour.sort_values(ascending=True)
    least_popular, number_of_rides = ranked.index[0], ranked.iloc[0]
    print(f"The hour with the least amount of trips in {year} was {least_popular} with {number_of_rides} rides.")

# Report the quietest start hour for the 2019 trips.
least_hour(citibike_2019, 2019)

*Which stations are used most frequently to start and to end a trip?*

In [None]:
# The station where the most trips begin.
def popular_starting_station(df, year):
    """Print the ``start_station_name`` with the highest trip count.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with a ``start_station_name`` column.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None
    """
    trips_per_station = df.groupby('start_station_name')['start_station_name'].count()
    ranked = trips_per_station.sort_values(ascending=False)
    most_popular, number_of_rides = ranked.index[0], ranked.iloc[0]
    print(f"The station where most trips began in {year} was {most_popular} with {number_of_rides} rides.")

# Report the most common start station for the 2019 trips.
popular_starting_station(citibike_2019, 2019)

In [None]:
# The station where the most trips end.
def popular_ending_station(df, year):
    """Print the ``end_station_name`` with the highest trip count.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with an ``end_station_name`` column.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None
    """
    trips_per_station = df.groupby('end_station_name')['end_station_name'].count()
    ranked = trips_per_station.sort_values(ascending=False)
    most_popular, number_of_rides = ranked.index[0], ranked.iloc[0]
    print(f"The station where most trips ended in {year} was {most_popular} with {number_of_rides} rides.")

# Report the most common end station for the 2019 trips.
popular_ending_station(citibike_2019, 2019)

**What is the average trip length? Does it change depending on day or time of day?**

In [None]:
# The average bike trip.
def avg_bike_trip(df, year, columns=('trip_duration',)):
    """Print the mean of each duration column, rounded to a whole number.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame containing every column named in ``columns``.
    year : int or str
        Label interpolated into the printed message.
    columns : iterable of str, optional
        Columns to average; one line is printed per column. Defaults to
        ``('trip_duration',)``. This used to be read from a notebook-level
        ``columns`` variable, which made the output depend on whichever cell
        had last reassigned that name.

    Returns
    -------
    None
    """
    for column in columns:
        avg_trip_len = round(df[column].mean())
        print(f"The average length of a bike trip in {year} is {avg_trip_len} seconds")
        
# Reassigns the notebook-level `columns` name to the duration column; any
# function that reads `columns` as a global sees this value from here on.
columns = ['trip_duration']
# Report the overall average trip length for 2019.
avg_bike_trip(citibike_2019, 2019)

In [None]:
# The hour with the longest bike trip on average.
def avg_bike_trip_by_hour(df, year):
    """Print the start hour whose trips have the highest mean duration.

    The previous version wrapped this logic in ``for column in columns:`` over
    a notebook-level variable without ever using ``column``. It therefore
    failed with NameError when that global was missing and printed the same
    line repeatedly when it held several entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with ``started_at_hour`` and ``trip_duration`` columns.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` yields no groups (for example, it has no rows).
    """
    mean_by_hour = df.groupby('started_at_hour')['trip_duration'].mean()
    # Rank on the unrounded means, then round for display.
    avg_trip_len = mean_by_hour.sort_values(ascending=False).round()
    hour_with_longest_trips = avg_trip_len.index[0]
    avg_seconds = avg_trip_len.iloc[0]
    print(f"In {year}, the hour that has the longest bike trip on average is hour {hour_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
# Report the start hour with the longest average trip for 2019.
avg_bike_trip_by_hour(citibike_2019, 2019)

In [None]:
# The hour with the shortest bike trip on average.
def avg_bike_trip_by_hour_least(df, year):
    """Print the start hour whose trips have the lowest mean duration.

    The previous version wrapped this logic in ``for column in columns:`` over
    a notebook-level variable without ever using ``column``. It therefore
    failed with NameError when that global was missing and printed the same
    line repeatedly when it held several entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with ``started_at_hour`` and ``trip_duration`` columns.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` yields no groups (for example, it has no rows).
    """
    mean_by_hour = df.groupby('started_at_hour')['trip_duration'].mean()
    # Rank on the unrounded means, then round for display.
    avg_trip_len = mean_by_hour.sort_values(ascending=True).round()
    hour_with_shortest_trips = avg_trip_len.index[0]
    avg_seconds = avg_trip_len.iloc[0]
    print(f"In {year}, the hour that has the shortest bike trip on average is hour {hour_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
# Report the start hour with the shortest average trip for 2019.
avg_bike_trip_by_hour_least(citibike_2019, 2019)

In [None]:
# The day of year with the longest trips on average.
def avg_bike_trip_by_day_of_year(df, year):
    """Print the start day-of-year whose trips have the highest mean duration.

    The previous version wrapped this logic in ``for column in columns:`` over
    a notebook-level variable without ever using ``column``. It therefore
    failed with NameError when that global was missing and printed the same
    line repeatedly when it held several entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with ``started_at_day_of_year`` and ``trip_duration``
        columns.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` yields no groups (for example, it has no rows).
    """
    mean_by_day = df.groupby('started_at_day_of_year')['trip_duration'].mean()
    # Rank on the unrounded means, then round for display.
    avg_trip_len = mean_by_day.sort_values(ascending=False).round()
    day_of_year_with_longest_trips = avg_trip_len.index[0]
    avg_seconds = avg_trip_len.iloc[0]
    print(f"In {year}, the day of year that has the longest bike trip on average is day {day_of_year_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
# Report the day of year with the longest average trip for 2019.
avg_bike_trip_by_day_of_year(citibike_2019, 2019)

In [None]:
# The day of year with the shortest trips on average.
def avg_bike_trip_by_day_of_year_shortest(df, year):
    """Print the start day-of-year whose trips have the lowest mean duration.

    The previous version wrapped this logic in ``for column in columns:`` over
    a notebook-level variable without ever using ``column``. It therefore
    failed with NameError when that global was missing and printed the same
    line repeatedly when it held several entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips frame with ``started_at_day_of_year`` and ``trip_duration``
        columns.
    year : int or str
        Label interpolated into the printed message.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``df`` yields no groups (for example, it has no rows).
    """
    mean_by_day = df.groupby('started_at_day_of_year')['trip_duration'].mean()
    # Rank on the unrounded means, then round for display.
    avg_trip_len = mean_by_day.sort_values(ascending=True).round()
    day_of_year_with_shortest_trips = avg_trip_len.index[0]
    avg_seconds = avg_trip_len.iloc[0]
    print(f"In {year}, the day of year that has the shortest bike trip on average is day {day_of_year_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
# Report the day of year with the shortest average trip for 2019.
avg_bike_trip_by_day_of_year_shortest(citibike_2019, 2019)