In [1]:
import pandas as pd
pd.set_option("display.max_columns", 150)
import glob

In [2]:
#importing all the files and keeping them seperated by year.
csv_files = glob.glob('../data/2019/*.csv')
csv_files_1 = glob.glob('../data/2020/*.csv')
csv_files_2 = glob.glob('../data/2021/*.csv')

#importing the weather file.
weather = pd.read_csv('../data/USW00094728.csv', low_memory = False)

In [4]:
#Creating the empty dataframes so that the files can be added to.

citibike_2019 = pd.DataFrame()
citibike_2020 = pd.DataFrame()
citibike_2021 = pd.DataFrame()

In [5]:
# Iterating over the 2019 files so that they are in a dataframe.
for csv_file in csv_files:
    df = pd.read_csv(csv_file, low_memory = False)
    citibike_2019 = pd.concat([citibike_2019, df])

In [6]:
# Iterating over the 2020 files so that they are in a dataframe.
for csv_file in csv_files_1:
    df = pd.read_csv(csv_file, low_memory = False)
    citibike_2020 = pd.concat([citibike_2020, df])

In [7]:
# Iterating over the 2021 files so that they are in a dataframe.
for csv_file in csv_files_2:
    df = pd.read_csv(csv_file, low_memory = False)
    citibike_2021 = pd.concat([citibike_2021, df])

In [67]:
print(citibike_2019.shape)
print(citibike_2020.shape)
print(citibike_2021.shape)
print(weather.shape)

(20551697, 24)
(19506857, 24)
(27661451, 24)
(1096, 40)


In [9]:
#changing the DATE column to a datetime datatype.
weather['DATE'] = pd.to_datetime(weather['DATE'])

In [10]:
#choosing the years I want data on.
weather = weather[weather['DATE'].dt.year.isin([2019, 2020, 2021])]

In [11]:
#Getting a count of how many nans are in each column.
nan_counts = weather.isna().sum()
too_many_nans = nan_counts[nan_counts > 500].index #locating the columns that hace more than 500 nans.
weather = weather.drop(columns = too_many_nans)# dropping the columns with more than 500 nans.

In [12]:
#renaming column to match 2019 & 2020 dfs.
citibike_2021 = citibike_2021.rename(columns = {'member_casual':'user_type'}) 

#removing the columns that were unique to 2021
citibike_2021 = citibike_2021.drop(columns = ['ride_id', 'rideable_type', 'starttime', 'stoptime', 'start station id', 'start station name', 'start station latitude', 'start station longitude', 'end station id', 'end station name', 'end station latitude', 'end station longitude','bikeid','usertype','birth year','gender', 'tripduration'])

In [13]:
#removing spaces out of column names.
citibike_2019 = citibike_2019.rename(columns = {'starttime': 'started_at', 'stoptime' : 'ended_at','start station id': 'start_station_id', 'start station name': 'start_station_name', 'start station latitude':'start_lat', 'start station longitude':'start_lng', 'end station id':'end_station_id', 'end station name':'end_station_name', 'end station latitude':'end_lat','end station longitude':'end_lng', 'usertype':'user_type', 'tripduration' : 'trip_duration'})
citibike_2020 = citibike_2020.rename(columns = {'starttime': 'started_at', 'stoptime' : 'ended_at','start station id': 'start_station_id', 'start station name': 'start_station_name', 'start station latitude':'start_lat', 'start station longitude':'start_lng', 'end station id':'end_station_id', 'end station name':'end_station_name', 'end station latitude':'end_lat','end station longitude':'end_lng', 'usertype':'user_type', 'tripduration' : 'trip_duration'})

#removing columns that are not also available in 2021 df
citibike_2019 = citibike_2019.drop(columns = ['birth year', 'gender', 'bikeid'])
citibike_2020 = citibike_2020.drop(columns = ['birth year', 'gender', 'bikeid'])

In [14]:
#A function that changed the dtype of these columns to datetime.

def convert_datetime(df, columns):
    for column in columns:
        df[column] = pd.to_datetime(df[column])

columns = ['started_at', 'ended_at']

convert_datetime(citibike_2019, columns)
convert_datetime(citibike_2020, columns)
convert_datetime(citibike_2021, columns)

In [15]:
#creating a column in 2021 dataframe that has the trip duration and then turning it into seconds.

citibike_2021['trip_duration'] = citibike_2021['ended_at'] - citibike_2021['started_at']
citibike_2021['trip_duration'] = citibike_2021['trip_duration'].dt.total_seconds()

In [42]:
# A function that will make new columns for the date, time, hour and day of the week each trip began and ended.

def datetime_date_time(df, columns):
    for column in columns:
        df[f'{column}_date'] = pd.to_datetime(df[column]).dt.date
        df[f'{column}_time'] = pd.to_datetime(df[column]).dt.time
        df[f'{column}_hour'] = pd.to_datetime(df[column]).dt.hour
        df[f'{column}_day_of_week'] = pd.to_datetime(df[column]).dt.weekday
        df[f'{column}_day_of_year']= pd.to_datetime(df[column]).dt.dayofyear
        
columns = ['started_at', 'ended_at']

datetime_date_time(citibike_2019, columns)
datetime_date_time(citibike_2020, columns)
datetime_date_time(citibike_2021, columns)

In [18]:
#using the convert_datetime function to convert the date column to a datetime dtype.

columns = ['started_at_date', 'ended_at_date']

convert_datetime(citibike_2019, columns)
convert_datetime(citibike_2020, columns)
convert_datetime(citibike_2021, columns)

In [19]:
#A function that will create a column for the month of the trip.

def datetime_month(df, columns):
    for column in columns:
        df[f'{column}_month'] = pd.DatetimeIndex(df[column]).month
        
columns = ['started_at', 'ended_at']

datetime_month(citibike_2019, columns)
datetime_month(citibike_2020, columns)
datetime_month(citibike_2021, columns)

***Answering data questions***

*What time are the bikes most frequently used during the day?*

In [28]:
#The hour the most amount of trips occur.
def most_hour (df, year):
    hour_count = df.groupby('started_at_hour')['started_at_hour'].count().sort_values(ascending=False)
    most_popular = hour_count.index[0]
    number_of_rides = hour_count.iloc[0]
    print(f"The hour with the most amount of trips in {year} was {most_popular} with {number_of_rides} rides.")

most_hour(citibike_2019, 2019)
most_hour(citibike_2020, 2020)
most_hour(citibike_2021, 2021)

The hour with the most amount of trips in 2019 was 17 with 2002482 rides.
The hour with the most amount of trips in 2020 was 17 with 1879814 rides.
The hour with the most amount of trips in 2021 was 17.0 with 2379834 rides.


*What time are the bikes used the least during the day?*

In [27]:
#The hour the least amount of trips occur.
def least_hour (df, year):
    hour_count = df.groupby('started_at_hour')['started_at_hour'].count().sort_values(ascending=True)
    least_popular = hour_count.index[0]
    number_of_rides = hour_count.iloc[0]
    print(f"The hour with the least amount of trips in {year} was {least_popular} with {number_of_rides} rides.")

least_hour(citibike_2019, 2019)
least_hour(citibike_2020, 2020)
least_hour(citibike_2021, 2021)

The hour with the least amount of trips in 2019 was 3 with 41559 rides.
The hour with the least amount of trips in 2020 was 4 with 38609 rides.
The hour with the least amount of trips in 2021 was 4.0 with 82777 rides.


*Which stations are the most frequent for starting & ending a trip?*

In [21]:
#The station where the most trips begin
def popular_starting_station (df, year):
    station_count = df.groupby('start_station_name')['start_station_name'].count().sort_values(ascending=False)
    most_popular = station_count.index[0]
    number_of_rides = station_count.iloc[0]
    print(f"The station where most trips began in {year} was {most_popular} with {number_of_rides} rides.")

popular_starting_station(citibike_2019, 2019)
popular_starting_station(citibike_2020, 2020)
popular_starting_station(citibike_2021, 2021)

The station where most trips began in 2019 was Pershing Square North with 156575 rides.
The station where most trips began in 2020 was 1 Ave & E 68 St with 100753 rides.
The station where most trips began in 2021 was W 21 St & 6 Ave with 122588 rides.


In [39]:
#The station where the most trips ended.
def popular_ending_station (df, year):
    station_count = df.groupby('end_station_name')['end_station_name'].count().sort_values(ascending=False)
    most_popular = station_count.index[0]
    number_of_rides = station_count.iloc[0]
    print(f"The station where most trips ended in {year} was {most_popular} with {number_of_rides} rides.")

popular_ending_station(citibike_2019, 2019)
popular_ending_station(citibike_2020, 2020)
popular_ending_station(citibike_2020, 2021)

The station where most trips ended in 2019 was Pershing Square North with 155536 rides.
The station where most trips ended in 2020 was West St & Chambers St with 101767 rides.
The station where most trips ended in 2021 was West St & Chambers St with 101767 rides.


**What is the average trip length? Does it change depending on day or time of day?**

In [52]:
#The average bike trip.
def avg_bike_trip (df, year):
    for column in columns:
        avg_trip_len = round(df[column].mean())
        print(f"The average length of a bike trip in {year} is {avg_trip_len} seconds")
        
columns = ['trip_duration']
avg_bike_trip(citibike_2019, 2019)
avg_bike_trip(citibike_2020, 2020)
avg_bike_trip(citibike_2021, 2021)

The average length of a bike trip in 2019 is 978 seconds
The average length of a bike trip in 2020 is 1311 seconds
The average length of a bike trip in 2021 is 1143 seconds


In [59]:
#The hour with the longest bike trip.
def avg_bike_trip_by_hour (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_hour')['trip_duration'].mean().sort_values(ascending = False))
        hour_with_longest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the hour that has the longest bike trip on average is hour {hour_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_hour(citibike_2019, 2019)
avg_bike_trip_by_hour(citibike_2020, 2020)
avg_bike_trip_by_hour(citibike_2021, 2021)

In 2019, the hour that has the longest bike trip on average is hour 2, which lasts about 1484.0 seconds
In 2020, the hour that has the longest bike trip on average is hour 2, which lasts about 2019.0 seconds
In 2021, the hour that has the longest bike trip on average is hour 2.0, which lasts about 1733.0 seconds


In [63]:
#The hour with the shortest bike trip.
def avg_bike_trip_by_hour_least (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_hour')['trip_duration'].mean().sort_values(ascending = True))
        hour_with_shortest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the hour that has the shortest bike trip on average is hour {hour_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_hour_least(citibike_2019, 2019)
avg_bike_trip_by_hour_least(citibike_2020, 2020)
avg_bike_trip_by_hour_least(citibike_2021, 2021)

In 2019, the hour that has the shortest bike trip on average is hour 6, which lasts about 659.0 seconds
In 2020, the hour that has the shortest bike trip on average is hour 6, which lasts about 872.0 seconds
In 2021, the hour that has the shortest bike trip on average is hour 6.0, which lasts about 803.0 seconds


In [65]:
#the day of year with the longest trips.
def avg_bike_trip_by_day_of_year (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_day_of_year')['trip_duration'].mean().sort_values(ascending = False))
        day_of_year_with_longest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the day of year that has the longest bike trip on average is day {day_of_year_with_longest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_day_of_year(citibike_2019, 2019)
avg_bike_trip_by_day_of_year(citibike_2020, 2020)
avg_bike_trip_by_day_of_year(citibike_2021, 2021)

In 2019, the day of year that has the longest bike trip on average is day 185, which lasts about 1668.0 seconds
In 2020, the day of year that has the longest bike trip on average is day 352, which lasts about 2211.0 seconds
In 2021, the day of year that has the longest bike trip on average is day 34.0, which lasts about 3618.0 seconds


In [66]:
#the day of year with the shortest trips.
def avg_bike_trip_by_day_of_year_shortest (df, year):
    for column in columns:
        avg_trip_len = round(df.groupby('started_at_day_of_year')['trip_duration'].mean().sort_values(ascending = True))
        day_of_year_with_shortest_trips = avg_trip_len.index[0]
        avg_seconds = avg_trip_len.iloc[0]
        print(f"In {year}, the day of year that has the shortest bike trip on average is day {day_of_year_with_shortest_trips}, which lasts about {avg_seconds} seconds")
        
avg_bike_trip_by_day_of_year_shortest(citibike_2019, 2019)
avg_bike_trip_by_day_of_year_shortest(citibike_2020, 2020)
avg_bike_trip_by_day_of_year_shortest(citibike_2021, 2021)

In 2019, the day of year that has the shortest bike trip on average is day 31, which lasts about 648.0 seconds
In 2020, the day of year that has the shortest bike trip on average is day 37, which lasts about 679.0 seconds
In 2021, the day of year that has the shortest bike trip on average is day 53.0, which lasts about 745.0 seconds


In [43]:
citibike_2019.head()

Unnamed: 0,trip_duration,started_at,ended_at,start_station_id,start_station_name,start_lat,start_lng,end_station_id,end_station_name,end_lat,end_lng,user_type,started_at_date,started_at_time,started_at_hour,started_at_day_of_week,ended_at_date,ended_at_time,ended_at_hour,ended_at_day_of_week,started_at_month,ended_at_month,started_at_day_of_year,ended_at_day_of_year
0,393,2019-08-01 00:00:01.468,2019-08-01 00:06:35.378,531.0,Forsyth St & Broome St,40.718939,-73.992663,408.0,Market St & Cherry St,40.710762,-73.994004,Subscriber,2019-08-01,00:00:01.468000,0,3,2019-08-01,00:06:35.378000,0,3,8,8,213,213
1,627,2019-08-01 00:00:01.929,2019-08-01 00:10:29.784,274.0,Lafayette Ave & Fort Greene Pl,40.686919,-73.976682,3409.0,Bergen St & Smith St,40.686744,-73.990632,Subscriber,2019-08-01,00:00:01.929000,0,3,2019-08-01,00:10:29.784000,0,3,8,8,213,213
2,1132,2019-08-01 00:00:04.048,2019-08-01 00:18:56.165,2000.0,Front St & Washington St,40.702551,-73.989402,3388.0,President St & Henry St,40.6828,-73.999904,Subscriber,2019-08-01,00:00:04.048000,0,3,2019-08-01,00:18:56.165000,0,3,8,8,213,213
3,1780,2019-08-01 00:00:04.163,2019-08-01 00:29:44.794,479.0,9 Ave & W 45 St,40.760193,-73.991255,473.0,Rivington St & Chrystie St,40.721101,-73.991925,Subscriber,2019-08-01,00:00:04.163000,0,3,2019-08-01,00:29:44.794000,0,3,8,8,213,213
4,1517,2019-08-01 00:00:05.458,2019-08-01 00:25:23.455,3312.0,1 Ave & E 94 St,40.781721,-73.94594,3312.0,1 Ave & E 94 St,40.781721,-73.94594,Subscriber,2019-08-01,00:00:05.458000,0,3,2019-08-01,00:25:23.455000,0,3,8,8,213,213


In [None]:
pd.merge(citibike_2019, citibike_2020)

In [None]:
citibike_2020.info()

In [None]:
citibike_2019['start_station_id'].value_counts()

In [None]:
citibike_2019.info()

In [None]:
citibike_2020.info()

In [None]:
citibike_2021.info()

In [None]:
citibike_2021['start_station_id'].value_counts()

In [None]:
citibike_2020['start_station_id'].value_counts()

In [None]:
citibike_2019['start_station_id'].value_counts()

In [None]:
# def datetime_hour(df, columns):
#     for column in columns:
#         df['hour'] = pd.to_datetime(df[column]).dt.hour
# columns = ['started_at', 'ended_at']

# datetime_hour(citibike_2019, columns)
# datetime_hour(citibike_2020, columns)
# datetime_hour(citibike_2021, columns)