In [1]:
import requests
import zipfile

import pandas as pd

from io import BytesIO, StringIO

%matplotlib inline

## First things first... Download the data

Data comes from https://www.citibikenyc.com/system-data

In [2]:
# Citi Bike publishes its monthly trip archives under this S3 prefix
base_url = "https://s3.amazonaws.com/tripdata/"

# March archives for the two years being compared
citibike_2019 = f"{base_url}201903-citibike-tripdata.csv.zip"
citibike_2020 = f"{base_url}202003-citibike-tripdata.csv.zip"

In [3]:
def download_citibike_zipfile(url):
    """
    Download a zip archive and load the CSV it contains into a DataFrame.

    Parameters
    ----------
    url : str
        Address of a zip archive holding a trip-data CSV.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV member of the archive.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the downloaded archive contains no files.
    """
    # Download data from web. Check the status first: otherwise an error page
    # would be handed to zipfile and surface as a confusing BadZipFile.
    response = requests.get(url)
    response.raise_for_status()

    # Context managers make sure the archive and the member handle are closed
    with zipfile.ZipFile(BytesIO(response.content)) as citi_zip:
        members = citi_zip.namelist()
        if not members:
            raise ValueError(f"no files found in archive downloaded from {url}")

        # Archives built on macOS can carry `__MACOSX/` metadata entries, and
        # those may come first in the listing. Prefer a real CSV member and
        # fall back to the first entry only when nothing looks like a CSV.
        csv_members = [
            name
            for name in members
            if name.lower().endswith(".csv") and not name.startswith("__MACOSX")
        ]
        fn = csv_members[0] if csv_members else members[0]

        # Read into a dataframe
        with citi_zip.open(fn) as citi_data:
            df = pd.read_csv(citi_data)

    return df


In [4]:
# Fetch and parse the 201903 archive (requires network access)
df_2019 = download_citibike_zipfile(citibike_2019)

In [5]:
# Fetch and parse the 202003 archive (requires network access)
df_2020 = download_citibike_zipfile(citibike_2020)

## What is in our data?

In [6]:
# (rows, columns) of the March 2019 frame
df_2019.shape

(1327960, 15)

In [7]:
# Column dtypes as parsed by read_csv; note starttime/stoptime are plain
# `object` columns here, not datetimes
df_2019.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id           float64
start station name          object
start station latitude     float64
start station longitude    float64
end station id             float64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

In [8]:
# Peek at the first few rides
df_2019.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,1463,2019-03-01 00:00:16.0970,2019-03-01 00:24:39.3880,319.0,Fulton St & Broadway,40.711066,-74.009447,347.0,Greenwich St & W Houston St,40.728846,-74.008591,35618,Subscriber,1989,1
1,285,2019-03-01 00:00:32.3850,2019-03-01 00:05:18.1830,439.0,E 4 St & 2 Ave,40.726281,-73.98978,150.0,E 2 St & Avenue C,40.720874,-73.980858,31113,Subscriber,1980,1
2,686,2019-03-01 00:00:47.7970,2019-03-01 00:12:14.3090,526.0,E 33 St & 5 Ave,40.747659,-73.984907,3474.0,6 Ave & Spring St,40.725256,-74.004121,19617,Subscriber,1987,1
3,442,2019-03-01 00:01:01.2090,2019-03-01 00:08:23.7510,3474.0,6 Ave & Spring St,40.725256,-74.004121,355.0,Bayard St & Baxter St,40.716021,-73.999744,27086,Subscriber,1987,2
4,2913,2019-03-01 00:01:09.2810,2019-03-01 00:49:42.3810,379.0,W 31 St & 7 Ave,40.749156,-73.9916,212.0,W 16 St & The High Line,40.743349,-74.006818,34791,Subscriber,1991,1


In [9]:
# Parse the string timestamps into real datetime columns on both frames
for trips_df in (df_2019, df_2020):
    trips_df["start_dt"] = pd.to_datetime(trips_df["starttime"])
    trips_df["end_dt"] = pd.to_datetime(trips_df["stoptime"])

In [10]:
# Confirm the new start_dt / end_dt columns were parsed as datetimes
df_2019.dtypes

tripduration                        int64
starttime                          object
stoptime                           object
start station id                  float64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                    float64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                              int64
start_dt                   datetime64[ns]
end_dt                     datetime64[ns]
dtype: object

## Questions to ask the data

The point of data is to learn something... What types of questions can we ask the data?

In each example, begin with the raw DataFrames `df_2019` and `df_2020`.

**More bike rides in March 2019 or March 2020?** (Warmup)


In [11]:
# Each row is one ride and the frame only holds March data, so the row count
# (first element of .shape) is the number of rides that month
df_2019.shape

(1327960, 17)

In [12]:
# Same check for March 2020: the first element of .shape is the ride count
df_2020.shape

(1068457, 17)

In [None]:
# More rides in March 2019: 1,327,960 vs. 1,068,457 in March 2020

**Which bike saw the most use in 2019?**

In [58]:
def tripduration_summary_by_col(df, col):
    """
    Summarise `tripduration` for every distinct value of `col`.

    Returns a DataFrame indexed by `col` with one column per statistic:
    "mean", "count" and "sum".
    """
    aggregations = ["mean", "count", "sum"]
    summary = df.pivot_table(
        index=col,
        values="tripduration",
        aggfunc=aggregations,
    )
    # pivot_table returns two-level (statistic, "tripduration") column labels;
    # replace them with the bare statistic names
    summary.columns = aggregations
    return summary

In [59]:
def bike_summary(df):
    """Trip-duration summary (mean, count, sum) for each `bikeid` in `df`."""
    per_bike = tripduration_summary_by_col(df, col="bikeid")
    return per_bike

In [60]:
# Rank bikes by number of rides in the 2019 frame and show the top three
bike_summary(df_2019)["count"].nlargest(3)

bikeid
35641    545
36624    503
36696    497
Name: count, dtype: int64

In [61]:
# bike 35641, by a good margin

**Which bike saw the most use in 2020?**


In [62]:
# Rank bikes by number of rides in the 2020 frame and show the top three
bike_summary(df_2020)["count"].nlargest(3)

bikeid
37042    384
37088    369
37078    368
Name: count, dtype: int64

In [63]:
# bike 37042

**Which stations saw the biggest change in rides?**

* Which stations saw the biggest decrease?
* Which stations saw the biggest increase?

In [64]:
def station_summary(df):
    """Trip-duration summary (mean, count, sum) for each start station in `df`."""
    per_station = tripduration_summary_by_col(df, col="start station name")
    return per_station

In [65]:
# 2020 minus 2019 for every summary statistic, aligned on start station name.
# A station present in only one of the two frames yields NaN and is dropped.
start_station_diffs = (
    station_summary(df_2020)
    .sub(station_summary(df_2019))
    .dropna()
)

In [66]:
# Sort by the change in ride count, then show the three largest decreases
# (first three rows) and the three largest increases (last three rows)
start_station_diffs.sort_values("count").iloc[[0,1,2,-3,-2,-1], :]

Unnamed: 0_level_0,mean,count,sum
start station name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pershing Square North,629.11845,-4210.0,1104598.0
8 Ave & W 31 St,68.343217,-3279.0,-2085035.0
Broadway & E 22 St,59.102021,-2811.0,-1827734.0
12 Ave & W 40 St,299.15884,754.0,2404501.0
FDR Drive & E 35 St,290.859944,884.0,1856287.0
E 12 St & 3 Ave,212.5164,1017.0,1221346.0


In [None]:
# Pershing Square North originated 4,210 FEWER rides in March 2020 than in March 2019
# E 12 St & 3 Ave originated 1,017 MORE rides

**Which station saw the longest rides on average, by distance (include data from both years)?**

In [122]:
import numpy as np

# Stack both months into one frame. reset_index() (without drop=True) gives a
# fresh 0..n-1 row index and keeps each frame's original row labels in a new
# "index" column.
all_data = pd.concat([df_2019, df_2020]).reset_index()

In [95]:
# Install geopy for distance calculation
# !pip install geopy
# from geopy import distance
# distance.distance((lat, lon), (lat, lon)).km

# NOTE from Spencer: I found that this took a very long time
# while the code was running I googled for how to approximate this
# distance, wrote the python function below, and did the calculation
# before geopy finished...
# after it finished the answers were the same up to 2 decimal places --
# good enough for what we are doing

In [123]:
def compute_distance_in_km(lat1, lon1, lat2, lon2, radius_km=6373.0):
    """
    Approximate great-circle distance between two (lat, lon) points,
    computed with the haversine formula.

    It assumes the earth is a perfect sphere, so it should only be used
    for distance calculations between two points fairly close to one
    another.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_km : float, optional
        Sphere radius; the result is in the same unit. Defaults to an
        approximate earth radius of 6373 km.

    Returns
    -------
    float or array-like
        Elementwise distance between the two points.
    """
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN. Clamp it.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c

In [124]:
# Pull the four coordinate columns, then compute the straight-line distance
# for every trip in one vectorised call
lat1, lon1, lat2, lon2 = (
    all_data[column]
    for column in (
        "start station latitude",
        "start station longitude",
        "end station latitude",
        "end station longitude",
    )
)
all_data["distance_traveled"] = compute_distance_in_km(lat1, lon1, lat2, lon2)

In [125]:
# Average straight-line trip distance per start station, sorted ascending so
# the station with the longest average ride is the last row.
# "sum" and "count" are computed as well, but only the "mean" column is kept.
(
    all_data
    .pivot_table(index="start station name", values="distance_traveled", aggfunc=["mean", "sum", "count"])
    .loc[:, ("mean", "distance_traveled")]
    .sort_values()
)

start station name
MTL-AOS-5.1                                      0.000000
Bayard St & Leonard St                           1.215724
Columbia St & W 9 St                             1.216646
Henry St & Degraw St                             1.250702
3 St & Prospect Park West                        1.260312
                                                   ...   
Riverside Dr & W 104 St                          3.636671
39 St & 2 Ave - Citi Bike HQ at Industry City    3.736155
NYCBS Depot - GOW                                3.993949
12 Ave & W 125 St                                4.414386
58th St Depot                                    6.976548
Name: (mean, distance_traveled), Length: 929, dtype: float64

In [126]:
# the 58th St Depot had the longest average ride at almost 7 km

**Which two stations saw the highest speed ridden between them?**

Compute this assuming the distance is "as the crow flies".

In [127]:
# Elapsed ride time in hours, taken from the parsed start/end timestamps
all_data["trip_duration_hours"] = (
    (all_data["end_dt"] - all_data["start_dt"]) / np.timedelta64(1, "h")
)
# Straight-line speed: distance between stations over elapsed time
all_data["trip_speed_km_h"] = (
    all_data["distance_traveled"] / all_data["trip_duration_hours"]
)

In [128]:
# Full record of the single trip with the highest computed speed
all_data.loc[all_data["trip_speed_km_h"].idxmax(), :]

index                                          110867
tripduration                                       62
starttime                    2020-03-03 11:17:32.6150
stoptime                     2020-03-03 11:18:35.4820
start station id                                477.0
start station name                    W 41 St & 8 Ave
start station latitude                      40.756405
start station longitude                    -73.990026
end station id                                 3798.0
end station name                      W 40 St & 5 Ave
end station latitude                        40.752269
end station longitude                      -73.982079
bikeid                                          38915
usertype                                   Subscriber
birth year                                       1988
gender                                              1
start_dt                   2020-03-03 11:17:32.615000
end_dt                     2020-03-03 11:18:35.482000
distance_traveled           

In [None]:
# Looks like the fastest trip was between W 41 St & 8 Ave and W 40 St & 5 Ave.
# It was a *very* quick trip, lasting only 1 minute 3 seconds.
# The rider traveled 0.81 km in that time -- roughly 46 km/h!

**What else can we try to do?**