In [19]:
import pandas as pd
import datetime
import time
from scipy.spatial import distance
import requests

In [20]:
# data cleaning & standardizing functions (from exploratory data analysis notebook) - to be used on each data set
# downloaded from the Citi bike website
def getDist(row):
    """Return a speed-like value for a single trip record.

    Despite the name, the result is not a distance: the cityblock
    (Manhattan) distance between the start and end station coordinates is
    multiplied by 69 and by 3600, then divided by the trip duration.

    Parameters
    ----------
    row : mapping
        Must provide "start station latitude", "start station longitude",
        "end station latitude", "end station longitude" and "tripduration".

    Returns
    -------
    The scaled distance divided by ``row["tripduration"]``, as a scalar.

    NOTE(review): 69 is presumably miles per degree and 3600 presumably
    converts a duration in seconds to hours (giving mph) - confirm.
    """
    origin = [row["start station latitude"], row["start station longitude"]]
    destination = [row["end station latitude"], row["end station longitude"]]
    duration = row["tripduration"]

    # cdist compares collections of points, so each point is wrapped in a
    # list; the result is a 1x1 matrix holding the single pairwise distance.
    coord_dist = distance.cdist([origin], [destination], 'cityblock')
    scaled = coord_dist * 69 * 3600 / duration
    return scaled[0][0]

def getRiderAge(df):
    """Add a "rider_age" column to ``df`` in place.

    The age is the current calendar year minus the value in the
    "birth year" column. Missing birth years stay missing in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric "birth year" column; it is modified in place.

    Returns
    -------
    None
    """
    # Read the clock once: every row is measured against the same reference
    # year, and the subtraction runs as a single vectorized operation
    # instead of one Python call per row.
    current_year = datetime.datetime.now().year
    df["rider_age"] = current_year - df["birth year"]
    
def getAvgSpeed(df):
    """Add an "avg_speed" column to ``df`` in place.

    Each row of ``df`` is passed to ``getDist`` and the returned value is
    stored in the new column.
    """
    # axis=1 makes apply walk the frame row by row, handing getDist one
    # row at a time.
    per_row_speed = df.apply(getDist, axis=1)
    df["avg_speed"] = per_row_speed
    
#start to make a cleaning/organizing function for new data frame imports
def addCols(df):
    """Add the derived columns to ``df`` in place.

    Runs ``getRiderAge`` first and ``getAvgSpeed`` second, so "rider_age"
    is added before "avg_speed".
    """
    # TODO (not implemented yet): after the age column exists, remove all
    # riders older than about 80 - not really a huge demographic for
    # cycling anyway.
    for add_column in (getRiderAge, getAvgSpeed):
        add_column(df)

In [31]:
#uses the "requests" python library and a custom, formatted url string 
#to get a range of csvs downloaded, with the "start" and "end" parameters in the 
#form of [month, year], with months numbered 1 through 12 and year being 2017 or later
def getCSVZips(start, end):
    """Build the list of monthly trip-data archive URLs from start to end.

    Parameters
    ----------
    start : sequence
        ``[month, year]`` of the first archive, month numbered 1 through 12.
    end : sequence
        ``[month, year]`` of the last archive (inclusive), same layout.

    Returns
    -------
    list of str
        One URL per month, in chronological order, both endpoints included.

    Raises
    ------
    ValueError
        If either month is outside 1-12, or if ``end`` is earlier than
        ``start``. Previously these inputs made the function fall off the
        end of its loops and silently return ``None`` (or emit URLs for a
        non-existent month such as "00").
    """
    start_month, start_year = start[0], start[1]
    end_month, end_year = end[0], end[1]

    for label, month in (("start", start_month), ("end", end_month)):
        if not 1 <= month <= 12:
            raise ValueError(
                "{} month must be between 1 and 12, got {!r}".format(label, month)
            )
    # Tuples compare year first, then month, which is chronological order.
    if (end_year, end_month) < (start_year, start_month):
        raise ValueError(
            "end {!r} must not be earlier than start {!r}".format(list(end[:2]), list(start[:2]))
        )

    url_template = "https://s3.amazonaws.com/tripdata/{}{:0>2d}-citibike-tripdata.csv.zip"

    # array to return with url strings
    output = []
    for year in range(start_year, end_year + 1):
        # Only the first and last years are partial; years in between run
        # January through December.
        first_month = start_month if year == start_year else 1
        last_month = end_month if year == end_year else 12
        for month in range(first_month, last_month + 1):
            output.append(url_template.format(year, month))
    return output

# Quick check of the helper: display the URLs generated for January 2018 through January 2020.
getCSVZips([1, 2018], [1, 2020])

['https://s3.amazonaws.com/tripdata/201801-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201802-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201803-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201804-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201805-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201806-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201807-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201808-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201809-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201810-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201811-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201812-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201901-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201902-citibike-tripdata.csv.zip',
 'http

In [35]:
# Build the list of archive URLs for the past 3 years (August 2017 through
# August 2020) to look for seasonal trends.
url_list = getCSVZips([8, 2017], [8, 2020])

# Only the first archive in the list is downloaded in this cell.
first_url = url_list[0]
response = requests.get(first_url, allow_redirects=True)
# Stop on an HTTP error instead of saving an error page under a .zip name.
response.raise_for_status()

# The context manager closes the file handle even if the write fails;
# the file keeps the archive's own name from the end of the URL.
with open("../raw_data/" + first_url.split("/")[-1], 'wb') as zip_file:
    bytes_written = zip_file.write(response.content)

# Keep the byte count as the cell's displayed value, as before.
bytes_written

63277904