In [3]:
import pandas as pd
import datetime
import time
from scipy.spatial import distance
import requests

In [4]:
# data cleaning & standardizing functions (from exploratory data analysis notebook) - to be used on each data set
# downloaded from the Citi bike website
def getDist(row):
    """Return a speed estimate for one trip from its station coordinates.

    The cityblock (Manhattan) distance between the start and end station
    coordinates is multiplied by 69 and by 3600, then divided by
    ``row["tripduration"]``. The same factor of 69 is applied to the
    latitude and the longitude difference.

    NOTE(review): presumably 69 approximates miles per degree and
    ``tripduration`` is in seconds, which would make the result mph — confirm.

    row: any mapping/Series indexable by the station coordinate column names
         and "tripduration".
    """
    origin = (row["start station latitude"], row["start station longitude"])
    destination = (row["end station latitude"], row["end station longitude"])
    # cdist returns a 1x1 matrix for a single pair of points; take its only entry
    degrees_apart = distance.cdist([origin], [destination], 'cityblock')[0][0]
    return degrees_apart * 69 * 3600 / row["tripduration"]

def getRiderAge(df):
    """Add a ``rider_age`` column to ``df`` and remove riders over 80, in place.

    ``rider_age`` is the current calendar year minus the "birth year" column.
    Rows whose age is not ``<= 80`` are dropped from ``df`` itself; this
    includes rows with a missing birth year, because a NaN age fails the
    ``<= 80`` comparison.

    The frame is modified in place (nothing is returned) because callers in
    this notebook ignore the return value; rebinding a local name, as the
    previous version did, left the caller's frame unfiltered.

    df: DataFrame with a numeric "birth year" column.
    """
    # read the clock once instead of once per row
    current_year = datetime.datetime.now().year
    df["rider_age"] = current_year - df["birth year"]
    #remove "outlier" values over ~80 years old
    outside_range = ~(df["rider_age"] <= 80)
    # drop works by index label, so this assumes labels are unique
    # (true for the default index produced by pd.read_csv)
    df.drop(index=df.index[outside_range], inplace=True)
    
def getAvgSpeed(df):
    """Add an ``avg_speed`` column to ``df`` in place.

    Uses the same formula as ``getDist``: the cityblock distance between the
    start and end station coordinates, times 69, times 3600, divided by
    "tripduration". It is computed on whole columns at once rather than with
    a row-by-row ``apply``, which matters for the multi-million-row monthly
    files and also works on an empty frame.

    df: DataFrame with the four station coordinate columns and "tripduration".
    """
    lat_delta = (df["start station latitude"] - df["end station latitude"]).abs()
    lon_delta = (df["start station longitude"] - df["end station longitude"]).abs()
    # same operation order as getDist: distance * 69 * 3600 / duration
    df["avg_speed"] = (lat_delta + lon_delta) * 69 * 3600 / df["tripduration"]
    
#start to make a cleaning/organizing function for new data frame imports
def addCols(df):
    """Apply the standard cleaning steps to a freshly imported trip frame, in place.

    Runs ``getRiderAge`` and ``getAvgSpeed`` on ``df`` and then removes
    duplicate rows from ``df`` itself. Nothing is returned; callers keep using
    the frame they passed in.

    df: DataFrame with the columns required by ``getRiderAge`` and ``getAvgSpeed``.
    """
    getRiderAge(df)
    getAvgSpeed(df)
    #drop duplicates in case there are any
    # inplace is required here: assigning the result to the local name `df`
    # would leave the caller's frame untouched
    df.drop_duplicates(inplace=True)

In [5]:
#uses the "requests" python library and a custom, formatted url string 
#to get a range of csvs downloaded, with the "start" and "end" parameters in the 
#form of [month, year], with months numbered 1 through 12 and year being 2017 or later
def getCSVZips(start, end):
    """Build the list of monthly trip-data zip urls from ``start`` to ``end`` inclusive.

    start, end: sequences of the form [month, year], months numbered 1-12.
    Returns a list of url strings, one per month, in chronological order.
    Raises ValueError if a month is outside 1-12 or if ``end`` is earlier
    than ``start`` (previously these inputs fell off the end of the loop and
    returned None).
    """
    start_month, start_year = start[0], start[1]
    end_month, end_year = end[0], end[1]

    for label, month in (("start", start_month), ("end", end_month)):
        if not 1 <= month <= 12:
            raise ValueError("{} month must be between 1 and 12, got {}".format(label, month))
    # compare as (year, month) so the ordering is chronological
    if (end_year, end_month) < (start_year, start_month):
        raise ValueError("end {} is earlier than start {}".format(list(end), list(start)))

    #array to return with url strings
    output = []
    for year in range(start_year, end_year + 1):
        # only the first and last year are partial; years in between run 1-12
        first_month = start_month if year == start_year else 1
        last_month = end_month if year == end_year else 12
        for month in range(first_month, last_month + 1):
            curr_url = "https://s3.amazonaws.com/tripdata/{}{:0>2d}-citibike-tripdata.csv.zip".format(year, month)
            output.append(curr_url)
    return output

# quick check of the url builder: January 2018 through January 2020
getCSVZips(start=[1, 2018], end=[1, 2020])

['https://s3.amazonaws.com/tripdata/201801-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201802-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201803-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201804-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201805-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201806-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201807-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201808-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201809-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201810-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201811-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201812-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201901-citibike-tripdata.csv.zip',
 'https://s3.amazonaws.com/tripdata/201902-citibike-tripdata.csv.zip',
 'http

In [35]:
#now get the data for the past 3 years to look for seasonal trends
url_list = getCSVZips([8, 2017], [8, 2020])

# try a single archive first (the earliest month) before downloading the rest
response = requests.get(url_list[0], allow_redirects=True, timeout=60)
# fail loudly on an HTTP error instead of saving the error page as a .zip
response.raise_for_status()
with open("../raw_data/" + url_list[0].split("/")[-1], 'wb') as zip_file:
    bytes_written = zip_file.write(response.content)
# show the number of bytes written as the cell output
bytes_written

63277904

In [37]:
#seemed to work, so download every monthly archive from 08/2017 through 08/2019
# NOTE(review): the combining cell further down reads files through 2020-07,
# which this range does not fetch — confirm the remaining months are downloaded elsewhere
url_last_year = getCSVZips([8, 2017], [8, 2019])

for url in url_last_year:
    response = requests.get(url, allow_redirects=True, timeout=60)
    # stop on an HTTP error rather than writing an error body to disk as a .zip
    response.raise_for_status()
    # the with-block closes each file before the next download starts
    with open("../raw_data/" + url.split("/")[-1], 'wb') as zip_file:
        zip_file.write(response.content)

In [13]:
#file name (last path segment of the url) for every archive in the three-year period
file_list = [url.split("/")[-1] for url in getCSVZips([8, 2017], [8, 2020])]
file_list

['201708-citibike-tripdata.csv.zip',
 '201709-citibike-tripdata.csv.zip',
 '201710-citibike-tripdata.csv.zip',
 '201711-citibike-tripdata.csv.zip',
 '201712-citibike-tripdata.csv.zip',
 '201801-citibike-tripdata.csv.zip',
 '201802-citibike-tripdata.csv.zip',
 '201803-citibike-tripdata.csv.zip',
 '201804-citibike-tripdata.csv.zip',
 '201805-citibike-tripdata.csv.zip',
 '201806-citibike-tripdata.csv.zip',
 '201807-citibike-tripdata.csv.zip',
 '201808-citibike-tripdata.csv.zip',
 '201809-citibike-tripdata.csv.zip',
 '201810-citibike-tripdata.csv.zip',
 '201811-citibike-tripdata.csv.zip',
 '201812-citibike-tripdata.csv.zip',
 '201901-citibike-tripdata.csv.zip',
 '201902-citibike-tripdata.csv.zip',
 '201903-citibike-tripdata.csv.zip',
 '201904-citibike-tripdata.csv.zip',
 '201905-citibike-tripdata.csv.zip',
 '201906-citibike-tripdata.csv.zip',
 '201907-citibike-tripdata.csv.zip',
 '201908-citibike-tripdata.csv.zip',
 '201909-citibike-tripdata.csv.zip',
 '201910-citibike-tripdata.csv.zip',
 

In [None]:
#loop over list and make a .csv output file for each "year" by time duration - 12 files combined
# NOTE(review): file_list starts at 201708, so each output named "09-<year>" actually
# begins with August data; the name is kept as-is so existing readers of these files
# keep working — confirm whether it should be 8
start_month = 9
FILES_PER_YEAR = 12

# Walk file_list in consecutive 12-file windows without mutating it, so the cell can be
# re-run. Only complete windows are processed; a trailing partial year is left out
# (the previous fixed range(2017, 2021) ran out of files on its fourth pass).
for first in range(0, len(file_list) - FILES_PER_YEAR + 1, FILES_PER_YEAR):
    year_files = file_list[first:first + FILES_PER_YEAR]
    # label the output with the calendar year of the window's first file (YYYYMM-...)
    year = int(year_files[0][0:4])
    monthly_dfs = []
    for curr_file in year_files:
        curr_df = pd.read_csv("../raw_data/"+curr_file)
        addCols(curr_df)
        print("{} done!".format(curr_file[0:6]))
        monthly_dfs.append(curr_df)
    # one concat per year instead of DataFrame.append in a loop
    # (append re-copies the growing frame each time and was removed in pandas 2.x)
    full_df = pd.concat(monthly_dfs, ignore_index=True)
    full_df.to_csv("../raw_data/year_combined/{:0>2d}-{}.csv".format(start_month, year), index=False)
    print("{} csv output".format(year))
        

201708 done!
201709 done!
201710 done!
201711 done!
201712 done!
201801 done!
201802 done!
201803 done!
201804 done!
201805 done!
201806 done!
201807 done!
2017 csv output
201808 done!
201809 done!
201810 done!
201811 done!
201812 done!
201901 done!
201902 done!
201903 done!
201904 done!
201905 done!
201906 done!
201907 done!
2018 csv output
201908 done!
201909 done!
201910 done!
201911 done!
201912 done!
202001 done!
202002 done!
202003 done!
202004 done!
202005 done!
202006 done!
202007 done!
2019 csv output
