## Imports

In [1]:
import requests
import bs4
import pandas as pd
from math import radians, cos, sin, asin, sqrt
import numpy as np

## Part 1: Data Preprocessing

### Uber Sample Data Cleaning

In [9]:
# Load the raw Uber rides sample from the local data directory.
uber_sample = pd.read_csv("data/uber_rides_sample.csv")

In [10]:
# Keep only trips whose pickup AND dropoff coordinates both lie inside the
# bounding box: longitude in [-74.242330, -73.717047] and latitude in
# [40.560445, 40.908524]. Series.between is inclusive on both ends, which
# matches a >= / <= pair of comparisons.
uber_sample = uber_sample[
    uber_sample['pickup_longitude'].between(-74.242330, -73.717047)
    & uber_sample['pickup_latitude'].between(40.560445, 40.908524)
    & uber_sample['dropoff_longitude'].between(-74.242330, -73.717047)
    & uber_sample['dropoff_latitude'].between(40.560445, 40.908524)
]

In [11]:
# Row count of uber_sample (meant to be read after the coordinate filter above).
len(uber_sample)

195472

In [12]:
# function for calculating distance given longitude and latitude of pickup and dropoff location
from math import radians, cos, sin, asin, sqrt
def cal_distance(lat1, lat2, long1, long2):
    """Return the great-circle distance between two points (Haversine formula).

    Mind the argument order: both latitudes come first, then both longitudes.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        long1: Longitude of the first point, in decimal degrees.
        long2: Longitude of the second point, in decimal degrees.

    Returns:
        float: The central angle between the points multiplied by 3956
        (presumably the Earth's radius in miles -- confirm the intended unit).
        NaN inputs propagate to a NaN result.
    """
    # convert degrees to radians
    long1 = radians(long1)
    long2 = radians(long2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    # apply the Haversine formula
    result = sin((lat2 - lat1) / 2)**2 + cos(lat1) * cos(lat2) * sin((long2 - long1) / 2)**2

    # Floating-point rounding can push the term a hair above 1 for
    # near-antipodal points, which would make asin() raise
    # "math domain error". An explicit comparison (rather than min())
    # leaves NaN untouched.
    if result > 1.0:
        result = 1.0

    result = 2 * asin(sqrt(result)) * 3956

    return result

In [88]:
# unit test for cal_distance
def cal_distance_test():
    """Check cal_distance against one known value, compared at two decimals."""
    expected = 86.74
    actual = round(cal_distance(40, 41, 70, 71), 2)
    assert actual == expected
cal_distance_test()

In [13]:
# Keep only the pickup timestamp and the four coordinate columns.
# reset_index(drop=True) renumbers the rows from 0 and discards the old index.
uber_cols = ['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
uber_sample_cleaned = uber_sample[uber_cols].reset_index(drop=True)

In [14]:
# Calculate the distance for each trip and add it to the dataframe.
# Columns are looked up by name rather than by integer position: positional
# lookups such as row[2] on a label-indexed Series are deprecated in recent
# pandas, and zipping the columns avoids building one Series per row.
# cal_distance takes (lat1, lat2, long1, long2), hence the argument order.
distance = [
    cal_distance(pickup_lat, dropoff_lat, pickup_long, dropoff_long)
    for pickup_lat, dropoff_lat, pickup_long, dropoff_long in zip(
        uber_sample_cleaned['pickup_latitude'],
        uber_sample_cleaned['dropoff_latitude'],
        uber_sample_cleaned['pickup_longitude'],
        uber_sample_cleaned['dropoff_longitude'],
    )
]
uber_sample_cleaned['distance'] = distance

In [15]:
# Save the cleaned data to CSV. With the default index=True the row index is
# also written, as an unnamed first column.
uber_sample_cleaned.to_csv("data/uber_sample_cleaned.csv")

### Count the Number of Records in Each Month

In [16]:
# pickup_datetime is handled as text: characters 0-3 are read as the year and
# characters 5-6 as the month, each converted to an integer column.
uber_sample_cleaned['year'] = uber_sample_cleaned['pickup_datetime'].str[:4].astype('int64')
uber_sample_cleaned['month'] = uber_sample_cleaned['pickup_datetime'].str[5:7].astype('int64')

In [17]:
# Map each (year, month) pair to its number of non-null pickup timestamps.
nrows_dict = (
    uber_sample_cleaned
    .groupby(['year', 'month'])['pickup_datetime']
    .count()
    .to_dict()
)

In [78]:
# Remove the helper year/month columns in place. Not re-runnable as-is: a
# second run raises KeyError because the columns are already gone.
uber_sample_cleaned.drop(columns=['year', 'month'], inplace=True)

### Yellow Taxi Data Cleaning

#### Download and Sampling Yellow Taxi Data

In [11]:
# indicator function: is the (year, month) of a trip inside the desired time range?
def isValidTime(year, month):
    """Return True when (year, month) falls inside the accepted window.

    Rejected: any year before 2009 or after 2015, and any month after June
    in 2015. The month is not otherwise range-checked.
    """
    return not (
        year > 2015
        or year < 2009
        or (year == 2015 and month > 6)
    )

In [12]:
# function to automatically scrape all the yellow taxi data within our desired time range
# the input parameter startpoint, a (year, month) tuple, is used to resume an unfinished download
def download_yellow_taxi(startpoint=None):
    """Download each monthly yellow taxi CSV and replace it with a random sample.

    Scrapes the TLC trip-record page for links titled "Yellow Taxi Trip
    Records". Every link accepted by ``isValidTime`` is downloaded to
    ``data/yellow_taxi/yt_<year>_<month>.csv``; the file is then overwritten
    with ``nrows_dict[(year, month)]`` randomly sampled rows. No
    ``random_state`` is passed, so the sample differs between runs.

    Args:
        startpoint: Optional ``(year, month)`` tuple. Links are skipped, in
            page order, until this month is reached. When falsy, every link
            is considered from the first one.

    Raises:
        requests.HTTPError: If the index page or a CSV download returns an
            error status.
        KeyError: If ``(year, month)`` is not a key of ``nrows_dict``.
        ValueError: From ``DataFrame.sample`` if a file has fewer rows than
            the requested sample size.
    """
    # use requests to get the html of the yellow taxi page
    response = requests.get("https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page")
    # fail loudly instead of parsing an error page
    response.raise_for_status()

    # use beautifulsoup to parse the html
    soup = bs4.BeautifulSoup(response.content, 'html.parser')

    # with a startpoint, stay idle until the matching link is seen
    start = not startpoint

    # extract all the urls that point to the csv files of yellow taxi trip records
    for row in soup.find_all(title="Yellow Taxi Trip Records"):
        url = row['href']
        # the url is expected to end in "YYYY-MM.csv"
        year = int(url[-11:-7])
        month = int(url[-6:-4])
        if (not start) and ((year, month) == startpoint):
            start = True
        if isValidTime(year, month) and start:
            # download the original csv file
            print(url)
            fpath = "data/yellow_taxi/yt_{}_{}.csv".format(year, month)
            # Stream to disk in chunks so a full month of trips is never held
            # in memory, and check the status before creating the file so an
            # error body is not saved as if it were CSV data.
            with requests.get(url, stream=True) as data:
                data.raise_for_status()
                with open(fpath, 'wb') as file:
                    for chunk in data.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
            print("Downloaded Successfully")

            # on_bad_lines='skip' replaces error_bad_lines=False, which was
            # deprecated in pandas 1.3 and removed in pandas 2.0
            temp = pd.read_csv(fpath, on_bad_lines='skip')
            temp = temp.sample(nrows_dict[(year, month)]).reset_index(drop=True)

            # overwrite the full download with the sample
            temp.to_csv(fpath)
            print("Written Successfully")

In [18]:
# Resume the download at April 2010; links listed before it on the page are skipped.
download_yellow_taxi((2010, 4))

https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-04.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-05.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-06.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-07.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-08.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-09.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-10.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-11.csv
Downloaded Successfully
Written Successfully
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2010-12.csv
D

In [66]:
# There are three different column-naming schemes among the CSVs. Each list
# below names the same seven fields in the same order (pickup time, dropoff
# time, pickup lon/lat, dropoff lon/lat, tip), so a frame selected with any of
# them can be relabelled with yt_cols_3.
yt_cols_1 = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 
             'dropoff_longitude', 'dropoff_latitude', 'tip_amount']
yt_cols_2 = ['Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 'Start_Lon', 'Start_Lat',
             'End_Lon', 'End_Lat', 'Tip_Amt']
yt_cols_3 = ['pickup_datetime', 'dropoff_datetime', 'pickup_longitude', 'pickup_latitude',
             'dropoff_longitude', 'dropoff_latitude', 'tip_amount']

In [147]:
def clean_yellow_taxi():
    """Load every sampled yellow taxi CSV, normalise it, and stack the months.

    For each ``(year, month)`` key of ``nrows_dict`` the file
    ``data/yellow_taxi/yt_<year>_<month>.csv`` is read, reduced to seven
    columns (relabelled to the names in ``yt_cols_3`` whichever naming scheme
    the file uses), filtered to the coordinate bounding box, and given a
    ``distance`` column computed with ``cal_distance``.

    Returns:
        pandas.DataFrame: All months concatenated with a fresh 0..n-1 index,
        or an empty DataFrame when ``nrows_dict`` has no keys.
    """
    # Collect the monthly frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration.
    monthly_frames = []
    for key in nrows_dict:
        # read in the yellow taxi csv for this month
        fpath = "data/yellow_taxi/yt_{}_{}.csv".format(key[0], key[1])
        temp = pd.read_csv(fpath)

        # strip the column names as some of them contain leading spaces
        temp.columns = [name.strip() for name in temp.columns]

        # select only necessary columns
        if 'tpep_pickup_datetime' in temp.columns:
            temp = temp[yt_cols_1]
        elif 'Trip_Pickup_DateTime' in temp.columns:
            temp = temp[yt_cols_2]
        else:
            temp = temp[yt_cols_3]

        # set the column names to be the same
        temp.columns = yt_cols_3

        # filter out invalid latitude and longitude (bounds are inclusive)
        in_bounds = (
            temp['pickup_longitude'].between(-74.242330, -73.717047)
            & temp['pickup_latitude'].between(40.560445, 40.908524)
            & temp['dropoff_longitude'].between(-74.242330, -73.717047)
            & temp['dropoff_latitude'].between(40.560445, 40.908524)
        )
        # .copy() so the distance column below is written to an independent
        # frame rather than a slice (avoids SettingWithCopyWarning)
        temp = temp[in_bounds].copy()

        # calculate distance; columns are addressed by name because positional
        # integer lookups on a label-indexed row are deprecated in recent
        # pandas. cal_distance takes (lat1, lat2, long1, long2).
        temp['distance'] = [
            cal_distance(pickup_lat, dropoff_lat, pickup_long, dropoff_long)
            for pickup_lat, dropoff_lat, pickup_long, dropoff_long in zip(
                temp['pickup_latitude'],
                temp['dropoff_latitude'],
                temp['pickup_longitude'],
                temp['dropoff_longitude'],
            )
        ]

        monthly_frames.append(temp)

    # pd.concat raises on an empty list, so handle the no-data case explicitly
    if not monthly_frames:
        return pd.DataFrame()

    # combine the dataframes from each month together
    return pd.concat(monthly_frames).reset_index(drop=True)

In [148]:
# Build the combined, cleaned yellow taxi sample from the per-month CSVs on disk.
yt_sample_cleaned = clean_yellow_taxi()

In [150]:
# Save the cleaned yellow taxi sample; the default index=True also writes the
# row index as an unnamed first column.
yt_sample_cleaned.to_csv("data/yt_sample_cleaned.csv")

### Weather Data

In [161]:
def clean_weather():
    """Combine the yearly weather CSVs and drop rows dated after June 2015.

    Reads ``data/<year>_weather.csv`` for 2009 through 2015, keeps the
    ``DATE``, ``HourlyPrecipitation`` and ``HourlyWindSpeed`` columns, stacks
    the years under a fresh 0..n-1 index, then removes every row whose
    ``DATE`` is in 2015 with a month later than June.

    Returns:
        pandas.DataFrame: The remaining rows. ``DATE`` keeps the values read
        from the CSV (it is parsed only to build the filter), and the index
        keeps the labels assigned before filtering, so it can contain gaps.
    """
    weather_cols = ['DATE', 'HourlyPrecipitation', 'HourlyWindSpeed'] # necessary columns

    # Read the csv of each year, keep only the necessary columns, and combine
    # them with a single concat. infer_datetime_format is not passed: without
    # parse_dates it has no effect, and the keyword is deprecated in pandas 2.
    yearly_frames = [
        pd.read_csv("data/{}_weather.csv".format(year))[weather_cols]
        for year in range(2009, 2016)
    ]
    weather_cleaned = pd.concat(yearly_frames).reset_index(drop=True)

    # Filter out data after 2015-06. The whole column is parsed once instead
    # of calling pd.to_datetime on every row. Unparseable-as-missing dates
    # (NaT) compare False against 2015 and are therefore kept.
    dates = pd.to_datetime(weather_cleaned['DATE'])
    after_cutoff = (dates.dt.year == 2015) & (dates.dt.month > 6)

    # .copy() returns an independent frame so later column assignments by the
    # caller do not trigger SettingWithCopyWarning
    return weather_cleaned[~after_cutoff].copy()

In [162]:
# Build the combined weather table (2009 through June 2015) from the yearly CSVs.
weather_cleaned = clean_weather()

### draft

In [138]:
# Scratch: echoes whatever `month` currently holds in the kernel.
# NOTE(review): `month` is not assigned at top level in the visible cells, so
# this depends on leftover kernel state and fails on a fresh run -- confirm or delete.
month

'2015-05-07 19:52:06 UTC'

In [32]:
# Scratch: preview the first rows of ytd_2021_01.
# NOTE(review): ytd_2021_01 is not created in any visible cell -- confirm where it is loaded.
ytd_2021_01.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1.0,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1.0,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1.0,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1.0,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2.0,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1.0,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [34]:
import curl

In [36]:
# Location IDs to exclude; the cells below filter PULocationID / DOLocationID against this set.
# NOTE(review): what these IDs represent is not shown here -- confirm the source of the list.
invalid_loc = {27, 101, 44, 204, 5, 84, 109, 110, 176}

In [48]:
# Columns kept from the location-ID based trip file (pickup/dropoff time and
# location ID, trip distance, tip amount).
yt_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance', 'tip_amount']

In [82]:
# Narrow ytd_2021_01 to the yt_cols columns (rebinds the name; the wider frame is discarded).
ytd_2021_01 = ytd_2021_01[yt_cols]

In [89]:
# Show the rows whose pickup location ID is NOT in invalid_loc (displayed only, not assigned).
ytd_2021_01[~ytd_2021_01['PULocationID'].isin(invalid_loc)]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,tip_amount
0,2021-01-01 00:30:10,2021-01-01 00:36:12,142,43,2.10,0.00
1,2021-01-01 00:51:20,2021-01-01 00:52:19,238,151,0.20,0.00
2,2021-01-01 00:43:30,2021-01-01 01:11:06,132,165,14.70,8.65
3,2021-01-01 00:15:48,2021-01-01 00:31:01,138,132,10.60,6.05
4,2021-01-01 00:31:49,2021-01-01 00:48:21,68,33,4.94,4.06
...,...,...,...,...,...,...
1369760,2021-01-25 08:32:04,2021-01-25 08:49:32,135,82,8.80,0.00
1369761,2021-01-25 08:34:00,2021-01-25 09:04:00,42,161,5.86,0.00
1369762,2021-01-25 08:37:00,2021-01-25 08:53:00,14,106,4.45,0.00
1369763,2021-01-25 08:28:00,2021-01-25 08:50:00,175,216,10.04,0.00


In [90]:
# Show the rows whose dropoff location ID is NOT in invalid_loc (displayed only, not assigned).
ytd_2021_01[~ytd_2021_01['DOLocationID'].isin(invalid_loc)]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,tip_amount
0,2021-01-01 00:30:10,2021-01-01 00:36:12,142,43,2.10,0.00
1,2021-01-01 00:51:20,2021-01-01 00:52:19,238,151,0.20,0.00
2,2021-01-01 00:43:30,2021-01-01 01:11:06,132,165,14.70,8.65
3,2021-01-01 00:15:48,2021-01-01 00:31:01,138,132,10.60,6.05
4,2021-01-01 00:31:49,2021-01-01 00:48:21,68,33,4.94,4.06
...,...,...,...,...,...,...
1369760,2021-01-25 08:32:04,2021-01-25 08:49:32,135,82,8.80,0.00
1369761,2021-01-25 08:34:00,2021-01-25 09:04:00,42,161,5.86,0.00
1369762,2021-01-25 08:37:00,2021-01-25 08:53:00,14,106,4.45,0.00
1369763,2021-01-25 08:28:00,2021-01-25 08:50:00,175,216,10.04,0.00
