# Data initialising

**1. Download raw data for Yellow taxi**

In [1]:
from urllib.request import urlretrieve
import os

# all data lives one level above the current directory, in `../data/`
output_relative_dir = '../data/'

# exist_ok=True makes directory creation safe to re-run: no error is raised
# when the directory is already there, so no separate existence check is needed
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, we will create the paths
for target_dir in ('curated', 'raw'): # taxi_zones should already exist
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

# select the time period we want to address
year = '2022'
months = range(1,4)

# this is the URL template as of 01/2022
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"

# the raw trip files are written to `../data/raw`
tlc_output_dir = output_relative_dir + 'raw'

for month_number in months:
    # 0-fill i.e 1 -> 01, 2 -> 02, etc
    month = str(month_number).zfill(2)
    print(f"Begin month {month}")

    # generate url
    url = f'{URL_TEMPLATE}{year}-{month}.parquet'
    # generate the output file path (directory + filename)
    output_path = f"{tlc_output_dir}/{year}-{month}.parquet"
    # download the parquet file to disk
    urlretrieve(url, output_path)

    print(f"Completed month {month}")

Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03


**2. Read the data**

In [1]:
import numpy as np
import pandas as pd
# Load each monthly trip file (2022-01 to 2022-03) from the raw data folder
df1, df2, df3 = (
    pd.read_parquet(raw_path)
    for raw_path in (
        '../data/raw/2022-01.parquet',
        '../data/raw/2022-02.parquet',
        '../data/raw/2022-03.parquet',
    )
)
# Stack the three months into one frame; ignore_index=True renumbers the rows
monthly_frames = [df1, df2, df3]
df = pd.concat(monthly_frames, ignore_index=True)
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.80,1.0,N,142,236,1,14.50,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.10,1.0,N,236,42,1,8.00,0.5,0.5,4.00,0.0,0.3,13.30,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.50,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.00,0.5,0.5,0.00,0.0,0.3,11.80,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.30,1.0,N,68,163,1,23.50,0.5,0.5,3.00,0.0,0.3,30.30,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9071239,2,2022-03-31 23:45:00,2022-04-01 00:01:00,,3.57,,,48,262,0,18.33,0.0,0.5,1.00,0.0,0.3,22.63,,
9071240,2,2022-03-31 23:59:39,2022-04-01 00:06:09,,2.04,,,48,238,0,9.75,0.0,0.5,2.00,0.0,0.3,15.05,,
9071241,2,2022-03-31 23:50:00,2022-04-01 00:08:00,,0.59,,,48,4,0,17.33,0.0,0.5,2.23,0.0,0.3,22.86,,
9071242,2,2022-03-31 23:25:56,2022-03-31 23:32:37,,1.26,,,158,68,0,8.80,0.0,0.5,2.57,0.0,0.3,14.67,,


**3. Drop the unwanted features**

In [2]:
# Keep only the features needed later, selected by column name.
# improvement_surcharge, mta_tax and congestion_surcharge are not considered,
# since the taxi driver may not earn from them.
# Selecting by name (rather than by position) keeps this cell correct if the
# column order changes, and lets it be re-run on the already-reduced frame.
WANTED_COLUMNS = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'trip_distance',
    'PULocationID',
    'payment_type',
    'fare_amount',
    'extra',
    'tip_amount',
    'airport_fee',
]
df = df[WANTED_COLUMNS]
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,payment_type,fare_amount,extra,tip_amount,airport_fee
0,2022-01-01 00:35:40,2022-01-01 00:53:29,3.80,142,1,14.50,3.0,3.65,0.0
1,2022-01-01 00:33:43,2022-01-01 00:42:07,2.10,236,1,8.00,0.5,4.00,0.0
2,2022-01-01 00:53:21,2022-01-01 01:02:19,0.97,166,1,7.50,0.5,1.76,0.0
3,2022-01-01 00:25:21,2022-01-01 00:35:23,1.09,114,2,8.00,0.5,0.00,0.0
4,2022-01-01 00:36:48,2022-01-01 01:14:20,4.30,68,1,23.50,0.5,3.00,0.0
...,...,...,...,...,...,...,...,...,...
9071239,2022-03-31 23:45:00,2022-04-01 00:01:00,3.57,48,0,18.33,0.0,1.00,
9071240,2022-03-31 23:59:39,2022-04-01 00:06:09,2.04,48,0,9.75,0.0,2.00,
9071241,2022-03-31 23:50:00,2022-04-01 00:08:00,0.59,48,0,17.33,0.0,2.23,
9071242,2022-03-31 23:25:56,2022-03-31 23:32:37,1.26,158,0,8.80,0.0,2.57,


**4. Save it**

It would be easy to reach the space limit, so I save the dataframe here and continue working with it in another notebook (`.ipynb`) file.

In [3]:
# Persist the reduced trip data to the curated folder as a parquet file
curated_trips_path = "../data/curated/initialised_df.parquet"
df.to_parquet(curated_trips_path)

**5. Download the taxi zone data from this link**

https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip

I downloaded the data by hand and unzipped it into the `raw` folder (`../data/raw/`).


In [4]:
# like the tutorial, `sf` stands for shape file
import geopandas as gpd

# Zone lookup table (CSV), read from the same CloudFront host that serves the
# trip data and the taxi_zones.zip archive used elsewhere in this notebook.
# NOTE(review): the former s3.amazonaws.com/nyc-tlc location is believed to be
# no longer publicly readable -- confirm that this URL resolves.
zone = pd.read_csv("https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv")

# Zone shapes, read from the manually downloaded and unzipped shapefile
sf = gpd.read_file("../data/raw/taxi_zones.shp")
sf.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."


**6. Initialise and merge the data**

In [5]:
# Convert the zone shapes to longitude/latitude (WGS84), like the tutorial does.
# Calling to_crs on the whole GeoDataFrame (instead of overwriting only the
# geometry column) returns a new frame whose geometry and CRS are updated together.
sf = sf.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."


In [6]:
# Join the zone lookup table to the zone shapes on their shared LocationID;
# the inner join keeps only LocationIDs present in both tables
zone_with_shapes = zone.merge(sf, on='LocationID', how='inner')

# Wrap the merged table in a GeoDataFrame
gdf = gpd.GeoDataFrame(zone_with_shapes)

gdf.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone,OBJECTID,Shape_Leng,Shape_Area,zone,borough,geometry
0,1,EWR,Newark Airport,EWR,1,0.116357,0.000782,Newark Airport,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,Queens,Jamaica Bay,Boro Zone,2,0.43347,0.004866,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,3,0.084341,0.000314,Allerton/Pelham Gardens,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,Manhattan,Alphabet City,Yellow Zone,4,0.043567,0.000112,Alphabet City,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,Staten Island,Arden Heights,Boro Zone,5,0.092146,0.000498,Arden Heights,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."


**7. Save it**

In [7]:
# Write the merged zone data to the curated folder as a shapefile
curated_zones_path = "../data/curated/geo.shp"
gdf.to_file(curated_zones_path)

  pd.Int64Index,
  gdf.to_file("../data/curated/geo.shp")
