# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an outline to help you with your own approach.**_

## Group 10 
### Yixuan (Sharon) Qian - yq2348
### Michelle Jingyi Zhou - jz3508

## Project Setup

In [1]:
# all import statements needed for the project

import math
import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
import numpy as np
import re
import os.path
import glob
import geopandas as gpd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# any constants we might need

# Page that is requested and scanned for links to the yellow-taxi trip files.
TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Directory and full path of the taxi-zone shapefile.
TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
# File name of the Uber rides sample CSV.
UBER_CSV = "uber_rides_sample.csv"
# One weather CSV file name per year, 2009 through 2015.
WEATHER_CSV_FILES = ["2009_weather.csv", "2010_weather.csv", "2011_weather.csv", "2012_weather.csv",
                    "2013_weather.csv", "2014_weather.csv", "2015_weather.csv"]

# Scale factor applied to the central angle in the distance calculation;
# presumably kilometres (the value matches the WGS-84 equatorial radius) - TODO confirm.
EARTH_RADIUS = 6378.137
# Passed to GeoDataFrame.to_crs(); presumably the EPSG code of WGS 84 lat/lon - TODO confirm.
CRS = 4326  # coordinate reference system

# Bounding boxes, each a pair of corners; the first corner holds the smaller
# latitude and longitude values, the second the larger ones.
# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# SQLAlchemy URL of the project database, the file the schema strings are
# written to, and the directory created at start-up for query files.
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [3]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True makes an already-existing directory a no-op, so there is no
# need to catch a broad Exception and inspect a platform-specific errno
# (which raised AttributeError for exceptions that carry no ``errno``).
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

Overview: For Part 1, we downloaded the Parquet files, cleaned and filtered them for the relevant data, filled in missing data, and generated samples of these datasets.

## I changed this whole part to Markdown for now; we might change the order and load the taxi zones later - Michelle

### Load Taxi Zones

def load_taxi_zones(shapefile):
    """Read the taxi zones from ``shapefile`` with geopandas and return them."""
    return gpd.read_file(shapefile)

def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    """Return the centroid ``(y, x)`` of the zone whose LocationID matches.

    Args:
        zone_loc_id: Value compared against the ``'LocationID'`` column.
        loaded_taxi_zones: Table of taxi zones exposing a ``'LocationID'``
            column and a ``geometry`` attribute (e.g. a GeoDataFrame).

    Returns:
        A ``(lat, lon)`` tuple taken from the centroid's ``y`` and ``x``.
        NOTE(review): the values are in whatever CRS ``loaded_taxi_zones``
        uses; they are only true latitude/longitude if the caller has
        already reprojected the zones - confirm against callers.

    Raises:
        ValueError: If no zone, or more than one zone, has that LocationID.
    """
    id_matches = loaded_taxi_zones['LocationID'] == zone_loc_id
    zone = loaded_taxi_zones[id_matches]
    match_count = len(zone)
    if match_count == 0:
        raise ValueError(f"Taxi zone with LocationID {zone_loc_id} not found.")
    if match_count > 1:
        raise ValueError(f"Multiple taxi zones found with LocationID {zone_loc_id}.")
    centroid = zone.geometry.centroid
    return centroid.y.values[0], centroid.x.values[0]

### Calculate distance

1. The `rad(d)` function converts numeric degrees to radians.

2. The distance calculation function
`calculate_distance_with_coords(from_coord, to_coord)` calculates the distance between two coordinates.

In [18]:
def rad(d):
    """Convert an angle ``d`` given in degrees to radians.

    Only arithmetic operators are used, so any operand supporting ``*`` and
    ``/`` with floats is accepted.
    """
    scaled = d * math.pi
    return scaled / 180.0

In [2]:
def calculate_distance_with_coords(from_coord, to_coord):
    """Return the great-circle (haversine) distance between two points.

    Args:
        from_coord: Mapping-like object (dict, DataFrame row, ...) providing
            ``'pickup_latitude'`` and ``'pickup_longitude'`` in degrees.
        to_coord: Mapping-like object providing ``'dropoff_latitude'`` and
            ``'dropoff_longitude'`` in degrees.

    Returns:
        The distance, in the same unit as ``EARTH_RADIUS``.
    """
    rad_lat1 = rad(from_coord['pickup_latitude'])
    rad_lon1 = rad(from_coord['pickup_longitude'])
    # Latitude must be paired with latitude and longitude with longitude;
    # reading them crosswise yields a meaningless distance.
    rad_lat2 = rad(to_coord['dropoff_latitude'])
    rad_lon2 = rad(to_coord['dropoff_longitude'])

    lat_delta = rad_lat1 - rad_lat2
    lon_delta = rad_lon1 - rad_lon2
    haversine = (
        math.sin(lat_delta / 2) ** 2
        + math.cos(rad_lat1) * math.cos(rad_lat2) * math.sin(lon_delta / 2) ** 2
    )
    # Rounding can push the term a hair above 1 for near-antipodal points,
    # which would make asin() raise; NaN input is left untouched.
    if haversine > 1.0:
        haversine = 1.0
    central_angle = 2 * math.asin(math.sqrt(haversine))
    return central_angle * EARTH_RADIUS

#### I commented out this cell - Michelle
def calculate_distance_with_zones(from_zone, to_zone):
    """Placeholder for a zone-to-zone distance calculation; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [2]:
#The calculation function is used to add columns to the relevant pandas dataframes (taxi data, uber data). 

def add_distance_column(dataframe):
    """Add a ``cal_distance`` column with each trip's pickup-to-drop-off distance.

    The dataframe is modified in place and nothing is returned.

    Args:
        dataframe: DataFrame containing the columns ``pickup_latitude``,
            ``pickup_longitude``, ``dropoff_latitude`` and
            ``dropoff_longitude``.
    """
    coord_columns = ['pickup_latitude', 'pickup_longitude',
                     'dropoff_latitude', 'dropoff_longitude']
    # calculate_distance_with_coords works on one trip at a time, so hand it
    # one record per row; each record carries both the pickup and the
    # drop-off keys and can therefore serve as both arguments.
    trips = dataframe[coord_columns].to_dict('records')
    dataframe['cal_distance'] = [
        calculate_distance_with_coords(trip, trip) for trip in trips
    ]

### Process Taxi Data

1. This function programmatically collects the links to the Yellow Taxi Parquet files for the date range 2009-01 to 2015-06 from the website. It returns a list containing every matching taxi data URL found on TAXI_URL ("https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page").

2.


3. We added latitude and longitude from taxi_zones. We also added pickup_latitude, pickup_longitude, dropoff_latitude and dropoff_longitude as columns to the dataframe for convenient calculation.

4. Download the Parquet files, take a sample from each file, clean the dataframe according to the existing location IDs, write the data to a .csv file, and return the dataframe.

5. `get_and_clean_taxi_data`
gets the taxi data. If taxi.csv exists, it is only read. Otherwise, the data is downloaded and the taxi.csv file is generated.

In [2]:
def get_all_urls_from_taxi_page(taxi_url=None):
    """Collect the yellow-taxi trip-file links for 2009-01 through 2015-06.

    Args:
        taxi_url: Page to scan for links. Defaults to ``TAXI_URL`` when
            omitted, so existing zero-argument calls keep working.

    Returns:
        A list of href strings, in page order, whose file name contains
        ``yellow_tripdata_YYYY-MM`` with the month inside the date range.
        The list is empty when the page does not answer with HTTP 200.
    """
    page_url = TAXI_URL if taxi_url is None else taxi_url
    response = requests.get(url=page_url)
    if response.status_code != 200:
        return []

    soup = bs4.BeautifulSoup(response.content, 'lxml')
    # Capture year and month directly instead of slicing the file name, so
    # links that lack a well-formed date are skipped rather than crashing.
    file_pattern = re.compile(r"yellow_tripdata_(\d{4})-(\d{2})")

    parquet_url_list = []
    for anchor in soup.find_all('a', href=file_pattern):
        # hrefs scraped from HTML can carry stray surrounding whitespace.
        url = anchor.get('href').strip()
        found = file_pattern.search(url)
        year_month = (int(found.group(1)), int(found.group(2)))
        if (2009, 1) <= year_month <= (2015, 6):
            parquet_url_list.append(url)
    return parquet_url_list


In [None]:
# Three alternative namings of the same six fields (pickup time, pickup and
# drop-off coordinates, total fare), listed in the same order.  The month
# loader tries them in turn and finally renames the result to ``columns2``.
columns = [
    "tpep_pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "total_amount",
]
columns2 = [
    "pickup_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "total_amount",
]
columns3 = [
    "Trip_Pickup_DateTime",
    "Start_Lon",
    "Start_Lat",
    "End_Lon",
    "End_Lat",
    "Total_Amt",
]

In [33]:
def add_lat_log_column(dataframe, shapefile='taxi_zones.shp'):
    """Add pickup/drop-off latitude and longitude columns from taxi-zone IDs.

    Each ``PULocationID`` / ``DOLocationID`` is replaced by the centroid of
    the matching taxi zone (after reprojecting the zones to ``CRS``).  IDs
    with no matching zone get ``0`` for both coordinates.  The dataframe is
    modified in place and nothing is returned.

    Args:
        dataframe: DataFrame with ``PULocationID`` and ``DOLocationID``
            columns.
        shapefile: Path of the taxi-zone shapefile to read.

    Side effects:
        Prints the function name and writes the dataframe to ``2.csv``.
        NOTE(review): the CSV dump looks like debugging output - confirm
        whether it is still needed.
    """
    print('add_lat_log_column')
    zones = gpd.read_file(shapefile).to_crs(CRS)
    centroids = zones.geometry.centroid

    # Build one lookup table per coordinate instead of filtering the zones
    # table once per trip.  When a LocationID occurs more than once in the
    # shapefile, the first occurrence wins.
    zone_ids = zones["LocationID"].to_numpy()
    first_per_zone = ~pd.Index(zone_ids).duplicated(keep="first")
    unique_ids = zone_ids[first_per_zone]
    # A centroid's y is the latitude and its x is the longitude.
    lat_by_zone = pd.Series(centroids.y.to_numpy()[first_per_zone], index=unique_ids)
    lon_by_zone = pd.Series(centroids.x.to_numpy()[first_per_zone], index=unique_ids)

    for prefix, id_column in (("pickup", "PULocationID"), ("dropoff", "DOLocationID")):
        ids = dataframe[id_column]
        # to_numpy() keeps the assignment positional, like the list-based
        # original, regardless of the dataframe's index.
        dataframe[f"{prefix}_latitude"] = ids.map(lat_by_zone).fillna(0).to_numpy()
        dataframe[f"{prefix}_longitude"] = ids.map(lon_by_zone).fillna(0).to_numpy()

    dataframe.to_csv("2.csv")

In [34]:
def get_and_clean_month(url):
    """Download one monthly trip file and return a cleaned sample of it.

    The file is saved in the working directory under its own name, read with
    pandas, sampled, and reduced to the six fields named by ``columns2``.

    Args:
        url: Address of the Parquet file to download.

    Returns:
        A DataFrame with at most ``sample_size`` rows whose columns are
        named as in ``columns2``.

    Raises:
        requests.HTTPError: If the download does not succeed.
        KeyError: If the file matches none of the known column layouts.

    NOTE(review): ``sample_size`` is read from module scope and is not
    defined in the part of the file visible here - confirm it exists.
    """
    response = requests.get(url)
    # Fail here with a clear error rather than writing an error page to disk
    # and failing later inside the Parquet reader.
    response.raise_for_status()

    filename = url.split('/')[-1]
    with open(filename, "wb") as f:
        f.write(response.content)

    df = pd.read_parquet(filename)
    print(filename)
    print(df.columns)

    # Sample once, up front and with a fixed seed: it keeps the coordinate
    # lookup cheap and makes the result reproducible.  min() avoids the
    # ValueError pandas raises when asked for more rows than exist.
    df = df.sample(n=min(sample_size, len(df)), random_state=1, ignore_index=True)

    # Files that identify locations by zone ID need coordinates derived first.
    if {"PULocationID", "DOLocationID"}.issubset(df.columns):
        add_lat_log_column(df)

    # Use the first known layout whose columns are all present.
    for layout in (columns, columns2, columns3):
        if set(layout).issubset(df.columns):
            df = df[layout]
            break
    else:
        raise KeyError(
            f"{filename}: none of the expected column layouts found in {list(df.columns)}"
        )

    df.columns = list(columns2)
    return df

In [3]:
def get_and_clean_taxi_data(parquet_urls, csv_path="taxi.csv"):
    """Download, clean and combine the monthly taxi files.

    Args:
        parquet_urls: Iterable of monthly Parquet file URLs to process.
        csv_path: Where to save the combined data as CSV; pass ``None`` to
            skip saving.  NOTE(review): the original referenced a
            ``TAXI_CSV`` constant that is not visible in this part of the
            file; ``"taxi.csv"`` follows the notebook's prose - confirm.

    Returns:
        One DataFrame holding the rows of every month, or an empty
        DataFrame when ``parquet_urls`` is empty.
    """
    all_taxi_dataframes = [get_and_clean_month(url) for url in parquet_urls]

    # pd.concat raises on an empty list, so handle "nothing to do" explicitly.
    if not all_taxi_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed;
    # every month is indexed from 0, so renumber to keep the index unique.
    taxi_data = pd.concat(all_taxi_dataframes, ignore_index=True)
    if csv_path is not None:
        taxi_data.to_csv(csv_path, index=False)
    return taxi_data

In [2]:
def get_taxi_data():
    """Scrape the trip-file URLs, then download, clean and combine them.

    Returns:
        Whatever ``get_and_clean_taxi_data`` returns for the URLs found by
        ``get_all_urls_from_taxi_page``.
    """
    # get_all_urls_from_taxi_page already returns only the yellow-taxi
    # Parquet links in the wanted date range, so no second filtering step
    # is needed.
    all_parquet_urls = get_all_urls_from_taxi_page()
    return get_and_clean_taxi_data(all_parquet_urls)

In [None]:
# Build the combined taxi dataframe used by the later cells.
taxi_data = get_taxi_data()

In [None]:
# Preview the first rows of the taxi data.
taxi_data.head()

### Processing Uber Data

In [None]:
def load_and_clean_uber_data(csv_file):
    """Placeholder for loading and cleaning the Uber CSV; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [None]:
def get_uber_data():
    """Load the Uber rides, add the ``cal_distance`` column and return them.

    Returns:
        The dataframe produced by ``load_and_clean_uber_data`` for
        ``UBER_CSV``, after ``add_distance_column`` has been applied to it.
    """
    # The constant holding the Uber file name is UBER_CSV.
    uber_dataframe = load_and_clean_uber_data(UBER_CSV)
    add_distance_column(uber_dataframe)
    return uber_dataframe

In [None]:
# Build the Uber dataframe used by the later cells.
uber_data = get_uber_data()

In [None]:
# Preview the first rows of the Uber data.
uber_data.head()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """Placeholder for listing the weather CSV files; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """Placeholder for cleaning hourly weather data; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_daily(csv_file):
    """Placeholder for cleaning daily weather data; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [None]:
def load_and_clean_weather_data():
    """Clean every weather CSV and combine the hourly and daily results.

    Returns:
        A ``(hourly_data, daily_data)`` tuple, each the concatenation of the
        per-file dataframes.

    NOTE(review): ``WEATHER_CSV_DIR`` is not defined in the part of the file
    visible here (only ``WEATHER_CSV_FILES`` is) - confirm where it comes
    from.
    """
    # Clean each file for both granularities, hourly first, then daily.
    cleaned_pairs = [
        (clean_month_weather_data_hourly(path), clean_month_weather_data_daily(path))
        for path in get_all_weather_csvs(WEATHER_CSV_DIR)
    ]
    hourly_frames = [hourly for hourly, _ in cleaned_pairs]
    daily_frames = [daily for _, daily in cleaned_pairs]

    # create two dataframes with hourly & daily data from every month
    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# Build the hourly and daily weather dataframes used by the later cells.
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# Preview the first rows of the hourly weather data.
hourly_weather_data.head()

In [None]:
# Preview the first rows of the daily weather data.
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for the database addressed by DATABASE_URL.
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# NOTE: all four schema strings are still TODO placeholders, not SQL.
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file: the four schema strings are written
# back to back, in this order, overwriting any existing file
schema_sections = (
    HOURLY_WEATHER_SCHEMA,
    DAILY_WEATHER_SCHEMA,
    TAXI_TRIPS_SCHEMA,
    UBER_TRIPS_SCHEMA,
)
with open(DATABASE_SCHEMA_FILE, "w") as f:
    for schema_section in schema_sections:
        f.write(schema_section)

In [None]:
# create the tables with the schema files
# NOTE: placeholder - the connection is opened and closed again without
# executing any statement, so no tables are created yet.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """Placeholder for writing each dataframe to its table; not implemented.

    Args:
        table_to_df_dict: Mapping of table name to dataframe.

    Raises:
        NotImplementedError: Always.
    """
    # NotImplemented is a comparison sentinel and is not callable (calling it
    # raises TypeError); NotImplementedError is the exception for stubs.
    raise NotImplementedError()

In [None]:
# Table name -> dataframe to store in it.  The weather dataframes are bound
# at module level as hourly_weather_data / daily_weather_data; hourly_data
# and daily_data exist only inside load_and_clean_weather_data.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Store every dataframe in its table.
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Placeholder for saving a query to a file; not implemented.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

### Query 1

In [None]:
# File name the query is written to; still empty.
QUERY_1_FILENAME = ""

# Text of the first query; still a TODO placeholder, not SQL.
QUERY_1 = """
TODO
"""

In [None]:
# Engine.execute() was removed in SQLAlchemy 2.0.  Running the statement on
# an explicit connection and wrapping the SQL string in text() works on both
# the 1.4 and 2.x series.
with engine.connect() as connection:
    query_1_result = connection.execute(db.text(QUERY_1)).fetchall()
# Last expression of the cell, so the notebook still displays the rows.
query_1_result

In [None]:
# Save the text of query 1 under QUERY_1_FILENAME.
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Scaffold for the first visualization.

    Creates a 20x10 figure, plots placeholder values, sets a placeholder
    title and shows the figure.  ``dataframe`` is not used yet.
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for querying the data behind visualization 1.

    The intent is to query the SQL database for the data needed, optionally
    putting the result into a pandas dataframe; not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for visualization 1 and plot it.
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)