In [1]:
# all import statements needed for the project, for example:
import math
import pandas as pd
import requests 
import sqlalchemy as db
from pandas import read_parquet
from pyarrow.parquet import ParquetDataset
from bs4 import BeautifulSoup
import re
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
from datetime import datetime
import datetime
import glob
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
import matplotlib.pyplot as plt
import sqlite3
from datetime import datetime
from scipy import stats
import numpy as np
from keplergl import KeplerGl
from pandas import DataFrame

In [2]:
# any constants you might need, for example:

TAXI_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# add other constants to refer to any local data, e.g. uber & weather
UBER_CSV = "uber_rides_sample.csv"

NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
redius = 6378.137
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"
TAXI_DIR = 'taxi_files'
WEATHER_DIR = "weather_files"

zones_df =gpd.read_file(r"taxi_zones.shp")
zones_df.to_csv("taxi_zones.csv")
zones_df = zones_df.to_crs(4326)
zones_df["lat"] = zones_df.centroid.x
zones_df["lon"] = zones_df.centroid.y
zones_data = []

zones_pickupID = zones_df.drop(
                ["OBJECTID","Shape_Leng",
                "Shape_Area","zone",
                "borough","geometry"],
                axis="columns").rename(
                columns={
                "LocationID":"PULocationID",
                "lon":"pickup_latitude",
                "lat":"pickup_longitude"
                })
zones_dropoffID = zones_df.drop(
                ["OBJECTID","Shape_Leng",
                "Shape_Area","zone",
                "borough","geometry"],
                axis="columns").rename(
                columns={
                "LocationID":"DOLocationID",
                "lon":"dropoff_latitude",
                "lat":"dropoff_longitude"
                })


  zones_df["lat"] = zones_df.centroid.x

  zones_df["lon"] = zones_df.centroid.y


Taxi data

In [3]:

# Calculate the distance through coordinates
def calculate_distance(from_coord, to_coord):
    lat1, lon1 = from_coord
    lat2, lon2 = to_coord

    diff_lat = math.radians(lat2 - lat1)
    diff_lon = math.radians(lon2 - lon1)
    
    cal_dis = (math.sin(diff_lat / 2) * math.sin(diff_lat / 2) +
             math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
             math.sin(diff_lon / 2) * math.sin(diff_lon / 2))
    cal_dis = 2 * math.atan2(math.sqrt(cal_dis), math.sqrt(1 - cal_dis))
    distance = redius * cal_dis

    return distance
    

# Add the coordinate columns to the dataframe
def add_distance_column(dataframe):
    dataframe["trip_distance"] = dataframe.apply(
                                lambda row: calculate_distance((row.pickup_longitude, row.pickup_latitude), 
                                                               (row.dropoff_longitude, row.dropoff_latitude)), 
                                                                axis=1)
    return dataframe

In [4]:
### Part 1: Data Preprocessing

In [5]:
# Get the download links of yellow taxi from 2009-01 to 2015-06
def find_taxi_csv_urls():
    response = requests.get(TAXI_URL)
    soup = BeautifulSoup(response.text, "html.parser")
    url_tags = soup.find_all("a", title="Yellow Taxi Trip Records")
    all_csv_urls = []
    pattern = "2009|2010|2011|2012|2013|2014|2015-0[1-6]"

    for url_tag in url_tags:
        if re.findall(pattern, url_tag["href"]):
            all_csv_urls.append(url_tag["href"])

    return all_csv_urls

In [6]:
def load_data_from_filename(filename):
    data = pd.read_parquet(filename)
    return data

In [7]:
# clean monthly yellow taxi data
def get_and_clean_month_taxi_data(url:str):
    file_name = url.split('/')[-1]
    if not os.path.exists(f'{TAXI_DIR}/{file_name}'):
        with open(f'{TAXI_DIR}/{file_name}', 'wb') as f:
            res = requests.get(url)
            f.write(res.content)

    dataframe = load_data_from_filename(f'{TAXI_DIR}/{file_name}')
    # Delete the useless columns
    # Attention: year 2009 and 2010 have unexpected columns' names.
    if "2010-" in file_name:
        dataframe = dataframe[
                            (dataframe["passenger_count"]> 0)
                            & (dataframe["total_amount"] > 0)
                                ]
        dataframe.drop([
                        "vendor_id", "dropoff_datetime", "rate_code", "store_and_fwd_flag",
                        "payment_type", "fare_amount", "surcharge",
                        "mta_tax","tolls_amount","passenger_count", 
                        "total_amount"
                        ], axis=1, inplace=True)
        
    elif "2009-" in file_name:
        dataframe = dataframe[
                            (dataframe["Passenger_Count"]> 0)
                            & (dataframe["Total_Amt"] > 0)
                                ]
        
        dataframe.drop([
                        "vendor_name", "Trip_Dropoff_DateTime", "Rate_Code",
                        "store_and_forward", "Payment_Type", "Fare_Amt", 
                        "surcharge", "mta_tax","Tolls_Amt",
                        "Passenger_Count",  'Total_Amt'
                        ], axis=1, inplace=True)
        
    else:
        dataframe = dataframe[
                            (dataframe["passenger_count"]> 0)
                            & (dataframe["total_amount"] > 0)
                                ]
        
        
        dataframe.drop([
                        "RatecodeID", "tolls_amount", "payment_type", 
                        "store_and_fwd_flag","mta_tax", "improvement_surcharge", 
                        "fare_amount", "extra","congestion_surcharge",
                        "airport_fee","VendorID", "tpep_dropoff_datetime", 
                        "passenger_count", "total_amount"
                        ], axis=1, inplace=True)
    
    # Unify the columns' names of year 2009 and 2012
    if "2010-" in file_name or "2009-" in file_name:
         dataframe.columns = [
                            "pickup_datetime", "trip_distance", "pickup_longitude", 
                             "pickup_latitude", "dropoff_longitude", "dropoff_latitude", 
                             "tip_amount"
                            ]
            
    else:
        # Change the IDs to the pick-up and drop-off coordinates, adding columns
        df = pd.merge(dataframe, zones_pickupID, on = "PULocationID")
        dataframe = pd.merge(df, zones_dropoffID, on = "DOLocationID")
        dataframe.drop(["PULocationID", "DOLocationID"], axis=1, inplace=True)
        dataframe.columns = [
            'pickup_datetime', 'trip_distance', 'tip_amount',
            'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'
                            ]
    

    # Remove the invalide datas
    dataframe = dataframe[
        (dataframe["pickup_longitude"] >= -74.242330)
         & (dataframe["pickup_latitude"] <= 40.908524)
         & (dataframe["pickup_longitude"] <= -73.717047)
         & (dataframe["pickup_latitude"] >= 40.560445)
         & (dataframe["dropoff_longitude"] >= -74.24233)
         & (dataframe["dropoff_latitude"] <= 40.908524)
         & (dataframe["dropoff_longitude"] <= -73.717047)
         & (dataframe["dropoff_latitude"] >= 40.560445)
        ]
    
    # Randomly choose a sample from every month.
    # Accumulated sample size is approximately to the uber data amount
    dataframe = dataframe.sample(n=2550)
    
    return dataframe

UBER DATA

In [8]:
# clean uber data
def load_and_clean_uber_data(csv_file):
    df = pd.read_csv(csv_file)
    df = df.drop(columns = ["key", "Unnamed: 0"])
    df = df[
            (df["pickup_longitude"] >= -74.242330)
            & (df["pickup_latitude"] <= 40.908524)
            & (df["pickup_longitude"] <= -73.717047)
            & (df["pickup_latitude"] >= 40.560445)
            & (df["dropoff_longitude"] >= -74.24233)
            & (df["dropoff_latitude"] <= 40.908524)
            & (df["dropoff_longitude"] <= -73.717047)
            & (df["dropoff_latitude"] >= 40.560445)
            & (df["passenger_count"] > 0)
            & (df["fare_amount"] > 0 )
            ]

    df.drop(["passenger_count","fare_amount"], axis=1, inplace=True)
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"]).apply(lambda t: t.replace(tzinfo=None))
    
    return df    

In [9]:
def get_uber_data():
    uber_dataframe = load_and_clean_uber_data(UBER_CSV)
    add_distance_column(uber_dataframe)
    uber_dataframe= uber_dataframe[uber_dataframe["trip_distance"] > 0]
    return uber_dataframe

WEATHER Data

In [29]:
#clean weather data, and deal with the datetime

def clean_month_weather_data_hourly(csv_file):
    import datetime
    df = pd.read_csv(csv_file)
    df2 = df.loc[:, ["DATE", "HourlyWindSpeed",'HourlyPrecipitation']]
    df2["DATE HOUR"] = df2["DATE"].apply(lambda x:x[:-6])
    df2.drop(columns = ["DATE"], axis=1, inplace=True)
    df2["HourlyPrecipitation"].replace("T", 0.001)
    
    df2["HourlyWindSpeed"] = df2["HourlyWindSpeed"].apply(pd.to_numeric, errors = "coerce")
    df2["HourlyPrecipitation"] = df2["HourlyPrecipitation"].apply(pd.to_numeric, errors = "coerce")
    
    df2["HourlyWindSpeed"].fillna(0, inplace=True)
    df2["HourlyPrecipitation"].fillna(0, inplace=True)
    
    df3= df2.groupby(["DATE HOUR"], as_index=False)["HourlyWindSpeed"].mean()
    
    df4= df2.groupby(["DATE HOUR"], as_index=False)["HourlyPrecipitation"].sum()
    
    df4= pd.merge(df3, df4, on="DATE HOUR", how="inner")
    
    df4["DATE HOUR"] = list(map(lambda x: datetime.datetime.strptime(x,"%Y-%m-%dT%H").strftime("%Y-%m-%d %H"), df4["DATE HOUR"]))
    
    return df4

In [30]:
#clean weather data, and deal with the datetime
def clean_month_weather_data_daily(csv_file):
    df = pd.read_csv(csv_file)
    df2 = df.loc[:, ["DATE", "HourlyWindSpeed",'HourlyPrecipitation']]
    df2["DATE DAYLY"] = df2["DATE"].apply(lambda x:x[:-9])
    df2.drop(columns=["DATE"],axis= 1,inplace=True)
    df2["HourlyPrecipitation"].replace('T', 0.001)
    
    df2["HourlyWindSpeed"] = df2["HourlyWindSpeed"].apply(pd.to_numeric, errors = "coerce")
    df2["HourlyPrecipitation"] = df2["HourlyPrecipitation"].apply(pd.to_numeric, errors = "coerce")
    
    
    df2["HourlyWindSpeed"].fillna(0, inplace=True)
    df2["HourlyPrecipitation"].fillna(0, inplace=True)
        
    df3= df2.groupby(["DATE DAYLY"], as_index=False)["HourlyWindSpeed"].mean()
    
    df4= df2.groupby(["DATE DAYLY"], as_index=False)["HourlyPrecipitation"].sum()
    
    df4= pd.merge(df3, df4, on="DATE DAYLY", how='inner')
    df4["DatelyWindSpeed"] = df4["HourlyWindSpeed"]
    df4["DatelyPrecipitation"] = df4["HourlyPrecipitation"]
    
    
    df4.drop(columns=["HourlyPrecipitation","HourlyWindSpeed"],axis= 1,inplace=True)
    
    return df4

In [31]:
def load_and_clean_weather_data():
    hourly_dataframes = []
    daily_dataframes = []

    weather_csv_files = [
                       "2009_weather.csv","2010_weather.csv","2011_weather.csv",
                       "2012_weather.csv","2013_weather.csv","2014_weather.csv",
                       "2015_weather.csv"
                        ]
    
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    daily_data.drop(daily_data.loc[daily_data['DATE DAYLY'] > "2015-06-31"].index, inplace=True)
    hourly_data.drop(hourly_data.loc[hourly_data['DATE HOUR'] > "2015-06-31"].index, inplace=True)
    
    hourly_data.columns = ["datetime", "wind_speed", "precipitation"]
    daily_data.columns = ["datetime", "wind_speed", "precipitation"]
    return hourly_data, daily_data

## Part 2 : Storing Data

In [13]:
engine = db.create_engine(DATABASE_URL, echo=True)

In [14]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA =  """
        CREATE TABLE IF NOT EXISTS hourly_weather_data
        (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            datetime DATETIME,
            precipitation FLOAT,
            wind_speed FLOAT
        )
        """

DAILY_WEATHER_SCHEMA = """
        CREATE TABLE IF NOT EXISTS daily_weather_data
        (
           id INTEGER PRIMARY KEY AUTOINCREMENT,
           datetime DATETIME,
           precipitation FLOAT,
           wind_speed FLOAT
        )
        """

TAXI_TRIPS_SCHEMA = """
        CREATE TABLE IF NOT EXISTS taxi_data
        (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pickup_datetime DATETIME,
            trip_distance FLOAT,
            tip_amount FLOAT,
            pickup_longitude FLOAT,
            pickup_latitude FLOAT,
            dropoff_longitude FLOAT,
            dropoff_latitude FLOAT 
        )
        """

UBER_TRIPS_SCHEMA = """
        CREATE TABLE IF NOT EXISTS uber_data
        (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pickup_datetime DATETIME,
            trip_distance FLOAT,
            fare_amount FLOAT,
            pickup_longitude FLOAT,
            pickup_latitude FLOAT,
            dropoff_longitude FLOAT,
            dropoff_latitude FLOAT
        )
        """

In [15]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [16]:
# create the tables with the schema files
with engine.connect() as connection:
    taxi_trips = connection.execute(TAXI_TRIPS_SCHEMA)
    uber_trips = connection.execute(TAXI_TRIPS_SCHEMA)
    daily_weather = connection.execute(DAILY_WEATHER_SCHEMA)
    hourly_weather = connection.execute(HOURLY_WEATHER_SCHEMA)

2022-12-11 15:47:31,723 INFO sqlalchemy.engine.Engine 
        CREATE TABLE IF NOT EXISTS taxi_data
        (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pickup_datetime DATETIME,
            trip_distance FLOAT,
            tip_amount FLOAT,
            pickup_longitude FLOAT,
            pickup_latitude FLOAT,
            dropoff_longitude FLOAT,
            dropoff_latitude FLOAT 
        )
        
2022-12-11 15:47:31,724 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:47:31,727 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:47:31,728 INFO sqlalchemy.engine.Engine 
        CREATE TABLE IF NOT EXISTS taxi_data
        (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            pickup_datetime DATETIME,
            trip_distance FLOAT,
            tip_amount FLOAT,
            pickup_longitude FLOAT,
            pickup_latitude FLOAT,
            dropoff_longitude FLOAT,
            dropoff_latitude FLOAT 
        )
        
2022-12-11 15:47:31,728 

In [17]:
def get_and_clean_taxi_data():
    
    all_csv_urls = find_taxi_csv_urls()
    df_list = []
    for csv_url in all_csv_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_month_taxi_data(csv_url)
        add_distance_column(dataframe)
        dataframe = dataframe[dataframe["trip_distance"]> 0]
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        _df = dataframe
        _df.to_sql("taxi_trips", con=engine, index=False, if_exists="append")
        df_list.append(_df)
    
    
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(df_list)
    taxi_data.to_csv(f'finally_taxi_data.csv', index=False, header=taxi_data.columns, encoding="utf-8")

In [18]:
def add_uber_data_to_db():
    uber_data = get_uber_data()
    uber_data.to_csv(f'finally_uber_data.csv', index=False, header=uber_data.columns, encoding='utf-8')
    
    for i in range(40):
        uber_df = uber_data[i * 5000:(i + 1)*5000]
        uber_df.to_sql('uber_trips', con=engine, index=False, if_exists='append')

In [32]:
def add_weather_data_to_db():
    hourly_data, daily_data = load_and_clean_weather_data()

    daily_data.to_sql("daily_weather", con=engine, index=False, if_exists="append")
    daily_data.to_csv(f'finally_daily_weather_data.csv', index=False, header=daily_data.columns, encoding="utf-8")

    # Import the hourly weather data to the table in 13 batches and
    # every time has amount of around 5000
    hourly_data.to_csv(f'finally_hourly_weather_data.csv', index=False, header=hourly_data.columns, encoding="utf-8")
    for i in range(13):
        hourly_df = hourly_data[i * 5000:(i + 1)*5000]
        hourly_df.to_sql("hourly_weather", con=engine, index=False, if_exists='append')

### Add data to DataBase

In [20]:
# When we call these three functions, we could get add data to our database
# get_and_clean_taxi_data()
# add_weather_data_to_db()
# add_uber_data_to_db()

In [21]:
get_and_clean_taxi_data()

2022-12-11 15:47:53,075 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 15:47:53,077 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:47:53,079 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("taxi_trips")
2022-12-11 15:47:53,079 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:47:53,080 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:47:53,081 INFO sqlalchemy.engine.Engine 
CREATE TABLE taxi_trips (
	pickup_datetime DATETIME, 
	trip_distance FLOAT, 
	tip_amount FLOAT, 
	pickup_longitude FLOAT, 
	pickup_latitude FLOAT, 
	dropoff_longitude FLOAT, 
	dropoff_latitude FLOAT
)


2022-12-11 15:47:53,082 INFO sqlalchemy.engine.Engine [no key 0.00044s] ()
2022-12-11 15:47:53,084 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:47:53,087 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:47:53,110 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_l

2022-12-11 15:48:41,044 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:48:57,217 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 15:48:57,219 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:48:57,224 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:48:57,245 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:48:57,246 INFO sqlalchemy.engine.Engine [generated in 0.01817s] (('2015-05-29 15:10:26.000000', 5.4053130474177316, 4.26, -73.87362864289108, 40.77437570593249, -73.82871246597531, 40.70805099798807), ('2015-05-24 15:44:58.000000', 1.7993850197997374, 0.0, -73.98153220639182, 40.7736332930271, -73.96555356545913, 40.78247809974789), ('2015-05-13 21:31:44.000000', 3.375730463452032, 0.0, -73.96514579918421, 40.75672894163307, -73.93482888929137, 40.7542425205003), ('201

2022-12-11 15:50:07,440 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:50:07,463 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:50:07,464 INFO sqlalchemy.engine.Engine [generated in 0.01896s] (('2014-03-29 01:52:27.000000', 3.8478332399520605, 1.0, -73.99991742024713, 40.748427555065724, -73.9681683330924, 40.79796199310004), ('2014-03-22 18:09:20.000000', 1.4601279421660425, 1.5, -73.99089626184734, 40.71893835938025, -73.98419648907571, 40.75981761719184), ('2014-03-02 01:17:55.000000', 8.166680512552922, 7.05, -73.99089626184734, 40.71893835938025, -73.91771053131578, 40.7005221995192), ('2014-03-15 00:03:00.000000', 3.03931672722436, 0.0, -73.97304890061594, 40.791704934427074, -73.94575026755156, 40.79001075149545), ('2014-03-04 09:52:35.000000', 0.28456773175535766, 0.0, -74.01607915192923, 40.712038157

2022-12-11 15:51:14,891 INFO sqlalchemy.engine.Engine [generated in 0.01818s] (('2014-07-22 16:54:57.000000', 1.8185677374827074, 1.9, -73.97304890061594, 40.791704934427074, -73.95701169835736, 40.78043643718997), ('2014-07-18 19:23:00.000000', 1.0437759850457347, 3.62, -73.97649472376757, 40.7404389785964, -73.98419648907571, 40.75981761719184), ('2014-07-24 14:20:00.000000', 5.910955434317264, 0.0, -73.9887865991153, 40.75351275872571, -73.9373456081249, 40.8011694805865), ('2014-07-23 07:32:00.000000', 2.541622701099995, 1.7, -73.95473878380503, 40.765484087889845, -73.97756868222719, 40.7644214057868), ('2014-07-10 17:46:00.000000', 1.9543246554125229, 1.0, -73.97235594352028, 40.75668765218882, -73.98984464313301, 40.762252755319366), ('2014-07-05 11:35:00.000000', 3.665096871664437, 1.9, -73.94575026755156, 40.79001075149545, -73.97863194845081, 40.78396143031378), ('2014-07-04 14:38:00.000000', 1.0340179984091813, 2.25, -73.9859374568246, 40.727620195909545, -73.97849159965226,

2022-12-11 15:52:17,902 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:52:33,025 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 15:52:33,027 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:52:33,031 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:52:33,052 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:52:33,053 INFO sqlalchemy.engine.Engine [generated in 0.01749s] (('2014-12-19 23:11:38.000000', 4.282735984785595, 3.0, -73.97044256869238, 40.74991407790217, -74.00787970866403, 40.717772736265175), ('2014-12-29 18:19:03.000000', 0.706394547084737, 0.0, -73.98405213268919, 40.736824057618975, -73.98419648907571, 40.75981761719184), ('2014-12-10 22:48:23.000000', 1.450834263271719, 2.0, -73.97235594352028, 40.75668765218882, -73.9596347414917, 40.766948216383234), ('20

2022-12-11 15:53:43,061 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:53:43,083 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:53:43,083 INFO sqlalchemy.engine.Engine [generated in 0.01801s] (('2013-04-29 20:42:23.000000', 4.737858174431907, 0.0, -73.98515639467684, 40.74857462935672, -73.9485217302088, 40.82701265492538), ('2013-04-24 16:52:33.000000', 2.5043941691584997, 0.0, -73.94651035601463, 40.775932403149945, -73.9681683330924, 40.79796199310004), ('2013-04-21 21:28:00.000000', 5.483678536218837, 0.0, -73.9859374568246, 40.727620195909545, -73.93679314491887, 40.715369882143555), ('2013-04-14 04:05:00.000000', 0.6780268589135603, 1.0, -73.99738026020043, 40.728340391372306, -73.99991742024713, 40.748427555065724), ('2013-04-11 22:10:00.000000', 5.411191303829279, 5.0, -73.87362864289108, 40.774375705

2022-12-11 15:54:45,120 INFO sqlalchemy.engine.Engine [generated in 0.01755s] (('2013-08-17 03:48:49.000000', 3.3980928641239845, 2.0, -74.00748581800953, 40.7262904102812, -73.97696825691766, 40.723752141584804), ('2013-08-25 04:05:00.000000', 0.5763416386100594, 0.0, -73.9681683330924, 40.79796199310004, -73.97304890061594, 40.791704934427074), ('2013-08-06 14:19:21.000000', 1.3347421960023054, 0.0, -73.96563453538072, 40.76861518381155, -73.97756868222719, 40.7644214057868), ('2013-08-06 22:02:54.000000', 5.012357428690962, 0.0, -73.98984464313301, 40.762252755319366, -73.9485217302088, 40.82701265492538), ('2013-08-18 08:48:09.000000', 1.287777550241077, 1.4, -73.94575026755156, 40.79001075149545, -73.95701169835736, 40.78043643718997), ('2013-08-29 16:42:44.000000', 6.6305891334641425, 0.0, -73.94889145447952, 40.74537943399859, -74.0074959856243, 40.70680845166585), ('2013-08-17 01:07:00.000000', 0.29785038903528255, 0.9, -73.97235594352028, 40.75668765218882, -73.97044256869238,

2022-12-11 15:55:52,687 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:56:07,904 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 15:56:07,906 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:56:07,911 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:56:07,932 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:56:07,933 INFO sqlalchemy.engine.Engine [generated in 0.01780s] (('2012-01-26 10:32:31.000000', 1.5608115032014813, 0.0, -73.99045782354733, 40.74033744175702, -74.00401512528647, 40.753309065983395), ('2012-01-06 08:45:49.000000', 0.5468735721919958, 1.0, -73.95473878380503, 40.765484087889845, -73.9596347414917, 40.766948216383234), ('2012-01-09 17:26:40.000000', 1.0963624077945906, 0.0, -73.97235594352028, 40.75668765218882, -73.96555356545913, 40.78247809974789), (

2022-12-11 15:57:13,526 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:57:13,548 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:57:13,548 INFO sqlalchemy.engine.Engine [generated in 0.01788s] (('2012-05-03 17:29:00.000000', 1.0264381047479427, 0.0, -73.96563453538072, 40.76861518381155, -73.95701169835736, 40.78043643718997), ('2012-05-26 19:36:29.000000', 2.4091938978908094, 0.0, -73.95701169835736, 40.78043643718997, -73.97863194845081, 40.78396143031378), ('2012-05-24 18:23:00.000000', 1.369708433378178, 0.0, -73.96176359682921, 40.80945696112528, -73.97304890061594, 40.791704934427074), ('2012-05-28 23:02:38.000000', 1.8531281885998943, 0.0, -73.98419648907571, 40.75981761719184, -73.99691854183823, 40.72088889344527), ('2012-05-11 13:33:00.000000', 2.068743209769153, 1.38, -73.99045782354733, 40.74033744

2022-12-11 15:58:25,606 INFO sqlalchemy.engine.Engine [generated in 0.01812s] (('2012-09-01 01:26:00.000000', 4.271696515540299, 1.5, -73.99089626184734, 40.71893835938025, -73.95473878380503, 40.765484087889845), ('2012-09-24 07:39:00.000000', 2.0640023588135663, 2.7, -73.99697141558364, 40.74227862901228, -73.97849159965226, 40.747745793643915), ('2012-09-03 12:59:28.000000', 2.989236826323621, 0.0, -73.95100987481798, 40.77876585543437, -73.97756868222719, 40.7644214057868), ('2012-09-14 23:57:38.000000', 3.1340911731213112, 0.0, -73.94575026755156, 40.79001075149545, -73.97235594352028, 40.75668765218882), ('2012-09-13 22:27:00.000000', 0.7211936804693112, 1.0, -73.99045782354733, 40.74033744175702, -73.98405213268919, 40.736824057618975), ('2012-09-07 14:30:48.000000', 12.692398479635667, 10.0, -73.87362864289108, 40.77437570593249, -73.98764554944384, 40.77596522876096), ('2012-09-23 00:56:45.000000', 1.8909269509389472, 1.0, -73.99089626184734, 40.71893835938025, -74.00787970866

2022-12-11 15:59:34,094 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 15:59:51,483 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 15:59:51,485 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 15:59:51,490 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 15:59:51,512 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 15:59:51,513 INFO sqlalchemy.engine.Engine [generated in 0.01813s] (('2011-02-10 12:11:00.000000', 1.3293693865994132, 1.3, -73.99697141558364, 40.74227862901228, -73.98515639467684, 40.74857462935672), ('2011-02-15 07:23:00.000000', 1.1989044026624172, 0.0, -73.99991742024713, 40.748427555065724, -73.98984464313301, 40.762252755319366), ('2011-02-21 15:14:00.000000', 2.2429592815393073, 0.0, -73.9887865991153, 40.75351275872571, -74.00748581800953, 40.7262904102812), ('2

2022-12-11 16:01:04,856 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:01:04,878 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, tip_amount, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:01:04,879 INFO sqlalchemy.engine.Engine [generated in 0.01816s] (('2011-06-10 12:02:00.000000', 0.45266869129274023, 0.0, -73.97863194845081, 40.78396143031378, -73.98153220639182, 40.7736332930271), ('2011-06-03 12:07:00.000000', 1.425751381299376, 1.0, -73.99045782354733, 40.74033744175702, -73.98153220639182, 40.7736332930271), ('2011-06-28 13:45:00.000000', 0.8946538448273352, 0.0, -73.94199705818421, 40.76031363502697, -73.94889145447952, 40.74537943399859), ('2011-06-09 20:11:36.000000', 0.8326595389483107, 1.18, -73.99243753697608, 40.748497181405, -73.99991742024713, 40.748427555065724), ('2011-06-22 19:12:00.000000', 11.003791384994715, 5.5, -73.87362864289108, 40.7743757059

2022-12-11 16:02:13,530 INFO sqlalchemy.engine.Engine [generated in 0.01795s] (('2011-10-23 13:38:00.000000', 1.7764277785782026, 0.0, -73.98153220639182, 40.7736332930271, -73.96563453538072, 40.76861518381155), ('2011-10-29 02:34:00.000000', 3.401047495682205, 0.0, -74.00153756565634, 40.72388811004171, -73.97235594352028, 40.75668765218882), ('2011-10-25 16:06:56.000000', 1.71341175216928, 0.0, -73.95100987481798, 40.77876585543437, -73.96514579918421, 40.75672894163307), ('2011-10-07 06:55:13.000000', 1.5810593215532944, 1.0, -73.97304890061594, 40.791704934427074, -73.98419648907571, 40.75981761719184), ('2011-10-12 10:10:29.000000', 0.3280408821771978, 0.0, -73.97769793122403, 40.758028043526274, -73.97849159965226, 40.747745793643915), ('2011-10-21 11:36:04.000000', 1.8531281885998943, 0.0, -73.98419648907571, 40.75981761719184, -73.99691854183823, 40.72088889344527), ('2011-10-11 19:13:00.000000', 0.4346136508266335, 0.0, -73.9887865991153, 40.75351275872571, -73.99243753697608

2022-12-11 16:03:39,395 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:04:02,200 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 16:04:02,202 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:04:02,205 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:04:02,222 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, tip_amount) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:04:02,222 INFO sqlalchemy.engine.Engine [generated in 0.01254s] (('2010-03-09 04:45:53', 0.5771552307696297, -74.000557, 40.747738, -74.000862, 40.72896, 0.0), ('2010-03-23 22:32:00', 1.5810121558717285, -73.981155, 40.74167, -73.96751999999998, 40.756067, 2.0), ('2010-03-24 03:27:37', 0.02112554611290614, -73.950721, 40.771131, -73.950895, 40.770857, 0.0), ('2010-03-27 03:43:00', 0.3976391676770979, -73.992034, 40.726427, -73.98869999999998, 40.731076, 0.0), ('2010-03

2022-12-11 16:06:10,423 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:06:31,992 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 16:06:31,994 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:06:31,997 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:06:32,013 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, tip_amount) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:06:32,014 INFO sqlalchemy.engine.Engine [generated in 0.01217s] (('2010-08-06 01:09:00', 3.305065791182032, -73.98340799999998, 40.767872, -73.953868, 40.778662, 2.0), ('2010-08-30 01:13:00', 3.3960985131402577, -73.98451199999998, 40.764423, -73.955967, 40.80341, 2.0), ('2010-08-09 07:17:35', 5.463430951908214, -73.922078, 40.743802, -73.971109, 40.751636, 0.0), ('2010-08-21 15:35:37', 4.899186549296567, -73.991189, 40.770575, -73.95078599999998, 40.70738, 0.0), ('201

2022-12-11 16:08:24,012 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:08:51,777 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 16:08:51,780 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:08:51,783 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:08:51,800 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, tip_amount) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:08:51,800 INFO sqlalchemy.engine.Engine [generated in 0.01236s] (('2009-01-09 07:22:54', 1.236843413525995, -73.952651, 40.786395, -73.941996, 40.797785, 0.0), ('2009-01-12 08:02:00', 2.817144950475521, -73.95482699999998, 40.773432, -73.97973299999998, 40.757188, 0.0), ('2009-01-24 21:08:00', 2.9378351988374933, -73.94724999999998, 40.784133, -73.97179699999998, 40.749058, 0.0), ('2009-01-13 16:55:00', 0.8412075230136679, -73.980208, 40.767402, -73.97282699999998, 40.

2022-12-11 16:10:46,854 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:11:16,654 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 16:11:16,656 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:11:16,662 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:11:16,678 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, tip_amount) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:11:16,679 INFO sqlalchemy.engine.Engine [generated in 0.01280s] (('2009-06-01 07:58:41', 2.864239471085297, -73.981944, 40.777083, -73.956283, 40.783897, 0.0), ('2009-06-08 07:36:00', 18.4294013408253, -73.77678, 40.645062, -73.94114299999998, 40.71637, 8.82), ('2009-06-20 10:57:00', 1.9173700684865866, -73.974175, 40.731277, -73.99136699999998, 40.735082, 0.0), ('2009-06-09 19:21:09', 2.2090549777864332, -73.991539, 40.742389, -74.01032499999998, 40.719191, 1.98), ('2

2022-12-11 16:13:21,421 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:13:52,104 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("taxi_trips")
2022-12-11 16:13:52,106 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:13:52,109 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:13:52,125 INFO sqlalchemy.engine.Engine INSERT INTO taxi_trips (pickup_datetime, trip_distance, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, tip_amount) VALUES (?, ?, ?, ?, ?, ?, ?)
2022-12-11 16:13:52,126 INFO sqlalchemy.engine.Engine [generated in 0.01203s] (('2009-11-24 11:56:06', 0.8106623808414518, -73.98530599999998, 40.744728, -73.97850699999998, 40.754182, 0.0), ('2009-11-30 13:48:00', 3.4107558335406676, -74.003393, 40.727408, -73.97452199999998, 40.7646, 0.0), ('2009-11-10 12:29:59', 3.775674147992888, -73.97232099999998, 40.764384, -74.00557499999998, 40.740181, 0.0), ('2009-11-09 18:07:13', 0.4273835535900282, -73.9871, 40.743666, -73.986924, 40.757

ValueError: No objects to concatenate

In [33]:
add_weather_data_to_db()

  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)


2022-12-11 16:19:21,376 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("daily_weather")
2022-12-11 16:19:21,377 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:19:21,378 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:19:21,383 INFO sqlalchemy.engine.Engine INSERT INTO daily_weather (datetime, wind_speed, precipitation) VALUES (?, ?, ?)
2022-12-11 16:19:21,383 INFO sqlalchemy.engine.Engine [generated in 0.00336s] (('2009-01-01', 11.041666666666666, 0.0), ('2009-01-02', 6.59375, 0.0), ('2009-01-03', 9.875, 0.0), ('2009-01-04', 7.37037037037037, 0.0), ('2009-01-05', 6.925925925925926, 0.0), ('2009-01-06', 6.9, 0.12), ('2009-01-07', 9.58974358974359, 1.82), ('2009-01-08', 11.192307692307692, 0.0)  ... displaying 10 of 1267 total bound parameter sets ...  ('2015-06-29', 4.28, 0.0), ('2015-06-30', 4.2, 0.08))
2022-12-11 16:19:21,386 INFO sqlalchemy.engine.Engine COMMIT


  df = pd.read_csv(csv_file)


2022-12-11 16:19:21,477 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("hourly_weather")
2022-12-11 16:19:21,478 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-11 16:19:21,480 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-12-11 16:19:21,495 INFO sqlalchemy.engine.Engine INSERT INTO hourly_weather (datetime, wind_speed, precipitation) VALUES (?, ?, ?)
2022-12-11 16:19:21,496 INFO sqlalchemy.engine.Engine [generated in 0.01267s] (('2009-01-01 00', 18.0, 0.0), ('2009-01-01 01', 18.0, 0.0), ('2009-01-01 02', 18.0, 0.0), ('2009-01-01 03', 8.0, 0.0), ('2009-01-01 04', 11.0, 0.0), ('2009-01-01 05', 18.0, 0.0), ('2009-01-01 06', 14.0, 0.0), ('2009-01-01 07', 8.0, 0.0)  ... displaying 10 of 5000 total bound parameter sets ...  ('2010-01-30 02', 10.0, 0.0), ('2010-01-30 03', 11.0, 0.0))
2022-12-11 16:19:21,504 INFO sqlalchemy.engine.Engine COMMIT
2022-12-11 16:19:21,506 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("hourly_weather")
2022-12-11 16:19:21,507 INFO sqlalchemy.

In [28]:
add_uber_data_to_db()

FileNotFoundError: [Errno 2] No such file or directory: 'uber_rides_sample.csv'

## Part 3 : Understanding Data

In [26]:
#Question 1: 
def get_hour_popular_yellow_taxi():
    with engine.connect() as connection:
        query_sql = f"select strftime('%H',pickup_datetime) AS cur_hour,count(*) as num from taxi_trips GROUP BY cur_hour order by num desc;"
        # Write the query to the query file
        with open(f'{QUERY_DIRECTORY}/question1.sql', "w") as f:
            f.write(query_sql)
        # Execute the query
        result = connection.execute(query_sql)
        return [r[0] for r in result]
    
print(get_hour_popular_yellow_taxi())

2022-12-10 20:44:38,551 INFO sqlalchemy.engine.Engine select strftime('%H',pickup_datetime) AS cur_hour,count(*) as num from taxi_trips GROUP BY cur_hour order by num desc;
2022-12-10 20:44:38,553 INFO sqlalchemy.engine.Engine [raw sql] ()
['19', '18', '20', '21', '22', '14', '17', '12', '13', '23', '15', '09', '11', '08', '10', '16', '00', '07', '01', '02', '06', '03', '04', '05']


In [27]:
# Question 2:
def get_weekday_popular_uber():
    weekdays_dict = {
        '0': 'Sun',
        '1': 'Mon',
        '2': 'Tue',
        '3': 'Wed',
        '4': 'Thur',
        '5': 'Fri',
        '6': 'Sat'
    }
    with engine.connect() as connection:
        query_sql = f"select strftime('%w',pickup_datetime) AS cur_weekday,count(*) as num from uber_trips GROUP BY cur_weekday order by num desc;"
        # Write the query to the query file
        with open(f'{QUERY_DIRECTORY}/question2.sql', "w") as f:
            f.write(query_sql)
        # Execute the query
        result = connection.execute(query_sql)
        return [weekdays_dict[r[0]] for r in result]
    
print(get_weekday_popular_uber())

2022-12-10 20:44:41,105 INFO sqlalchemy.engine.Engine select strftime('%w',pickup_datetime) AS cur_weekday,count(*) as num from uber_trips GROUP BY cur_weekday order by num desc;
2022-12-10 20:44:41,106 INFO sqlalchemy.engine.Engine [raw sql] ()
['Fri', 'Sat', 'Thur', 'Wed', 'Tue', 'Sun', 'Mon']


In [30]:
# Question 3
def get_sum_trip_distence():
    with engine.connect() as connection:
        distence_query_sql = """
                            select sum(trip_distance)*0.95 as total_distance
                            from 
                            (select * from taxi_trips where strftime('%Y-%m', pickup_datetime) = '2013-07'
                             union all
                             select * from taxi_trips where strftime('%Y-%m', pickup_datetime) = '2013-07')

                            """
        # Write the query to the query file
        with open(f'{QUERY_DIRECTORY}/question3.sql', "w") as f:
            f.write(distence_query_sql)
         # Execute the query
        result = list(connection.execute(distence_query_sql))
        distence = 0
        for row in result:
            print(row)
    
get_sum_trip_distence()

2022-12-10 21:02:18,944 INFO sqlalchemy.engine.Engine 
                            select sum(trip_distance)*0.95 as total_distance
                            from 
                            (select * from taxi_trips where strftime('%Y-%m', pickup_datetime) = '2013-07'
                             union all
                             select * from taxi_trips where strftime('%Y-%m', pickup_datetime) = '2013-07')

                            
2022-12-10 21:02:18,946 INFO sqlalchemy.engine.Engine [raw sql] ()
(14069.518999999997,)


In [33]:
# Question4
def get_2019_trips_info():
    with engine.connect() as connection:
        q4 = """
        select strftime('%Y-%m-%d', pickup_datetime) as date, count(*) as trips_amount, sum(trip_distance) as distance
        from
        (select * from taxi_trips where strftime('%Y', pickup_datetime) = '2014'
        union all
        select * from uber_trips where strftime('%Y', pickup_datetime) = '2014')
        group by strftime('%m-%d', pickup_datetime)
        order by trips_amount desc
        limit 10
        """
        
        # create the query.sql file
        with open(f'{QUERY_DIRECTORY}/question4.sql', "w") as f:
            f.write(q4)
            
        # check the result
        result = list(connection.execute(q4))
        for row in result:
            print(row)
            
get_2019_trips_info()

2022-12-10 21:04:27,985 INFO sqlalchemy.engine.Engine 
        select strftime('%Y-%m-%d', pickup_datetime) as date, count(*) as trips_amount, sum(trip_distance) as distance
        from
        (select * from taxi_trips where strftime('%Y', pickup_datetime) = '2014'
        union all
        select * from uber_trips where strftime('%Y', pickup_datetime) = '2014')
        group by strftime('%m-%d', pickup_datetime)
        order by trips_amount desc
        limit 10
        
2022-12-10 21:04:27,987 INFO sqlalchemy.engine.Engine [raw sql] ()
('2014-04-12', 231, 734.2488284338865)
('2014-03-29', 228, 660.2575145232145)
('2014-02-01', 222, 678.4233201164744)
('2014-02-28', 221, 631.1204102700872)
('2014-02-07', 219, 652.8391061550848)
('2014-05-16', 218, 639.2120721481293)
('2014-11-01', 214, 666.2615178182924)
('2014-03-01', 213, 596.5033025222677)
('2014-12-11', 211, 622.6627065275238)
('2014-12-13', 207, 649.18667000848)


In [35]:
# Question 5
def get_windest_days():
    with engine.connect() as connection:
        q5 = """
            select strftime('%Y-%m-%d', pickup_datetime) as date, wind_speed, count(*) as trips
            from 
            (select * from taxi_trips where strftime('%Y', pickup_datetime) = '2014'
            union all
            select * from uber_trips where strftime('%Y', pickup_datetime) = '2014')
            left join daily_weather on strftime('%Y-%m-%d', pickup_datetime) = datetime
            group by strftime('%m-%d', pickup_datetime)
            order by wind_speed desc
            limit 10
            """
         # create the query file
        with open(f'{QUERY_DIRECTORY}/question5.sql', "w") as f:
                f.write(q5)
        # check the result
        result = list(connection.execute(q5))

        for row in result:
            print(row)
        
get_windest_days()

2022-12-10 21:04:55,514 INFO sqlalchemy.engine.Engine 
            select strftime('%Y-%m-%d', pickup_datetime) as date, wind_speed, count(*) as trips
            from 
            (select * from taxi_trips where strftime('%Y', pickup_datetime) = '2014'
            union all
            select * from uber_trips where strftime('%Y', pickup_datetime) = '2014')
            left join daily_weather on strftime('%Y-%m-%d', pickup_datetime) = datetime
            group by strftime('%m-%d', pickup_datetime)
            order by wind_speed desc
            limit 10
            
2022-12-10 21:04:55,515 INFO sqlalchemy.engine.Engine [raw sql] ()
('2014-03-13', 12.923076923076923, 182)
('2014-01-07', 12.56, 163)
('2014-01-02', 12.352941176470589, 133)
('2014-02-13', 11.781818181818181, 132)
('2014-03-29', 11.666666666666666, 228)
('2014-03-26', 10.52, 176)
('2014-01-03', 9.710526315789474, 105)
('2014-02-14', 9.421052631578947, 170)
('2014-03-30', 8.790697674418604, 148)
('2014-02-15', 8.575, 197)

In [41]:
# Question6
def get_hurricane_trips_info():
    with engine.connect() as connection:

        start_date = datetime.date(2012, 10, 22)
        date_dict = {}
        for i in range(16):
            for j in range(24):
                date_dict[f'{str(start_date)} {j:02d}'] = {
                    'trips_num': 0, 'precipitation': 0, 'wind_speed': 0
                }
                j += 1
            start_date = start_date + datetime.timedelta(days=1)
              
        query1 = """
            select strftime('%Y-%m-%d %H',pickup_datetime) as cur_hour,count(*) as
            trips_num FROM uber_trips where pickup_datetime>='2012-10-22' and pickup_datetime<'2012-11-07' GROUP BY cur_hour;
        """
        result = list(connection.execute(query1))
        for row in result:
            cur_hour = row[0]
            trips_num = row[1]
            temp_dict = date_dict[cur_hour]
            temp_dict['trips_num'] = temp_dict['trips_num'] + trips_num
            
        query2 = """
            select strftime('%Y-%m-%d %H',pickup_datetime) as cur_hour,count(*) as
            trips_num FROM taxi_trips where pickup_datetime>='2012-10-22' and pickup_datetime<'2012-11-07' GROUP BY cur_hour;
        """
        result = list(connection.execute(query2))
        for row in result:
            cur_hour = row[0]
            trips_num = row[1]
            temp_dict = date_dict[cur_hour]
            temp_dict['trips_num'] = temp_dict['trips_num'] + trips_num
            
            
        query3 = """
            select * from hourly_weather where datetime>='2012-10-22' AND datetime<'2012-10-23';
        """
        result = list(connection.execute(query3))
        for row in result:
            cur_hour = str(row[1]).replace(':', '')
            precipitation = row[2]
            wind_speed = row[3]
            temp_dict = date_dict[cur_hour]
            temp_dict['precipitation'] = precipitation
            temp_dict['wind_speed'] = wind_speed
        

        with open(f'{QUERY_DIRECTORY}/question6.sql', "w") as f:
            f.write(query1)
            f.write(query2)
            f.write(query3)
        
        return date_dict
    
get_hurricane_trips_info()

2022-12-10 21:18:09,965 INFO sqlalchemy.engine.Engine 
            select strftime('%Y-%m-%d %H',pickup_datetime) as cur_hour,count(*) as
            trips_num FROM uber_trips where pickup_datetime>='2012-10-22' and pickup_datetime<'2012-11-07' GROUP BY cur_hour;
        
2022-12-10 21:18:09,966 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-10 21:18:10,047 INFO sqlalchemy.engine.Engine 
            select strftime('%Y-%m-%d %H',pickup_datetime) as cur_hour,count(*) as
            trips_num FROM taxi_trips where pickup_datetime>='2012-10-22' and pickup_datetime<'2012-11-07' GROUP BY cur_hour;
        
2022-12-10 21:18:10,047 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-10 21:18:10,113 INFO sqlalchemy.engine.Engine 
            select * from hourly_weather where datetime>='2012-10-22' AND datetime<'2012-10-23';
        
2022-12-10 21:18:10,113 INFO sqlalchemy.engine.Engine [raw sql] ()


{'2012-10-22 00': {'trips_num': 3, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 01': {'trips_num': 0, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 02': {'trips_num': 3, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 03': {'trips_num': 1, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 04': {'trips_num': 0, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 05': {'trips_num': 1, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 06': {'trips_num': 6, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 07': {'trips_num': 5, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 08': {'trips_num': 8, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 09': {'trips_num': 10, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 10': {'trips_num': 8, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 11': {'trips_num': 14, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 12': {'trips_num': 12, 'precipitation': 0, 'wind_speed': 0},
 '2012-10-22 13': {'trips_num': 12, 'precipitati

## Extra Credit

In [42]:
SUNSET_SUNRISE_SCHEMA = """
    CREATE TABLE IF NOT EXISTS sunset_sunrise (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        sunrise_time DATETIME,
        sunset_time DATETIME
    );
"""

# create a new table
with open(DATABASE_SCHEMA_FILE, "a") as f:
    f.write(SUNSET_SUNRISE_SCHEMA)

with engine.connect() as connection:
    connection.execute(SUNSET_SUNRISE_SCHEMA)

    
# clean the data
def load_and_clean_weather_sun_data(csv_file):
    df = pd.read_csv(csv_file, low_memory=False)
    df = df.loc[:, ["DATE", "Sunrise",'Sunset']]
    df = df[df['Sunrise'].notnull()]
    
    row_list = []
    for index, row in df.iterrows():
        record_date = str(row['DATE'])[0:10]
        sunrise = f"{int(row['Sunrise']):04d}"
        sunrise_time = f'{record_date} {sunrise[:2]}:{sunrise[2:]}:00'
        row['sunrise_time'] = sunrise_time
        sunset = f"{int(row['Sunset']):04d}"
        sunset_time = f'{record_date} {sunset[:2]}:{sunset[2:]}:00'
        row['sunset_time'] = sunset_time
        row_list.append(row)
    
    new_df = pd.DataFrame(row_list)
    new_df.drop(columns=["DATE", "Sunrise",'Sunset'], axis=1, inplace=True)
    return new_df


with engine.connect() as connection:
    test_sql = f"select * from sunset_sunrise;"
    result = list(connection.execute(test_sql)) 
    for row in result:
        print(row)

2022-12-10 21:21:52,426 INFO sqlalchemy.engine.Engine 
    CREATE TABLE IF NOT EXISTS sunset_sunrise (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        sunrise_time DATETIME,
        sunset_time DATETIME
    );

2022-12-10 21:21:52,427 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-12-10 21:21:52,428 INFO sqlalchemy.engine.Engine COMMIT
2022-12-10 21:21:52,430 INFO sqlalchemy.engine.Engine select * from sunset_sunrise;
2022-12-10 21:21:52,430 INFO sqlalchemy.engine.Engine [raw sql] ()
(1, '2015-01-01 07:20:00', '2015-01-01 16:39:00')
(2, '2015-01-02 07:20:00', '2015-01-02 16:40:00')
(3, '2015-01-03 07:20:00', '2015-01-03 16:41:00')
(4, '2015-01-04 07:20:00', '2015-01-04 16:42:00')
(5, '2015-01-05 07:20:00', '2015-01-05 16:43:00')
(6, '2015-01-06 07:20:00', '2015-01-06 16:44:00')
(7, '2015-01-07 07:20:00', '2015-01-07 16:44:00')
(8, '2015-01-08 07:20:00', '2015-01-08 16:45:00')
(9, '2015-01-09 07:20:00', '2015-01-09 16:46:00')
(10, '2015-01-10 07:20:00', '2015-01-10 16:47:00')
(11

In [46]:
# Question we made: what is the total hired trips in 2015.01.31 after sunset
def get_sunset_trips_info():
     with engine.connect() as connection:
        total_trips = f"""
            select count(*) as total_trips from
            (select * from uber_trips where pickup_datetime<'2015-02-01' and
            pickup_datetime>=(select sunset_time from sunset_sunrise where sunset_time like '%2015-01-31%')
            union all
            select * from taxi_trips where pickup_datetime<'2015-02-01' and
            pickup_datetime>=(select sunset_time from sunset_sunrise where sunset_time like '%2015-01-31%'))
            """
        
        # write to qurey file
        with open(f'{QUERY_DIRECTORY}/extra_credit.sql', "w") as f:
            f.write(total_trips)
        
        result = list(connection.execute(total_trips))
        print(result)
        
get_sunset_trips_info()

2022-12-10 21:38:04,261 INFO sqlalchemy.engine.Engine 
            select count(*) as total_trips from
            (select * from uber_trips where pickup_datetime<'2015-02-01' and
            pickup_datetime>=(select sunset_time from sunset_sunrise where sunset_time like '%2015-01-31%')
            union all
            select * from taxi_trips where pickup_datetime<'2015-02-01' and
            pickup_datetime>=(select sunset_time from sunset_sunrise where sunset_time like '%2015-01-31%'))
            
2022-12-10 21:38:04,262 INFO sqlalchemy.engine.Engine [raw sql] ()
[(69,)]


## Part 4: Visualizing the Data

In [None]:
#p4 q1 
#Create an appropriate visualization for the first query/question in part 3
def popular_hour():
    q1 = """
    select strftime('%H',pickup_datetime) AS cur_hour,count(*) as num
    from taxi_trips GROUP BY cur_hour
    """
    with engine.connect() as connection:

        test1 = connection.execute(q1)
        df1 = DataFrame(test1.fetchall())
        df1.columns = test1.keys()
    plt.bar(df1["cur_hour"], df1["num"], width = 0.2)
    plt.xlabel("hours")
    plt.ylabel("count")
    plt.title("count in each hour")
    plt.show()
popular_hour()

In [None]:
#p4 q2
#Create a visualization that shows the average distance traveled per month
def get_avg_distance_per_month():
    
    query_sql = """select strftime('%m',pickup_datetime) AS per_month,sum(trip_distance) as trip_distance from taxi_trips GROUP BY per_month;"""
    
    with engine.connect() as connection:
        t4 = connection.execute(query_sql)
        df_4 = DataFrame(t4.fetchall())
        df_4.columns = t4.keys()
    mean = df_4.groupby(["per_month"], as_index=False)['trip_distance'].mean()
    k_lower =  df_4.groupby(["per_month"], as_index=False)['trip_distance'].quantile(0.05)
    k_upper = df_4.groupby(["per_month"], as_index=False)['trip_distance'].quantile(0.95) 
   
    plt.bar(df_4["per_month"],df_4["trip_distance"],yerr=[k_lower['trip_distance'].to_numpy(),k_upper['trip_distance'].to_numpy()], alpha=0.5,width = 0.2)
    plt.xlabel("per_month")
    plt.ylabel("trip distance")
    plt.title("the average distance traveled per month")
    plt.show()
get_avg_distance_per_month()

In [None]:
#p4 q3
#Define three lat/long coordinate boxes around the three major New York airports: LGA, JFK, and EWR 
#Create a visualization that compares what day of the week was most popular for drop offs for each airport.
def coor_box():
    x= pd.read_sql_table('taxi_trips', engine)
    y = pd.read_sql_table('uber_trips', engine)
    x['pickup_datetime'] = pd.to_datetime(x['pickup_datetime'])
    y['pickup_datetime'] = pd.to_datetime(y['pickup_datetime'])
    k = pd.concat([x, y])

    k["day_of_the_week"] = k['pickup_datetime'].apply(lambda x:x.weekday())
    LGA = (-73.8702, 40.7730)
    JFK = (-73.780968, 40.641766)
    EWR = (-74.184601, 40.695213)
    center_distance = 0.02

    def filter_lnglat(x, lnglat_center, center_distance):
        x1 = x[lnglat_center[0] - center_distance <= x["dropoff_longitude"]]
        x2 = x1[x1["dropoff_longitude"] <= lnglat_center[0] + center_distance]
        x3 = x2[lnglat_center[1] - center_distance <= x2["dropoff_latitude"]]
        x4 = x3[x3["dropoff_latitude"]<= lnglat_center[1] + center_distance]
        return x4

    LGA_df = filter_lnglat(k, LGA, center_distance)
    JFK_df = filter_lnglat(k, JFK, center_distance)
    EWR_df = filter_lnglat(k, EWR, center_distance)
    LGA_count_df = LGA_df.groupby(["day_of_the_week"], as_index=False).count()
    JFK_count_df = JFK_df.groupby(["day_of_the_week"], as_index=False).count()
    EWR_count_df = EWR_df.groupby(["day_of_the_week"], as_index=False).count()
    tick_label = (LGA_count_df["day_of_the_week"] + 1).to_numpy()
    def txt_xy(X, YI, i, bar_width):
        for x,y in zip(X,YI):
            plt.text(x +i*bar_width,y,'%.2f' %y, ha='center',va='bottom', rotation=30)

    x_range = np.arange(7)
    bar_width = 0.1
    plt.bar(x_range, LGA_count_df["pickup_datetime"].to_numpy(), width=bar_width, label="LGA")
    plt.bar(x_range + bar_width, JFK_count_df["pickup_datetime"].to_numpy(), width=bar_width, label="JFK")
    plt.bar(x_range + 2 * bar_width, EWR_count_df["pickup_datetime"].to_numpy(), width=bar_width, label="EWR")

    plt.xticks(x_range + bar_width/2, tick_label, rotation=30)
    plt.legend()
    plt.show()  

In [None]:
#P4 Q4
# Create a heatmap of all hired trips over a map of the area. Consider using KeplerGL 
def heatmap():
    q4 = """
        select pickup_latitude, pickup_longitude
        from (select pickup_latitude, pickup_longitude
            from taxi_trips union all select pickup_latitude, pickup_longitude
            from uber_trips
        ) 
    """
    with engine.connect() as connection:
        t2 = connection.execute(q4)
        df_2 = DataFrame(t2.fetchall())
        df_2.columns = t2.keys()
        

    df_kepler = df_2[["pickup_longitude", "pickup_latitude"]]
    df_kepler["Latitude"] = df_kepler["pickup_latitude"]
    df_kepler["Longitude"] = df_kepler["pickup_longitude"]
    df_kepler["City"] = "NY"
    df_kepler = df_kepler.drop(columns="pickup_latitude")
    df_kepler = df_kepler.drop(columns="pickup_longitude")


    map_2 = KeplerGl(height=400, data={"data_1": df_kepler})
    return map_2

In [None]:
#p4 q5
#Create a scatter plot that compares tip amount versus distance.
def tip_versus_distance():
    q5 = """
        select trip_distance, tip_amount from taxi_trips
    """

    with engine.connect() as connection:
        t5 = connection.execute(q5)
        df_5 = DataFrame(t5.fetchall())
        df_5.columns = t5.keys()
        
    x_filtered = df_5[df_5["tip_amount"] < df_5["tip_amount"].quantile(0.9999)]
    x_filtered2 = x_filtered[x_filtered["trip_distance"] < x_filtered["trip_distance"].quantile(0.99999)]
    plt.scatter(x_filtered2["trip_distance"], x_filtered2["tip_amount"],s = 1)
    plt.xlabel("trip distance")
    plt.ylabel("tip amount")
    plt.title("tip amount versus distance")
    plt.show()

In [None]:
#p4 q6
#Create another scatter plot that compares tip amount versus precipitation amount.
def tip_versus_Precipitation():
    hourly_ps= pd.read_sql_table('hourly_weather', engine)
    x= pd.read_sql_table('taxi_trips', engine)
    x["pickup_hour"] = x["pickup_datetime"].astype("str").apply(lambda x: x[:13])
    hourly_ps["datetime"] = hourly_ps["datetime"].astype("str").apply(lambda x:x[:13])
    x_joined = x.set_index('pickup_hour').join(hourly_ps.set_index('datetime'))
    x_joined_filtered = x_joined[x_joined["tip_amount"] < x_joined["tip_amount"].quantile(0.9999)]
    plt.scatter(x_joined_filtered["tip_amount"], x_joined_filtered["precipitation"],s = 1)
    plt.xlabel("tip amount")
    plt.ylabel("HourlyPrecipitation")
    plt.title("tip amount versus Hourly Precipitation")
    plt.show()
tip_versus_Precipitation()