In [1]:
# Imports and constants
import time
import traceback
import datetime
import pandas as pd
import numpy as np
from sqlalchemy import MetaData
from sqlalchemy import create_engine, select, insert, event
from sqlalchemy.sql import text
from sqlalchemy.engine import URL
import matplotlib.pyplot as plt

# One million. NOTE(review): not referenced by the cells shown here;
# presumably a batch size for chunked reads/writes — confirm.
CHUNK_SIZE = 1_000_000

# Taxi types; the selected type is the prefix of the trips table name
# built by TaxiDBReader.getTableName().
GREEN, YELLOW = 'green', 'yellow'

# Year bounds. MAX_YEAR is the default year of TaxiDBReader;
# MIN_YEAR is not referenced by the cells shown here.
MIN_YEAR, MAX_YEAR = 2020, 2023

# Column names. NOTE(review): not referenced by the cells shown here;
# presumably mirrors the trips table schema — confirm.
COLS = [
    'id',
    'pickup_datetime',
    'dropoff_datetime',
    'ratecode_id',
    'pu_location_id',
    'do_location_id',
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'total_amount',
    'congestion_surcharge',
    'improvement_surcharge',
    'extra',
    'payment_type',
]

# Functions
# def getODBCString():
#     SERVER = 'tcp:nyc-taxi-2024.database.windows.net,1433'
#     DATABASE = 'nyc_taxi_2024'
#     USERNAME = 'ishmakwana'
#     PASSWORD = 'xxx'

#     con_str = f'DRIVER={{ODBC Driver 18 for SQL Server}};SERVER={SERVER};DATABASE={DATABASE};UID={USERNAME};PWD={PASSWORD};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
#     return URL.create("mssql+pyodbc", query={"odbc_connect": con_str})

def getSQLiteString():
    """Return the SQLAlchemy connection URL for the local SQLite taxi database.

    Returns:
        str: the URL string passed to ``create_engine``. The three-slash
        form points at a path relative to the current working directory.
    """
    sqlite_url = 'sqlite:///db/taxi_db.db'
    return sqlite_url

class TaxiDBReader:
    """Holds the database engine and the currently selected trips table.

    Attributes:
        md: ``MetaData`` object that ``__init__`` fills via ``reflect``.
        engn: engine created from ``getSQLiteString()``.
        year: year part of the selected table name (defaults to ``MAX_YEAR``).
        taxi_type: taxi-type part of the selected table name
            (defaults to ``YELLOW``).
    """

    def __init__(self):
        """Create the engine, reflect the schema and select the default table."""
        self.engn = create_engine(getSQLiteString())
        self.md = MetaData()
        self.md.reflect(self.engn)

        # Same defaults as setTable(): MAX_YEAR / YELLOW.
        self.setTable()
        print('sql engine ready')

        # Open one connection and roll it back straight away.
        # NOTE(review): presumably a connectivity smoke test — confirm intent.
        # NOTE(review): Connection.rollback() is a 2.0-style API — confirm the
        # SQLAlchemy version this notebook is pinned to.
        with self.engn.connect() as probe:
            probe.rollback()

    def setTable(self, year=MAX_YEAR, taxi_type=YELLOW):
        """Select which trips table later getTableName() calls refer to.

        Args:
            year: year component of the table name.
            taxi_type: taxi-type component of the table name.
        """
        self.year, self.taxi_type = year, taxi_type

    def getTableName(self):
        """Return the table name built from the current taxi type and year."""
        selected = f'{self.taxi_type}_taxi_trips{self.year}'
        return selected
    

# Notebook-wide reader instance: the query cells below go through dr.engn,
# dr.setTable() and dr.getTableName(). Constructing it prints 'sql engine ready'.
dr = TaxiDBReader()
        

sql engine ready


In [2]:
# Select the 2023 yellow-taxi table; table_name is read by later query cells.
dr.setTable(taxi_type=YELLOW, year=2023)
table_name = dr.getTableName()

In [None]:
# Average passenger count per pickup location id, counting only trips whose
# passenger_count is greater than zero.
# The table is (re)selected here, so this cell sets table_name itself.
dr.setTable(taxi_type=YELLOW, year=2023)
table_name = dr.getTableName()

# Only the table identifier is interpolated; it comes from getTableName().
query = text(f'select pu_location_id, avg(passenger_count) as avg_passenger_count from {table_name} where passenger_count > 0 group by pu_location_id')

with dr.engn.connect() as connection:
    df = pd.read_sql(query, connection)

print(df)



In [None]:
# number of trips by pickup location where no passengers were recorded
# with dr.engn.connect() as conn:
    
#     sql = text(f'select pu_location_id, count(passenger_count) as count_passenger_count from {table_name} where passenger_count = 0 group by pu_location_id')
#     df = pd.read_sql(sql, conn)

#     print(df)



In [None]:
# Passenger-count extremes, followed by the number of zero-passenger trips.
# Uses table_name assigned by an earlier cell.

# Maximum and minimum passenger_count over the whole table.
range_query = text(f'select max(passenger_count) as max_passenger_count, min(passenger_count) as min_passenger_count from {table_name}')

# Total trips where passenger_count is 0.
zero_query = text(f'select count(*) trips_no_passenger from {table_name} where passenger_count = 0')

with dr.engn.connect() as connection:
    # Run in the original order; df ends up holding the zero-passenger count.
    for query in (range_query, zero_query):
        df = pd.read_sql(query, connection)
        print(df)



In [20]:
# Trips in January 2023 where passenger_count is 0: trip_distance and total_amount.
# Uses table_name assigned by an earlier cell; the hard-coded year below must
# match the year of that table for the filter to return rows.
date = datetime.datetime(year=2023, month=1, day=1)

# strftime pattern applied to pickup_datetime on the database side, and the
# value it is compared against (e.g. '2023 01').
yearMonthFmt = '%Y %m'
yearMonthVal = f'{date.year:04d} {date.month:02d}'
print(f'yearMonthVal: {yearMonthVal}')

# Values are passed as bound parameters instead of being spliced into the SQL
# with hand-escaped quotes. The table identifier cannot be bound, so it is
# still interpolated; it comes from dr.getTableName().
sql = text(
    f'select trip_distance, total_amount from {table_name} '
    'where passenger_count = 0 '
    'and strftime(:year_month_fmt, pickup_datetime) = :year_month_val'
).bindparams(year_month_fmt=yearMonthFmt, year_month_val=yearMonthVal)

with dr.engn.connect() as conn:
    df = pd.read_sql(sql, conn)

print(df)



yearMonthVal: 2023 01
        trip_distance  total_amount
0                1.90         20.85
1                1.30         16.30
2                1.00         13.80
3                2.30         19.65
4                2.10         17.80
...               ...           ...
122902           3.05         23.76
122903           5.80         29.07
122904           4.67         26.93
122905           3.15         26.58
122906           2.85         21.97

[122907 rows x 2 columns]


In [3]:
# Number of trips where passenger_count, trip_distance and total_amount are all 0.
# Uses table_name assigned by an earlier cell.
query = text(f'select count(*) from {table_name} where passenger_count=0 and trip_distance=0 and total_amount=0')

with dr.engn.connect() as connection:
    df = pd.read_sql(query, connection)

print(df)



yearMonthVal: 2023 01
   count(*)
0       291
