In [29]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import glob
import numpy as np

collisions_df = pd.read_csv("../../input/collisions.csv")
geometry_df = gpd.read_file("../../input/taxi_zones/taxi_zones.shp")
collisions_df["crash_date"] = pd.to_datetime(collisions_df["crash_date"])
collisions_df = collisions_df[collisions_df['crash_date'].dt.year == 2024]


In [1]:
import glob
import pandas as pd

# Step 1: Find all 2024 parquet files
parquet_files = glob.glob("../../input/yellow_taxi_data/*2024*.parquet")

all_dfs = []

# Step 2: Load, convert datetime, and sample from each file
for file in parquet_files:
    df = pd.read_parquet(file)
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')
    
    sampled_df = df.sample(n=500_000, random_state=100_000)
    
    print(f"{file}: original={len(df):,}, sampled={len(sampled_df):,}")
    all_dfs.append(sampled_df)

# Step 3: Concatenate all sampled dataframes
taxi_df = pd.concat(all_dfs, ignore_index=True)

print(f"\n✅ Final combined dataframe: {len(taxi_df):,} rows")


../../input/yellow_taxi_data/yellow_tripdata_2024-04.parquet: original=3,514,289, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-05.parquet: original=3,723,833, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-07.parquet: original=3,076,903, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-06.parquet: original=3,539,193, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-03.parquet: original=3,582,628, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-02.parquet: original=3,007,526, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-12.parquet: original=3,668,371, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-09.parquet: original=3,633,030, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-10.parquet: original=3,833,771, sampled=500,000
../../input/yellow_taxi_data/yellow_tripdata_2024-01.parquet: original=2,964,624, sampled=500,000
../../input/yellow_t

In [31]:
taxi_df.drop(columns=['store_and_fwd_flag'], inplace=True)

taxi_df.info()
taxi_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1200000 non-null  int32         
 1   tpep_pickup_datetime   1200000 non-null  datetime64[us]
 2   tpep_dropoff_datetime  1200000 non-null  datetime64[us]
 3   passenger_count        1082251 non-null  float64       
 4   trip_distance          1200000 non-null  float64       
 5   RatecodeID             1082251 non-null  float64       
 6   PULocationID           1200000 non-null  int32         
 7   DOLocationID           1200000 non-null  int32         
 8   payment_type           1200000 non-null  int64         
 9   fare_amount            1200000 non-null  float64       
 10  extra                  1200000 non-null  float64       
 11  mta_tax                1200000 non-null  float64       
 12  tip_amount             12000

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2024-04-24 14:58:27,2024-04-24 15:28:12,1.0,9.9,1.0,148,138,1,43.6,7.5,0.5,10.5,0.0,1.0,63.1,2.5,0.0
1,2,2024-04-10 22:23:05,2024-04-10 22:43:14,,3.7,,143,113,0,-1.36,0.0,0.5,0.0,0.0,1.0,2.64,,
2,2,2024-04-03 15:14:00,2024-04-03 15:33:00,,2.61,,166,42,0,17.95,0.0,0.5,0.0,0.0,1.0,19.45,,
3,1,2024-04-20 10:01:51,2024-04-20 10:23:38,1.0,4.6,1.0,41,68,1,21.9,2.5,0.5,6.5,0.0,1.0,32.4,2.5,0.0
4,2,2024-04-18 16:17:08,2024-04-18 16:42:25,1.0,2.9,1.0,229,238,1,22.6,2.5,0.5,4.36,0.0,1.0,33.46,2.5,0.0


In [32]:
taxi_df.replace('NULL', np.nan, inplace=True)

In [33]:
collisions_df.replace('NULL', np.nan, inplace=True)
collisions_df.drop(columns=['location'], inplace=True)


In [34]:
cols = ['crash_date', 'crash_time', 'latitude', 'longitude',
       'on_street_name', 'number_of_persons_injured',
       'number_of_persons_killed', 'number_of_pedestrians_injured',
       'number_of_pedestrians_killed', 'number_of_cyclist_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_motorist_killed', 'contributing_factor_vehicle_1', 'collision_id','borough', 'zip_code',
       'cross_street_name', 'off_street_name']
collisions_df = collisions_df[cols]
collisions_df.info()
collisions_df.head()

# Replace 'Unspecified' in 'contributing_factor_vehicle_1' with NaN
collisions_df['contributing_factor_vehicle_1'].replace('Unspecified', np.nan, inplace=True)

# Drop rows where 'latitude' or 'longitude' are missing
collisions_df.dropna(subset=['latitude', 'longitude'], inplace=True)


<class 'pandas.core.frame.DataFrame'>
Index: 91264 entries, 200490 to 291753
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   crash_date                     91264 non-null  datetime64[ns]
 1   crash_time                     91264 non-null  object        
 2   latitude                       84262 non-null  float64       
 3   longitude                      84262 non-null  float64       
 4   on_street_name                 65077 non-null  object        
 5   number_of_persons_injured      91264 non-null  int64         
 6   number_of_persons_killed       91264 non-null  int64         
 7   number_of_pedestrians_injured  91264 non-null  int64         
 8   number_of_pedestrians_killed   91264 non-null  int64         
 9   number_of_cyclist_injured      91264 non-null  int64         
 10  number_of_cyclist_killed       91264 non-null  int64         
 11  number_of_moto

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collisions_df['contributing_factor_vehicle_1'].replace('Unspecified', np.nan, inplace=True)


In [35]:
df = collisions_df
for col in df.columns:
    non_null_count = df[col].notnull().sum()
    unique_count = df[col].nunique(dropna=True)
    is_numeric = pd.api.types.is_numeric_dtype(df[col])

    # Skip numeric columns
    if is_numeric:
        continue

    # Print only if there are duplicate values (i.e., normalization candidates)
    if unique_count < non_null_count:
        print(f"{col}: {non_null_count} non-null values, {unique_count} unique values")
        print(" → Column is not purely numeric.")
        print(f" → {col} has duplicate values and could be normalized.")
        print("-" * 50)


crash_date: 84262 non-null values, 321 unique values
 → Column is not purely numeric.
 → crash_date has duplicate values and could be normalized.
--------------------------------------------------
crash_time: 84262 non-null values, 1440 unique values
 → Column is not purely numeric.
 → crash_time has duplicate values and could be normalized.
--------------------------------------------------
on_street_name: 58601 non-null values, 5179 unique values
 → Column is not purely numeric.
 → on_street_name has duplicate values and could be normalized.
--------------------------------------------------
contributing_factor_vehicle_1: 62351 non-null values, 54 unique values
 → Column is not purely numeric.
 → contributing_factor_vehicle_1 has duplicate values and could be normalized.
--------------------------------------------------
borough: 64358 non-null values, 5 unique values
 → Column is not purely numeric.
 → borough has duplicate values and could be normalized.
---------------------------

In [36]:
import pandas as pd

# 1. Create the lookup tables with IDs
def create_lookup_table(df, column_name, new_column_name):
    lookup_df = df[[column_name]].dropna().drop_duplicates().reset_index(drop=True)
    lookup_df.insert(0, f"{new_column_name}_id", range(1, len(lookup_df) + 1))
    return lookup_df

contributing_factor_lut = create_lookup_table(collisions_df, 'contributing_factor_vehicle_1', 'contributing_factor')
borough_lut = create_lookup_table(collisions_df, 'borough', 'borough')
cross_street_lut = create_lookup_table(collisions_df, 'cross_street_name', 'cross_street')
off_street_lut = create_lookup_table(collisions_df, 'off_street_name', 'off_street')

# 2. Map the original columns to their IDs
collisions_df = collisions_df.merge(contributing_factor_lut, how='left', left_on='contributing_factor_vehicle_1', right_on='contributing_factor_vehicle_1')
collisions_df = collisions_df.merge(borough_lut, how='left', left_on='borough', right_on='borough')
collisions_df = collisions_df.merge(cross_street_lut, how='left', left_on='cross_street_name', right_on='cross_street_name')
collisions_df = collisions_df.merge(off_street_lut, how='left', left_on='off_street_name', right_on='off_street_name')

# 3. Optional: Drop original text columns and rename *_id columns
collisions_df = collisions_df.drop(columns=[
    'contributing_factor_vehicle_1',
    'borough',
    'cross_street_name',
    'off_street_name'
])

collisions_df = collisions_df.rename(columns={
    'contributing_factor_id': 'contributing_factor_vehicle_1_id',
    'borough_id': 'borough_id',
    'cross_street_id': 'cross_street_name_id',
    'off_street_id': 'off_street_name_id'
})


In [37]:
contributing_factor_lut
borough_lut
cross_street_lut
off_street_lut
collisions_df
taxi_df
geometry_df


Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.433470,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.77 256767.698, 1026495.593 2..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.31 144283.336, 936046.565 144..."
...,...,...,...,...,...,...,...
258,259,0.126750,0.000395,Woodlawn/Wakefield,259,Bronx,"POLYGON ((1025414.782 270986.139, 1025138.624 ..."
259,260,0.133514,0.000422,Woodside,260,Queens,"POLYGON ((1011466.966 216463.005, 1011545.889 ..."
260,261,0.027120,0.000034,World Trade Center,261,Manhattan,"POLYGON ((980555.204 196138.486, 980570.792 19..."
261,262,0.049064,0.000122,Yorkville East,262,Manhattan,"MULTIPOLYGON (((999804.795 224498.527, 999824...."


In [2]:
import psycopg2
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv(override=True)

RDS_HOST = os.getenv("DB_HOST")
RDS_PORT = os.getenv("DB_PORT")
RDS_USER = os.getenv("DB_USER")
RDS_PASSWORD = os.getenv("DB_PASSWORD")
RDS_DB = os.getenv("DB_NAME")

try:
    # Establish PostgreSQL connection
    conn = psycopg2.connect(
        host=RDS_HOST,
        user=RDS_USER,
        password=RDS_PASSWORD,
        dbname=RDS_DB,
        port=RDS_PORT,
        sslmode="require"  
    )
    print("✅ Connected to PostgreSQL RDS instance successfully!")

    # Create a cursor object
    cursor = conn.cursor()

    # Execute a test query
    cursor.execute("SELECT version();")
    version = cursor.fetchone()
    print("Database version:", version)

    # Close connection
    # cursor.close()
    # conn.close()


except Exception as e:
    print("Error connecting to RDS:", e)



✅ Connected to PostgreSQL RDS instance successfully!
Database version: ('PostgreSQL 17.2 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 12.4.0, 64-bit',)


In [3]:
def create_table_from_df(df, table_name, cursor):
    # Generate column definitions
    columns = []
    for col_name, dtype in zip(df.columns, df.dtypes):
        if 'int' in str(dtype):
            col_type = 'INTEGER'
        elif 'float' in str(dtype):
            col_type = 'NUMERIC'
        elif 'datetime' in str(dtype):
            col_type = 'TIMESTAMP'
        elif 'bool' in str(dtype):
            col_type = 'BOOLEAN'
        else:
            col_type = 'TEXT'
        
        columns.append(f'"{col_name}" {col_type}')
    
    # Create table
    create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(columns)})"
    cursor.execute(create_table_query)
    print(f"Created table {table_name}")

In [4]:
def copy_df_to_postgres(df, table_name, cursor):
    from io import StringIO
    import csv  # Import csv for quoting options

    # Create a buffer
    buffer = StringIO()

    # Write the DataFrame to the buffer
    df.to_csv(buffer, index=False, header=False, na_rep='', quoting=csv.QUOTE_MINIMAL)
    buffer.seek(0)

    try:
        # Use COPY command for fast data loading
        column_list = ','.join([f'"{col}"' for col in df.columns])
        cursor.copy_expert(
            f"COPY {table_name} ({column_list}) FROM STDIN WITH CSV NULL ''",
            buffer
        )
        print(f"Uploaded {len(df)} rows to {table_name}")
    except Exception as e:
        print(f"Error uploading to {table_name}: {e}")

In [None]:
# List of dataframes to upload
dataframes = {
    'geometry_df': geometry_df,
    'contributing_factor_lut': contributing_factor_lut,
    'borough_lut': borough_lut,
    'cross_street_lut': cross_street_lut,
    'off_street_lut': off_street_lut,
    'collisions_df': collisions_df,
    'taxi_df': taxi_df
}

# Process each dataframe
for table_name, df in dataframes.items():
    print(f"🔄 Processing {table_name} with {len(df)} rows...")
    
    # Create the table
    create_table_from_df(df, table_name, cursor)
    
    # Process in chunks to handle large dataframes
    chunk_size = 50000
    total_chunks = (len(df) + chunk_size - 1) // chunk_size  # Ceiling division
    
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        chunk_num = i // chunk_size + 1
        print(f"Uploading chunk {chunk_num}/{total_chunks} ({i} to {min(i+chunk_size, len(df))} rows)")
        copy_df_to_postgres(chunk, table_name, cursor)
        # Commit after each chunk to avoid long transactions
        conn.commit()

print("All data uploaded successfully!")

# Close connections
cursor.close()
conn.close()
print("Connection closed.")

🔄 Processing taxi_df with 1200000 rows...
Created table taxi_df
Uploading chunk 1/24 (0 to 50000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 2/24 (50000 to 100000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 3/24 (100000 to 150000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 4/24 (150000 to 200000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 5/24 (200000 to 250000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 6/24 (250000 to 300000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 7/24 (300000 to 350000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 8/24 (350000 to 400000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 9/24 (400000 to 450000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 10/24 (450000 to 500000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 11/24 (500000 to 550000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 12/24 (550000 to 600000 rows)
Uploaded 50000 rows to taxi_df
Uploading chunk 13/24 (60000

In [42]:
# # Drop all tables in the public schema
# cursor.execute("""
#     DO $$ 
#     BEGIN
#         EXECUTE (
#             SELECT string_agg('DROP TABLE IF EXISTS "' || table_name || '" CASCADE;', ' ')
#             FROM information_schema.tables
#             WHERE table_schema = 'public'
#         );
#     END $$;
# """)
# conn.commit()

# print("All tables dropped successfully!")

In [None]:
cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'public'
""")
tables = cursor.fetchall()

# Print the table names
print("Tables in the database:")
for table in tables:
    print(table[0])

cursor.execute("""
    SELECT COUNT(*) AS row_count
    FROM taxi_df
""")
row_count = cursor.fetchone()

# Print the number of rows
print(f"Number of rows in taxi_df: {row_count[0]}")

In [None]:
cursor.execute("SELECT COUNT(*) FROM taxi_df")
uploaded_rows = cursor.fetchone()[0]
print(f"Rows already uploaded: {uploaded_rows}")

Rows already uploaded: 1200000


In [None]:
# start_row = uploaded_rows  # Start from the last uploaded row
# chunk_size = 50000  # Adjust chunk size if needed

# for i in range(start_row, len(taxi_df), chunk_size):
#     chunk = taxi_df.iloc[i:i+chunk_size]
#     chunk_num = i // chunk_size + 1
#     print(f"Uploading chunk {chunk_num} ({i} to {min(i+chunk_size, len(taxi_df))} rows)")
#     copy_df_to_postgres(chunk, 'taxi_df', cursor)
#     conn.commit()  # Commit after each chunk

In [None]:
# List of SQL commands to execute
sql_commands = [
    """
    CREATE TABLE borough_lut_temp AS
    SELECT DISTINCT * FROM borough_lut;

    DROP TABLE borough_lut;

    ALTER TABLE borough_lut_temp RENAME TO borough_lut;
    """,
    """
    CREATE TABLE contributing_factor_lut_temp AS
    SELECT DISTINCT * FROM contributing_factor_lut;

    DROP TABLE contributing_factor_lut;

    ALTER TABLE contributing_factor_lut_temp RENAME TO contributing_factor_lut;
    """,
    """
    CREATE TABLE cross_street_lut_temp AS
    SELECT DISTINCT * FROM cross_street_lut;

    DROP TABLE cross_street_lut;

    ALTER TABLE cross_street_lut_temp RENAME TO cross_street_lut;
    """,
    """
    CREATE TABLE off_street_lut_temp AS
    SELECT DISTINCT * FROM off_street_lut;

    DROP TABLE off_street_lut;

    ALTER TABLE off_street_lut_temp RENAME TO off_street_lut;
    """
]

# Execute each SQL command
for command in sql_commands:
    try:
        cursor.execute(command)
        conn.commit()
        print("Command executed successfully.")
    except Exception as e:
        print(f"Error executing command: {e}")
        conn.rollback()



Command executed successfully.
Command executed successfully.
Command executed successfully.
Command executed successfully.


In [None]:
# List of SQL commands to remove duplicates for each table
sql_remove_duplicates_commands = [
    """
    DELETE FROM collisions_df
    WHERE ctid NOT IN (
        SELECT MIN(ctid)
        FROM collisions_df
        GROUP BY collision_id
    );
    """,
    """
    DELETE FROM nyc_geometry_df
    WHERE ctid NOT IN (
        SELECT MIN(ctid)
        FROM nyc_geometry_df
        GROUP BY "LocationID"
    );
    """
]

# Execute each SQL command
for command in sql_remove_duplicates_commands:
    try:
        cursor.execute(command)
        conn.commit()
        print("Duplicate rows removed successfully.")
    except Exception as e:
        print(f"Error removing duplicates: {e}")
        conn.rollback()

# Add primary keys to the tables
sql_add_primary_keys = [
    "ALTER TABLE borough_lut ADD PRIMARY KEY (borough_id);",
    "ALTER TABLE contributing_factor_lut ADD PRIMARY KEY (contributing_factor_id);",
    "ALTER TABLE cross_street_lut ADD PRIMARY KEY (cross_street_id);",
    "ALTER TABLE off_street_lut ADD PRIMARY KEY (off_street_id);",
    "ALTER TABLE collisions_df ADD PRIMARY KEY (collision_id);",
    "ALTER TABLE nyc_geometry_df ADD PRIMARY KEY (\"LocationID\");",
    "ALTER TABLE taxi_df ADD COLUMN trip_id SERIAL PRIMARY KEY;"
]

# Execute each SQL command to add primary keys
for command in sql_add_primary_keys:
    try:
        cursor.execute(command)
        conn.commit()
        print("Primary key added successfully.")
    except Exception as e:
        print(f"Error adding primary key: {e}")
        conn.rollback()

Duplicate rows removed successfully.
Duplicate rows removed successfully.
Error adding primary key: multiple primary keys for table "collisions_df" are not allowed



In [None]:
# List of SQL commands to alter column types
sql_alter_columns = [
    """
    ALTER TABLE collisions_df
    ALTER COLUMN borough_id TYPE integer USING borough_id::integer;
    """,
    """
    ALTER TABLE collisions_df
    ALTER COLUMN contributing_factor_vehicle_1_id TYPE integer USING contributing_factor_vehicle_1_id::integer;
    """,
    """
    ALTER TABLE collisions_df
    ALTER COLUMN cross_street_name_id TYPE integer USING cross_street_name_id::integer;
    """,
    """
    ALTER TABLE collisions_df
    ALTER COLUMN off_street_name_id TYPE integer USING off_street_name_id::integer;
    """
]

# Execute each SQL command to alter column types
for command in sql_alter_columns:
    try:
        cursor.execute(command)
        conn.commit()
        print("Column type altered successfully.")
    except Exception as e:
        print(f"Error altering column type: {e}")
        conn.rollback()

In [None]:
# List of SQL commands to rename tables
sql_rename_tables = [
    "ALTER TABLE collisions_df RENAME TO collision;",
    "ALTER TABLE taxi_df RENAME TO taxi;",
    "ALTER TABLE nyc_geometry_df RENAME TO nyc_geometry;"
]

# Execute each SQL command to rename tables
for command in sql_rename_tables:
    try:
        cursor.execute(command)
        conn.commit()
        print("Table renamed successfully.")
    except Exception as e:
        print(f"Error renaming table: {e}")
        conn.rollback()

In [None]:
# Add foreign key constraints
sql_add_foreign_keys = [
    """
    ALTER TABLE taxi_df
    ADD CONSTRAINT fk_taxi_pickup_location
    FOREIGN KEY (PULocationID)
    REFERENCES nyc_geometry_df (LocationID)
    NOT VALID;
    """,
    """
    ALTER TABLE taxi_df
    ADD CONSTRAINT fk_taxi_dropoff_location
    FOREIGN KEY (DOLocationID)
    REFERENCES nyc_geometry_df (LocationID)
    NOT VALID;
    """,
    """
    ALTER TABLE collisions_df
    ADD CONSTRAINT fk_collision_borough
    FOREIGN KEY (borough_id)
    REFERENCES borough_lut (borough_id);
    """,
    """
    ALTER TABLE collisions_df
    ADD CONSTRAINT fk_collision_contributing_factor
    FOREIGN KEY (contributing_factor_id)
    REFERENCES contributing_factor_lut (contributing_factor_id);
    """,
    """
    ALTER TABLE collisions_df
    ADD CONSTRAINT fk_collision_cross_street
    FOREIGN KEY (cross_street_id)
    REFERENCES cross_street_lut (cross_street_id);
    """,
    """
    ALTER TABLE collisions_df
    ADD CONSTRAINT fk_collision_off_street
    FOREIGN KEY (off_street_id)
    REFERENCES off_street_lut (off_street_id);
    """
]

# Execute each SQL command to add foreign keys
for command in sql_add_foreign_keys:
    try:
        cursor.execute(command)
        conn.commit()
        print("Foreign key added successfully.")
    except Exception as e:
        print(f"Error adding foreign key: {e}")
        conn.rollback()

In [None]:
alter_commands = [
    "ALTER TABLE nyc_geometry RENAME COLUMN \"LocationID\" TO location_id;",
    "ALTER TABLE nyc_geometry RENAME COLUMN \"Shape_Leng\" TO shape_leng;",
    "ALTER TABLE nyc_geometry RENAME COLUMN \"Shape_Area\" TO shape_area;",
    "ALTER TABLE nyc_geometry RENAME COLUMN \"OBJECTID\" TO object_id;",
    "ALTER TABLE taxi RENAME COLUMN \"VendorID\" TO vendor_id;",
    "ALTER TABLE taxi RENAME COLUMN \"RatecodeID\" TO ratecode_id;",
    "ALTER TABLE taxi RENAME COLUMN \"PULocationID\" TO pu_location_id;",
    "ALTER TABLE taxi RENAME COLUMN \"DOLocationID\" TO do_location_id;",
    "ALTER TABLE taxi RENAME COLUMN \"Airport_fee\" TO airport_fee;",
    """UPDATE borough_lut
    SET borough = INITCAP(borough);"""
]

# Execute each command
for cmd in alter_commands:
    try:
        print(f"Executing: {cmd}")
        cursor.execute(cmd)
    except Exception as e:
        print(f"Failed to execute: {cmd}\nError: {e}")

-- Renaming nyc_geometry.LocationID → locationid
-- Renaming nyc_geometry.Shape_Leng → shape_leng
-- Renaming nyc_geometry.Shape_Area → shape_area
-- Renaming nyc_geometry.OBJECTID → objectid
-- Renaming taxi.VendorID → vendorid
-- Renaming taxi.RatecodeID → ratecodeid
-- Renaming taxi.PULocationID → pulocationid
-- Renaming taxi.DOLocationID → dolocationid
-- Renaming taxi.Airport_fee → airport_fee


In [48]:
# cursor.execute("""
#     SELECT column_name, data_type, is_nullable
#     FROM information_schema.columns
#     WHERE table_name = 'geometry'
#     ORDER BY ordinal_position
# """)
# schema = cursor.fetchall()

# print("Geometry Table Schema:")
# for column in schema:
#     print(f"{column[0]} — {column[1]} — {'NULLABLE' if column[2] == 'YES' else 'NOT NULL'}")
