# In [2]:
import csv
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# -------------------------------
# Number of records
# -------------------------------
num_records = 10000

# -------------------------------
# Helper functions
# -------------------------------
def random_datetime(start, end):
    """Return a random datetime in the closed interval [start, end].

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint``, so both endpoints are possible results and any
    fractional-second part of the span is truncated.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

def round2(x):
    """Round ``x`` to two decimal places with the built-in ``round``."""
    decimals = 2
    return round(x, decimals)

# -------------------------------
# Column generation
# -------------------------------
# Pickup times are drawn from [start_date, end_date]. Both bounds are midnight,
# so the window ends at the very start of 2019-12-31.
start_date = datetime(2018, 1, 1)
end_date = datetime(2019, 12, 31)

# Text layout shared by both timestamp columns, e.g. "2018 Mar 05 02:17:44 PM".
datetime_format = "%Y %b %d %I:%M:%S %p"

# Bounds (in seconds) for the synthetic trip duration: 1 minute to 3 hours.
min_trip_seconds = 60
max_trip_seconds = 3 * 60 * 60

# Draw the pickup first, then derive the drop-off from it. Sampling the two
# timestamps independently would put roughly half of the drop-offs *before*
# their pickup and make every implied trip duration meaningless.
pickup_times = [random_datetime(start_date, end_date) for _ in range(num_records)]
dropoff_times = [
    pickup + timedelta(seconds=random.randint(min_trip_seconds, max_trip_seconds))
    for pickup in pickup_times
]

data = {
    "VendorID": np.random.choice([1, 2], size=num_records),
    "tpep_pickup_datetime": [t.strftime(datetime_format) for t in pickup_times],
    "tpep_dropoff_datetime": [t.strftime(datetime_format) for t in dropoff_times],
    # np.random.randint excludes the upper bound: passenger counts are 1..5.
    "passenger_count": np.random.randint(1, 6, size=num_records),
    "trip_distance": np.round(np.random.uniform(0.5, 50.0, size=num_records), 2),
    "RatecodeID": np.random.choice([1, 2, 3, 4, 5, 6], size=num_records),
    "store_and_fwd_flag": np.random.choice(["Y", "N"], size=num_records),
    # Upper bound is exclusive here as well: location IDs are 1..264.
    "PULocationID": np.random.randint(1, 265, size=num_records),
    "DOLocationID": np.random.randint(1, 265, size=num_records),
    "payment_type": np.random.choice([1, 2, 3, 4], size=num_records),
    "fare_amount": np.round(np.random.uniform(3, 200, size=num_records), 2),
    "extra": np.round(np.random.uniform(0, 5, size=num_records), 2),
    # Scalars are broadcast to every row when the DataFrame is built.
    "mta_tax": 0.5,
    "tip_amount": np.round(np.random.uniform(0, 50, size=num_records), 2),
    "tolls_amount": np.round(np.random.uniform(0, 30, size=num_records), 2),
    "improvement_surcharge": 0.3,
    # Placeholder that fixes the column position; overwritten just below.
    "total_amount": 0,
}

df = pd.DataFrame(data)

# total_amount is the sum of every charge component, rounded to cents.
charge_columns = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
]
df["total_amount"] = df[charge_columns].sum(axis=1).round(2)

# -------------------------------
# Save to CSV
# -------------------------------
csv_file = "NYC_Taxi_Data.csv"
# csv.QUOTE_ALL (numeric value 1) wraps every field, numbers included, in
# double quotes; the named constant replaces the bare magic number.
df.to_csv(csv_file, index=False, quoting=csv.QUOTE_ALL)

# Report the number of rows actually written rather than the requested count.
print(f"✅ CSV file '{csv_file}' created with {len(df)} records")


# Output: ✅ CSV file 'NYC_Taxi_Data.csv' created with 10000 records
