In [1]:
# 📦 DuckDB Sampling from Large Parquet File
import duckdb
import pandas as pd

# === User Inputs ===
parquet_path = 'rideshare_data.parquet'     # Parquet file to draw the sample from
sample_size = 50000                         # number of rows requested from DuckDB
output_csv = 'actual_ridedata_sample.csv'   # where the sampled rows are written
sample_seed = 42                            # fixed seed so a re-run draws the same rows

# === Query & Load ===
print(f"📥 Sampling {sample_size} rows from Parquet (DuckDB)...")

# The path is interpolated into a SQL string literal, so double any single
# quote to keep a path such as "driver's_data.parquet" from breaking the query.
escaped_path = parquet_path.replace("'", "''")

# A fixed row count already uses reservoir sampling in DuckDB; naming the method
# explicitly lets us attach REPEATABLE (seed) for a reproducible sample.
# int() ensures only plain integers are ever interpolated into the SQL text.
# NOTE(review): DuckDB may only guarantee identical samples across runs when
# executing single-threaded — confirm against the sampling docs if exact
# row-for-row reproducibility matters.
query = f"""
    SELECT * FROM read_parquet('{escaped_path}')
    USING SAMPLE reservoir({int(sample_size)} ROWS) REPEATABLE ({int(sample_seed)})
"""
df_sample = duckdb.query(query).to_df()

# === Save to CSV ===
df_sample.to_csv(output_csv, index=False)
print(f"✅ Sample saved to: {output_csv}")
print(f"📊 Sampled Rows: {len(df_sample)}")

# Surface a short sample instead of letting it pass silently.
if len(df_sample) < sample_size:
    print(f"⚠️ Requested {sample_size} rows but only {len(df_sample)} were returned")


📥 Sampling 50000 rows from Parquet (DuckDB)...
✅ Sample saved to: actual_ridedata_sample.csv
📊 Sampled Rows: 50000


In [5]:
# Per-column tally of non-missing values; a column whose tally is below the
# row total contains missing entries.
# NOTE(review): the recorded output below shows 10000 per column, while the
# sampling cell reported 50000 rows — presumably this ran against a different
# df_sample. Re-run with Restart Kernel -> Run All to confirm.
df_sample.notna().sum()

business               10000
pickup_location        10000
dropoff_location       10000
trip_length            10000
request_to_dropoff     10000
request_to_pickup      10000
total_ride_time        10000
on_scene_to_pickup      7278
on_scene_to_dropoff     7278
time_of_day            10000
date                   10000
hour_of_day            10000
week_of_year           10000
month_of_year          10000
passenger_fare         10000
driver_total_pay       10000
rideshare_profit       10000
hourly_rate             7278
dollars_per_mile       10000
dtype: int64

In [2]:
import pandas as pd

# === Load CSV Files ===
csv1 = 'actual_ridedata_sample.csv'
csv2 = 'sample_ride_hailing_dataset.csv'

df1 = pd.read_csv(csv1)
df2 = pd.read_csv(csv2)

# === Compare Column Names ===
# Exact, order-sensitive comparison of the header rows.
cols_match = list(df1.columns) == list(df2.columns)
print(f"🧾 Column names match: {cols_match}")
if not cols_match:
    print("📌 Columns in file1:", list(df1.columns))
    print("📌 Columns in file2:", list(df2.columns))

# === Compare Data Types ===
# Compare dtypes by column position. Series.equals() also compares the index
# labels (here: the column names), so it reports False whenever the headers
# are spelled differently, even if every column's dtype agrees.
same_width = df1.shape[1] == df2.shape[1]
dtypes_match = same_width and df1.dtypes.tolist() == df2.dtypes.tolist()
print(f"🔍 Data types match: {dtypes_match}")

if not dtypes_match:
    print("\n📋 Data types in file1:")
    print(df1.dtypes)
    print("\n📋 Data types in file2:")
    print(df2.dtypes)


🧾 Column names match: False
📌 Columns in file1: ['business', 'pickup_location', 'dropoff_location', 'trip_length', 'request_to_dropoff', 'request_to_pickup', 'total_ride_time', 'on_scene_to_pickup', 'on_scene_to_dropoff', 'time_of_day', 'date', 'hour_of_day', 'week_of_year', 'month_of_year', 'passenger_fare', 'driver_total_pay', 'rideshare_profit', 'hourly_rate', 'dollars_per_mile']
📌 Columns in file2: ['Business', 'Pickup Location', 'Dropoff Location', 'Trip Length', 'Request to Dropoff', 'Request to Pickup', 'Total Ride Time', 'On Scene to Pickup', 'On Scene to Dropoff', 'Time of Day', 'Date', 'Hour of Day', 'Week of Year', 'Month of Year', 'Passenger Fare', 'Driver Total Pay', 'Rideshare Profit', 'Hourly Rate', 'Dollars per Mile']
🔍 Data types match: False

📋 Data types in file1:
business                object
pickup_location          int64
dropoff_location         int64
trip_length            float64
request_to_dropoff     float64
request_to_pickup      float64
total_ride_time     

In [3]:
import pandas as pd

# === Load the first CSV ===
file1 = 'actual_ridedata_sample.csv'
renamed_file = 'renamed_actual_ridedata_sample.csv'  # output path, reused in the message below
df1 = pd.read_csv(file1)

# === Define the new column names (copied from file2) ===
# Applied by position, so the order here must mirror the column order of file1.
new_column_names = [
    'Business', 'Pickup Location', 'Dropoff Location', 'Trip Length', 'Request to Dropoff',
    'Request to Pickup', 'Total Ride Time', 'On Scene to Pickup', 'On Scene to Dropoff',
    'Time of Day', 'Date', 'Hour of Day', 'Week of Year', 'Month of Year',
    'Passenger Fare', 'Driver Total Pay', 'Rideshare Profit', 'Hourly Rate', 'Dollars per Mile'
]

# === Apply the new column names ===
# pandas raises ValueError here if the list length differs from the column count.
df1.columns = new_column_names

# === Save the updated file if needed ===
df1.to_csv(renamed_file, index=False)
# Report the path that was actually written; the old message named a file
# ('file1_renamed.csv') that this cell never creates.
print(f"✅ Column names updated and saved as '{renamed_file}'")


✅ Column names updated and saved as 'file1_renamed.csv'
