In [11]:
print(df.columns.tolist())


['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'ratecodeid', 'store_and_fwd_flag', 'pulocationid', 'dolocationid', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee', 'airport_fee']


In [17]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install fpdf2
from fpdf import FPDF
from fpdf.enums import XPos, YPos

sns.set(style="whitegrid")

# 1. Folder path to the  data
data_folder = r"D:\new downloads\Datasets and Dictionary-NYC\Datasets and Dictionary\trip_records"

# 2. Listing all parquet files 
monthly_files = [os.path.join(data_folder, f) for f in sorted(os.listdir(data_folder)) if f.endswith('.parquet')]
print("Monthly parquet files found:")
print(monthly_files)

# 3. Sampling function 
def sample_month_file_parquet(file_path, frac=0.01):  # 1% sample for quick testing
    df = pd.read_parquet(file_path)
    return df.sample(frac=frac, random_state=42)

# 4. Load and sample
dfs = [sample_month_file_parquet(f) for f in monthly_files]
df = pd.concat(dfs, ignore_index=True)
print(f"Combined sampled dataframe shape: {df.shape}")

#  Optional: saved sampled dataframe
df.to_parquet("./taxi_2023_sample.parquet")

# 5. Data Cleaning 
df.columns = [col.strip().lower() for col in df.columns]

df.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'tpep_dropoff_datetime': 'dropoff_datetime'
}, inplace=True)

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

# Drop rows with missing critical data including passenger_count to avoid errors
df = df.dropna(subset=['pickup_datetime', 'dropoff_datetime', 'fare_amount', 'passenger_count'])

# Now safe to convert passenger_count to int
df['passenger_count'] = df['passenger_count'].astype(int)

df = df[(df['fare_amount'] > 0) & (df['fare_amount'] < 500)]

print(f"Cleaned dataframe shape: {df.shape}")

# 6. Feature Engineering 
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.day_name()

#  7. General EDA 
plt.figure(figsize=(10,6))
sns.countplot(x='hour', data=df)
plt.title('Number of Trips by Hour of Day')
plt.savefig('trips_by_hour.png')
plt.close()

plt.figure(figsize=(10,6))
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.countplot(x='day_of_week', data=df, order=order)
plt.title('Number of Trips by Day of Week')
plt.savefig('trips_by_day.png')
plt.close()

plt.figure(figsize=(10,6))
df.groupby('hour')['fare_amount'].mean().plot()
plt.title('Average Fare by Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Average Fare Amount')
plt.savefig('avg_fare_by_hour.png')
plt.close()

# 8. Detailed EDA 
if 'pulocationid' in df.columns:
    top_zones = df['pulocationid'].value_counts().head(10)
    plt.figure(figsize=(12,6))
    sns.barplot(x=top_zones.index, y=top_zones.values)
    plt.title('Top 10 Pickup Location IDs')
    plt.ylabel('Number of Trips')
    plt.xticks(rotation=45)
    plt.savefig('top_pickup_zones.png')
    plt.close()
else:
    print("Column 'pulocationid' not found, skipping pickup zones plot.")

if 'tip_amount' in df.columns:
    plt.figure(figsize=(10,6))
    sns.histplot(df['tip_amount'], bins=30, kde=False)
    plt.title('Distribution of Tips')
    plt.savefig('tip_distribution.png')
    plt.close()
else:
    print("Column 'tip_amount' not found, skipping tip distribution plot.")

# 9. Report Summary Text 
report_summary = """
### NYC Taxi Operations 2023 Report Summary

- Peak taxi trips occur between 4 PM and 8 PM.
- Weekdays have more trips compared to weekends.
- Average fare peaks in late evening hours.
- Top pickup locations identified by Location IDs.
- Tips are generally low with occasional high outliers.

### Recommendations:
- Implement surge pricing during peak hours.
- Increase driver availability in busiest zones.
- Encourage tipping to boost driver income.
"""

# 10. Generating PDF Report 
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

pdf.set_font("helvetica", 'B', 16)
pdf.cell(0, 10, "EDA_NYC_Taxi_Analysis_<Khushpreet>", new_x=XPos.LMARGIN, new_y=YPos.NEXT, align='C')

pdf.set_font("helvetica", '', 12)
pdf.multi_cell(0, 10, report_summary)

images = ['trips_by_hour.png', 'trips_by_day.png', 'avg_fare_by_hour.png']
if 'pulocationid' in df.columns:
    images.append('top_pickup_zones.png')
if 'tip_amount' in df.columns:
    images.append('tip_distribution.png')

for img_file in images:
    pdf.add_page()
    pdf.image(img_file, x=15, w=180)

pdf.output("EDA_NYC_Taxi_Analysis_Khushpreet.pdf")
print("PDF report saved as EDA_NYC_Taxi_Analysis_Khushpreet.pdf")

Monthly parquet files found:
['D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-1.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-10.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-11.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-12.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-2.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-3.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-4.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-5.parquet', 'D:\\new downloads\\Datasets and Dictionary-NYC\\Datasets and Dictionary\\trip_records\\2023-6.parquet', 'D:\\new downloads\\Da

In [18]:
import os
print(os.getcwd())

C:\Users\Ahc
