# NYC Delivery Truck Congestion – Step 3: Feature Engineering
*Author: Karan Chauhan*  

This notebook extracts temporal features from the filtered 311 complaint dataset to prepare it for machine learning modeling.

**Features created:**
- Hour of day (0-23)
- Day of week (0=Monday, 6=Sunday), plus the matching day name
- Weekend flag (binary: 1 = Saturday or Sunday)
- Rush hour flag (binary: 1 = hours 7–9 or 16–18)
- Month (1-12)

---

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn grid style and default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Filtered Dataset

In [None]:
# Load the filtered truck-related complaints CSV.
df = pd.read_csv('../data/311_truck_broad_filtered.csv')

print(f"Loaded {len(df):,} truck-related complaints")

# created_date is still text at this point, and min()/max() on strings compare
# lexicographically. That is only chronological for ISO-style timestamps, so the
# range is reported from a parsed temporary Series instead. errors='coerce' keeps
# this preview from failing on an unparseable value; df itself is not modified here.
created_preview = pd.to_datetime(df['created_date'], errors='coerce')
print(f"Date range: {created_preview.min()} to {created_preview.max()}")

df.head()

## Convert to Datetime Format

In [None]:
# Parse the creation timestamps into pandas datetimes and store them back on df.
created = pd.to_datetime(df['created_date'])
df['created_date'] = created

# Confirm the resulting dtype and show the first parsed value.
print(f"Converted to datetime: {created.dtype}")
print(f"Example: {created.iloc[0]}")

## Extract Temporal Features

### Hour of Day

In [None]:
# Hour component of each complaint's creation timestamp.
df['hour'] = df['created_date'].dt.hour

# Complaints per hour, ordered by hour rather than by frequency.
hour_distribution = df['hour'].value_counts().sort_index()
print("Hour distribution:")
print(hour_distribution)

In [None]:
# Complaints per hour of day.
# value_counts() omits hours with no complaints; on a categorical bar axis that
# would silently drop those hours and compress the timeline. Reindexing to the
# full 0-23 range keeps every hour on the axis, with a zero-height bar where empty.
hourly_counts = df['hour'].value_counts().reindex(range(24), fill_value=0)

plt.figure(figsize=(12, 4))
hourly_counts.plot(kind='bar', color='steelblue')
plt.title('Complaint Distribution by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Complaints')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

### Day of Week

In [None]:
# Integer weekday -> label lookup (0 is Monday, 6 is Sunday).
day_names = dict(enumerate([
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]))

# Numeric weekday from the timestamp, plus its readable name.
df['day_of_week'] = df['created_date'].dt.dayofweek
df['day_name'] = df['day_of_week'].map(day_names)

# Show counts in calendar order instead of value_counts' frequency order.
print("Day of week distribution:")
day_name_counts = df['day_name'].value_counts()
print(day_name_counts.reindex(list(day_names.values())))

In [None]:
# Complaints per weekday in calendar order.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
counts = df['day_name'].value_counts().reindex(day_order)

# First five bars (Mon-Fri) in one colour, last two (Sat-Sun) in another.
bar_colors = ['steelblue'] * 5 + ['coral'] * 2

fig, ax = plt.subplots(figsize=(10, 4))
counts.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_title('Complaint Distribution by Day of Week')
ax.set_xlabel('Day')
ax.set_ylabel('Number of Complaints')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

### Weekend Flag

In [None]:
# 1 when day_of_week is 5 or 6 (Saturday/Sunday), otherwise 0.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

weekend_counts = df['is_weekend'].value_counts()
print("Weekend vs Weekday counts:")
print(weekend_counts)

# The mean of a 0/1 flag is the share of rows that are flagged.
weekend_share = df['is_weekend'].mean()
print(f"\nWeekend percentage: {weekend_share*100:.1f}%")

### Rush Hour Flag

In [None]:
# Hours of the day treated as rush hour.
rush_hours = [7, 8, 9, 16, 17, 18]

# 1 when the complaint's hour is one of the rush hours, otherwise 0.
in_rush_hour = df['hour'].isin(rush_hours)
df['is_rush_hour'] = in_rush_hour.astype(int)

rush_counts = df['is_rush_hour'].value_counts()
print("Rush hour vs Non-rush hour counts:")
print(rush_counts)

# The mean of a 0/1 flag is the share of rows that are flagged.
rush_share = df['is_rush_hour'].mean()
print(f"\nRush hour percentage: {rush_share*100:.1f}%")

### Month

In [None]:
# Month number -> short label lookup, used when labelling the month chart.
month_names = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))

# Calendar month (1-12) of each complaint's creation timestamp.
df['month'] = df['created_date'].dt.month

# Complaints per month, ordered by month number.
print("Month distribution:")
month_counts = df['month'].value_counts().sort_index()
print(month_counts)

In [None]:
# Complaints per month, with month numbers replaced by short names on the axis.
fig, ax = plt.subplots(figsize=(12, 4))
month_counts.plot(kind='bar', color='teal', ax=ax)
ax.set_title('Complaint Distribution by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Complaints')

# Bars sit at positions 0..n-1; label each with the name of the month it holds.
tick_labels = [month_names[m] for m in month_counts.index]
ax.set_xticks(range(len(month_counts)))
ax.set_xticklabels(tick_labels, rotation=0)

fig.tight_layout()
plt.show()

## Feature Validation

In [None]:
# Numeric features engineered above; day_name is text and is not included.
feature_cols = ['hour', 'day_of_week', 'is_weekend', 'is_rush_hour', 'month']

# Count of nulls in each engineered column.
null_counts = df[feature_cols].isna().sum()
print("Missing values:")
print(null_counts)

# Observed extremes / distinct values, for eyeballing against the expected domains.
hour_col = df['hour']
dow_col = df['day_of_week']
month_col = df['month']

print("\nValue ranges:")
print(f"Hour: {hour_col.min()} to {hour_col.max()}")
print(f"Day of week: {dow_col.min()} to {dow_col.max()}")
print(f"Is weekend: {sorted(df['is_weekend'].unique())}")
print(f"Is rush hour: {sorted(df['is_rush_hour'].unique())}")
print(f"Month: {month_col.min()} to {month_col.max()}")

## Feature Summary Statistics

In [None]:
# Work on just the engineered feature columns.
feature_frame = df[feature_cols]

print("Descriptive statistics:")
print(feature_frame.describe())

# Pairwise correlations between the features; reused by the heatmap cell.
print("\nFeature correlations:")
correlation = feature_frame.corr()
print(correlation)

In [None]:
# Annotated heatmap of the feature correlation matrix, colour scale centred on 0.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## Save Enhanced Dataset

In [None]:
# Columns selected from df that this notebook did not create or derive
# (created_date is included here although it was converted to datetime above).
source_cols = [
    'created_date', 'complaint_type', 'descriptor', 'borough',
    'latitude', 'longitude', 'street_name',
]
# Columns engineered in this notebook.
engineered_cols = ['hour', 'day_of_week', 'day_name', 'is_weekend', 'is_rush_hour', 'month']

# Independent copy restricted to the columns being exported.
output_df = df[source_cols + engineered_cols].copy()

# Write the enhanced dataset without the DataFrame index.
output_path = '../data/complaints_with_features.csv'
output_df.to_csv(output_path, index=False)

n_rows, n_cols = output_df.shape
print(f"Saved {n_rows:,} rows with {n_cols} columns")
print(f"Output file: {output_path}")
print(f"\nNew features: hour, day_of_week, day_name, is_weekend, is_rush_hour, month")

output_df.head(10)