# CAP FINAL PROJECT A

## 1. Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from math import radians, sin, cos, sqrt, atan2

### Loading dataset

In [2]:
df_test = pd.read_csv('new-york-city-taxi-fare-prediction/train.csv' )
df_train = pd.read_csv('new-york-city-taxi-fare-prediction/test.csv' )
df = pd.concat([df_train, df_test], axis=0)
df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1,
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1,
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1,
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1,
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1,


### Clean Data - Remove Outliers

In [None]:
# Filter unrealistic fare amounts
df = df[df['fare_amount'] >= 2]

# Define NYC latitude and longitude boundaries
nyc_min_longitude, nyc_max_longitude = -74.05, -73.75
nyc_min_latitude, nyc_max_latitude = 40.63, 40.85

# Filter out coordinates outside NYC boundaries
df = df[
    (df['pickup_longitude'].between(nyc_min_longitude, nyc_max_longitude)) &
    (df['pickup_latitude'].between(nyc_min_latitude, nyc_max_latitude)) &
    (df['dropoff_longitude'].between(nyc_min_longitude, nyc_max_longitude)) &
    (df['dropoff_latitude'].between(nyc_min_latitude, nyc_max_latitude))
]


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,4.5
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,16.9
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,5.7
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,7.7
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,5.3


### Extract Time-based Features

In [3]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['weekday'] = df['pickup_datetime'].dt.weekday
df['month'] = df['pickup_datetime'].dt.month 


### Calculate Distance using Haversine Formula

In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

df['distance_km'] = df.apply(
    lambda row: haversine_distance(
        row['pickup_latitude'], row['pickup_longitude'],
        row['dropoff_latitude'], row['dropoff_longitude']
    ), axis=1
)

MemoryError: Unable to allocate 4.96 GiB for an array with shape (12, 55433770) and data type object

### Perform Geographical Clustering

In [None]:
# Pickup clusters
coords = df[['pickup_latitude', 'pickup_longitude']].values
kmeans = KMeans(n_clusters=10, random_state=42)
df['pickup_cluster'] = kmeans.fit_predict(coords)

# Drop-off clusters
coords = df[['dropoff_latitude', 'dropoff_longitude']].values
df['dropoff_cluster'] = kmeans.fit_predict(coords)

# 8. Add Time-based Features (e.g., Peak Hours)
def is_peak_hour(hour):
    return 1 if 7 <= hour <= 9 or 16 <= hour <= 19 else 0

df['is_peak_hour'] = df['hour'].apply(is_peak_hour)

### Normalize Numerical Features

In [None]:
scaler = StandardScaler()
df[['distance_km', 'pickup_latitude', 'pickup_longitude', 
    'dropoff_latitude', 'dropoff_longitude']] = scaler.fit_transform(
        df[['distance_km', 'pickup_latitude', 'pickup_longitude', 
            'dropoff_latitude', 'dropoff_longitude']]
)
