In [1]:
import pandas as pd
import numpy as np
import requests
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


## Pull data from S3

In [2]:
url = "https://caiimowdcnrwheoelnfg.supabase.co/storage/v1/object/public/data/data.csv"

# Fetch the data using requests
response = requests.get(url)

# Ensure the request was successful
response.raise_for_status()

# Read the CSV data into a pandas DataFrame
df = pd.read_csv(io.StringIO(response.text))

### Preprocessing functions

In [3]:
def preprocess_data(df):
    """
    Handle missing values, outliers, and perform feature engineering.
    """
    # Convert pickup_datetime to datetime object
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    
    # Handle missing values
    df = df.dropna()
    
    # Remove extreme outliers in fare_amount
    df = df[(df['fare_amount'] > 0) & (df['fare_amount'] < 1000)]
    
    # Extract features from pickup_datetime
    df['hour'] = df['pickup_datetime'].dt.hour
    df['day'] = df['pickup_datetime'].dt.day
    df['month'] = df['pickup_datetime'].dt.month
    df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
    
    # Segment data based on time of day
    df['time_segment'] = pd.cut(df['hour'], 
                                bins=[0, 6, 12, 18, 24], 
                                labels=['night', 'morning', 'afternoon', 'evening'])
    
    # Calculate trip distances
    df['distance'] = haversine_distance(df['pickup_latitude'], df['pickup_longitude'],
                                        df['dropoff_latitude'], df['dropoff_longitude'])
    
    # Segment data based on passenger count
    df['vehicle_type'] = pd.cut(df['passenger_count'], 
                                bins=[0, 3, 5, np.inf], 
                                labels=['min', 'mid', 'max'])
    
    df = df.dropna()

    return df

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points on the earth.
    """
    R = 6371  # Earth's radius in kilometers

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c

    return distance

df = preprocess_data(df)

In [7]:
df

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,day_of_week,time_segment,distance,vehicle_type
0,24238194,2015-05-07 19:52:06,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,19,7,5,3,evening,1.683323,min
1,27835199,2009-07-17 20:04:56,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,20,17,7,4,evening,2.457590,min
2,44984355,2009-08-24 21:45:00,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,21,24,8,0,evening,5.036377,min
3,25894730,2009-06-26 8:22:21,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,8,26,6,4,morning,1.661683,min
4,17610152,2014-08-28 17:47:00,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,17,28,8,3,afternoon,4.475450,mid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,3189201,2014-01-31 14:42:00,12.0,2014-01-31 14:42:00+00:00,-73.983070,40.760770,-73.972972,40.754177,1,14,31,1,4,afternoon,1.122878,min
199995,42598914,2012-10-28 10:49:00,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,10,28,10,6,morning,0.112210,min
199996,16382965,2014-03-14 1:09:00,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,1,14,3,4,night,1.875050,min
199998,20259894,2015-05-20 14:56:25,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695416,1,14,20,5,2,afternoon,3.539715,min


# Train the model

In [8]:
def train_and_evaluate_model(df):
    # Select features and target
    features = ['hour', 'day', 'month', 'day_of_week', 'distance', 'passenger_count']
    X = df[features]
    y = df['fare_amount']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"R-squared Score: {r2:.2f}")

    return model

model = train_and_evaluate_model(df)

Root Mean Squared Error: 5.12
R-squared Score: 0.72


# Test with sample input

In [10]:
sample_input = pd.DataFrame({
    'hour': [19],
    'day': [7],
    'month': [5],
    'day_of_week': [3],
    'distance': [1.683323],
    'passenger_count': [1]
})

prediction = model.predict(sample_input)
print(f"Predicted fare for sample input: ${prediction[0]:.2f}")

Predicted fare for sample input: $7.37
