In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the complete ride-hailing CSV; the cells below all operate on `df`.
DATASET_CSV = "../datasets/synthetic_ride_hailing_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [3]:
# Attach a synthetic 'Base Fare' column: values drawn uniformly between 5 and
# 20, rounded to two decimals. The global NumPy seed is pinned so the draw is
# repeatable from a fresh kernel.
np.random.seed(42)
df['Base Fare'] = np.round(np.random.uniform(5, 20, size=len(df)), 2)

# Integer-encode the categorical features. Each column gets its own encoder,
# and the fitted encoders are kept in `label_encoders` so the integer codes
# can later be mapped back to the original labels (encoder.inverse_transform)
# or applied to new data (encoder.transform). Previously the encoders were
# discarded right after fitting, which made the mapping unrecoverable.
label_encoders = {}
for column in ('Time of Day', 'Month of Year', 'Pickup Location'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Shuffle the rows and renumber the index from 0.
# NOTE: this cell rebinds `df`; re-running it without re-running the load
# cell operates on the already-shuffled, already-encoded frame.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Partition the data into train / validation / test (80% / 10% / 10%).
# First cut: hold back 20% of the rows; the other 80% is the training set.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Second cut: halve the held-back rows into validation and test.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Write each partition to CSV (without the index column), in train/val/test order.
split_outputs = {
    "../datasets/train_split.csv": train_df,
    "../datasets/val_split.csv": val_df,
    "../datasets/test_split.csv": test_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [5]:
# Column means over the whole frame (all splits combined).
# NOTE(review): if these values feed normalisation or imputation downstream,
# confirm that computing them on `df` rather than `train_df` is intended.
mean_base_fare = df["Base Fare"].mean()
mean_request_to_pickup = df["Request to Pickup"].mean()

# Report both means, one per line.
for label, value in (
    ("Mean Base Fare:", mean_base_fare),
    ("Mean Request to Pickup:", mean_request_to_pickup),
):
    print(label, value)


Mean Base Fare: 12.482933000000001
Mean Request to Pickup: 295.55106
