In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
import numpy as np

# --- Load the dataset ---
# 'uber.csv' is read as a comma-separated file from the current working directory.
# Only the read itself is wrapped in try/except so that unrelated failures
# (e.g. while printing) are not misreported as CSV loading errors.
try:
    df = pd.read_csv('uber.csv', sep=',')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper and,
    # inside a Jupyter kernel, requests kernel shutdown rather than cleanly
    # aborting the cell.
    raise SystemExit(
        "Error: 'uber.csv' not found. Please make sure the CSV file is in the correct directory."
    ) from err
except Exception as e:
    raise SystemExit(f"An error occurred while reading the CSV file: {e}") from e

# Quick verification of what was loaded: column names and the first rows.
print("Columns loaded:", df.columns.tolist())
print("\nFirst 5 rows of the DataFrame:")
print(df.head())
print("-" * 30)  # Separator for clarity

# --- Preprocessing ---
# Every time-based feature below is derived from 'pickup_datetime', so stop
# with a clear message if the column is missing. SystemExit is raised instead
# of calling exit(), which in a Jupyter kernel requests kernel shutdown rather
# than cleanly aborting the cell.
if 'pickup_datetime' not in df.columns:
    raise SystemExit(
        "Error: 'pickup_datetime' column not found in the CSV file after loading.\n"
        "Please check the column names printed above and the separator used."
    )

# Strip the literal ' UTC' suffix, then parse. errors='coerce' turns values
# that cannot be parsed into NaT (Not a Time) instead of raising.
df['pickup_datetime'] = pd.to_datetime(
    df['pickup_datetime'].astype(str).str.replace(' UTC', '', regex=False),
    errors='coerce',
)

# Drop rows where pickup_datetime is NaT after coercion
df.dropna(subset=['pickup_datetime'], inplace=True)

# Extract calendar/time features from pickup_datetime
df['year'] = df['pickup_datetime'].dt.year
df['month'] = df['pickup_datetime'].dt.month
df['day'] = df['pickup_datetime'].dt.day
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek  # Monday=0, Sunday=6

# Calculate distance between pickup and dropoff coordinates
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    All four arguments are coordinates in degrees. Each may be a scalar or an
    array-like (list, NumPy array, pandas Series); array inputs are broadcast
    together and evaluated element-wise, so whole coordinate columns can be
    passed in a single call instead of one call per row.

    Returns:
        A NumPy float when every argument is a scalar, otherwise a NumPy
        array. Any position where a coordinate is missing (NaN, None, NA)
        yields NaN.
    """
    R = 6371  # Earth radius in kilometers

    def _as_float(value):
        # Map every flavour of "missing" to NaN so it propagates through the
        # arithmetic below instead of needing an explicit early return.
        return np.where(pd.isna(value), np.nan, value).astype(float)

    lat1, lon1, lat2, lon2 = (
        np.radians(_as_float(v)) for v in (lat1, lon1, lat2, lon2)
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points);
    # clip it so both square roots below stay real. NaN passes through unchanged.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Apply distance calculation
# The four coordinate columns must all be present before the trip distance can
# be derived. SystemExit is raised instead of calling exit(), which in a
# Jupyter kernel requests kernel shutdown rather than cleanly aborting the cell.
coordinate_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
if not all(col in df.columns for col in coordinate_cols):
    raise SystemExit(
        f"Error: Missing one or more coordinate columns: {coordinate_cols} after loading with comma separator.\n"
        "Please check the actual column names and separator in your CSV file."
    )

# One haversine_distance call per row; rows with a missing coordinate get NaN.
# NOTE(review): row-wise apply makes one Python call per row and can be slow on
# large frames; consider a column-wise computation if this becomes a bottleneck.
df['distance'] = df.apply(
    lambda row: haversine_distance(
        row['pickup_latitude'],
        row['pickup_longitude'],
        row['dropoff_latitude'],
        row['dropoff_longitude'],
    ),
    axis=1,
)

# Drop every row that has a NaN in any column (including NaN distances).
df.dropna(inplace=True)

# Nothing can be modelled if cleaning removed every row.
if df.empty:
    raise SystemExit(
        "Error: DataFrame is empty after dropping rows with missing or invalid data.\n"
        "Please check your data file for completeness and correct formatting."
    )

# Define features (X) and target (y)
features = ['year', 'month', 'day', 'hour', 'day_of_week', 'passenger_count', 'distance']
target = 'fare_amount'

# Verify the model inputs exist before selecting them, naming any missing
# columns. SystemExit is raised instead of calling exit(), which in a Jupyter
# kernel requests kernel shutdown rather than cleanly aborting the cell.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    raise SystemExit(
        f"Error: Missing feature column(s) in DataFrame after preprocessing: {missing_features}\n"
        "Please check your data and preprocessing steps."
    )

if target not in df.columns:
    raise SystemExit(
        f"Error: Target column '{target}' not found in the DataFrame after preprocessing.\n"
        "Please check your data and preprocessing steps."
    )

X = df[features]
y = df[target]

# Split data into training and testing sets (80% / 20%).
# A split needs at least one row on each side, hence the minimum of 2 rows.
# SystemExit is raised instead of calling exit(), which in a Jupyter kernel
# requests kernel shutdown rather than cleanly aborting the cell.
if len(df) < 2:
    raise SystemExit("Error: Not enough data to perform a train-test split. Need at least 2 data points.")

# No manual adjustment for tiny datasets is needed: train_test_split rounds the
# test share up to a whole row, so with 2 or more rows a fraction of 0.2 always
# yields at least one test row and at least one training row. The previous
# hand-written adjustment produced the same split sizes and its fallback
# branches could never run once the length check above had passed.
test_size = 0.2

try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
except ValueError as e:
    raise SystemExit(
        f"Error during train-test split: {e}\n"
        "This might happen if the dataset is too small or test_size calculation resulted in an invalid split."
    ) from e


# Initialize and train different models
# Every regressor is fitted on the same training split and scored on the same
# held-out split; the dictionary key doubles as the label in the printed report.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Maps model label -> {metric name -> value}; filled in by the loop below.
results = {}

if not X_test.empty:
    for name, model in models.items():
        print(f"Training {name}...")
        # A model cannot be fitted without training rows; skip it in that case.
        if X_train.empty or y_train.empty:
            print(f"Error: Training set is empty for {name}. Cannot train model.")
            continue

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Error metrics on the held-out split.
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)  # RMSE is derived from MSE
        medae = median_absolute_error(y_test, y_pred)

        # R-squared is undefined when the actual fares have no variance.
        r2 = np.nan if np.std(y_test) == 0 else r2_score(y_test, y_pred)

        # MAPE divides by the actual fare, so rows with a zero fare are left out.
        y_test_no_zero = y_test[y_test != 0]
        y_pred_no_zero = y_pred[y_test != 0]
        if len(y_test_no_zero) > 0:
            mape = np.mean(np.abs((y_test_no_zero - y_pred_no_zero) / y_test_no_zero)) * 100
        else:
            mape = np.nan  # every actual value is zero

        results[name] = {
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "MedAE": medae,
            "R-squared": r2,
            "MAPE": mape,
        }

    # Print comparison of results
    print("\n" + "="*40)
    print("Model Performance Comparison:")
    print("="*40)
    if not results:
        print("No models were successfully trained and evaluated.")
    for name, metrics in results.items():
        print(f"\n--- {name} ---")
        print(f"  Mean Absolute Error (MAE): {metrics['MAE']:.2f}")
        print(f"  Mean Squared Error (MSE): {metrics['MSE']:.2f}")
        print(f"  Root Mean Squared Error (RMSE): {metrics['RMSE']:.2f}")
        print(f"  Median Absolute Error (MedAE): {metrics['MedAE']:.2f}")
        if np.isnan(metrics['R-squared']):
            print("  R-squared: Not applicable (target variable is constant in test set)")
        else:
            print(f"  R-squared: {metrics['R-squared']:.2f}")
        if np.isnan(metrics['MAPE']):
            print("  Mean Absolute Percentage Error (MAPE): Not applicable (contains zero actual values)")
        else:
            print(f"  Mean Absolute Percentage Error (MAPE): {metrics['MAPE']:.2f}%")
else:
    # Without test rows there is nothing to evaluate against.
    print("Error: Test set is empty after splitting. Cannot evaluate models.")

Columns loaded: ['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

First 5 rows of the DataFrame:
   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
from sklearn.preprocessing import StandardScaler # Import StandardScaler
import numpy as np
import warnings

# Ignore potential warnings from sklearn or pandas
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# --- File Loading ---
# 'uber.csv' is read as a comma-separated file from the current working directory.
# Only the read itself is wrapped in try/except so that unrelated failures
# (e.g. while printing) are not misreported as CSV loading errors.
try:
    # low_memory=False disables chunked parsing, so each column's dtype is
    # inferred from the whole file rather than piece by piece.
    df = pd.read_csv('uber.csv', sep=',', low_memory=False)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper and,
    # inside a Jupyter kernel, requests kernel shutdown rather than cleanly
    # aborting the cell.
    raise SystemExit(
        "Error: 'uber.csv' not found. Please make sure the CSV file is in the correct directory."
    ) from err
except Exception as e:
    raise SystemExit(f"An error occurred while reading the CSV file: {e}") from e

# Quick verification of what was loaded: column names and the first rows.
print("Columns loaded:", df.columns.tolist())
print("\nFirst 5 rows of the DataFrame:")
print(df.head())
print("-" * 30)  # Separator for clarity

# --- Preprocessing and Feature Engineering ---

# Drop the 'Unnamed: 0' column if present (presumably an index column written
# out when the CSV was saved); it is not used as a feature.
if 'Unnamed: 0' in df.columns:
    df.drop('Unnamed: 0', axis=1, inplace=True)
    print("Dropped 'Unnamed: 0' column.")

# Every column used below must exist; report exactly which ones are missing.
# SystemExit is raised instead of calling exit(), which in a Jupyter kernel
# requests kernel shutdown rather than cleanly aborting the cell.
required_cols = ['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
if not all(col in df.columns for col in required_cols):
    missing = [col for col in required_cols if col not in df.columns]
    raise SystemExit(
        f"Error: Missing required column(s) in the CSV file: {missing}\n"
        "Please check the column names and separator in your CSV file."
    )

# Strip the literal ' UTC' suffix, then parse. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
df['pickup_datetime'] = pd.to_datetime(
    df['pickup_datetime'].astype(str).str.replace(' UTC', '', regex=False),
    errors='coerce',
)

# Drop rows where pickup_datetime is NaT after coercion
df.dropna(subset=['pickup_datetime'], inplace=True)

# Extract time-based features
df['year'] = df['pickup_datetime'].dt.year
df['month'] = df['pickup_datetime'].dt.month
df['day'] = df['pickup_datetime'].dt.day
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek  # Monday=0, Sunday=6
df['day_of_year'] = df['pickup_datetime'].dt.dayofyear

# Cyclical encodings: map hour (period 24) and day_of_week (period 7) onto the
# unit circle so that the last and first values of each cycle end up adjacent.
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['dayofweek_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)


# Calculate distance between pickup and dropoff coordinates
# Guard: all four coordinate columns must be present before the distance is
# computed. SystemExit is raised instead of calling exit(), which in a Jupyter
# kernel requests kernel shutdown rather than cleanly aborting the cell.
coordinate_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
if not all(col in df.columns for col in coordinate_cols):
    raise SystemExit(
        f"Error: Missing one or more coordinate columns: {coordinate_cols} before distance calculation."
    )

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    All four arguments are coordinates in degrees. Each may be a scalar or an
    array-like (list, NumPy array, pandas Series); array inputs are broadcast
    together and evaluated element-wise, so whole coordinate columns can be
    passed in a single call instead of one call per row.

    Returns:
        A NumPy float when every argument is a scalar, otherwise a NumPy
        array. A position yields NaN when any coordinate is missing (NaN,
        None, NA) or lies outside the valid range (latitude within
        [-90, 90], longitude within [-180, 180]).
    """
    R = 6371  # Earth radius in kilometers

    def _as_float(value):
        # Map every flavour of "missing" to NaN so it propagates through the
        # arithmetic below instead of needing an explicit early return.
        return np.where(pd.isna(value), np.nan, value).astype(float)

    lat1, lon1, lat2, lon2 = (_as_float(v) for v in (lat1, lon1, lat2, lon2))

    # Rough plausibility check on the raw degrees. Comparisons against NaN are
    # False, so missing coordinates are rejected here as well.
    in_range = (
        (np.abs(lat1) <= 90) & (np.abs(lat2) <= 90)
        & (np.abs(lon1) <= 180) & (np.abs(lon2) <= 180)
    )

    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points);
    # clip it so both square roots below stay real. NaN passes through unchanged.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Blank out rejected positions; [()] unwraps a 0-d result to a plain NumPy
    # scalar and leaves real arrays untouched.
    return np.where(in_range, R * c, np.nan)[()]

# One haversine_distance call per row; the result is NaN wherever the function
# rejects the coordinates.
# NOTE(review): row-wise apply makes one Python call per row and can be slow on
# large frames; consider a column-wise computation if this becomes a bottleneck.
df['distance'] = df.apply(
    lambda row: haversine_distance(
        row['pickup_latitude'],
        row['pickup_longitude'],
        row['dropoff_latitude'],
        row['dropoff_longitude'],
    ),
    axis=1,
)

# --- Basic Outlier and Data Cleaning ---
# Keep only rows with a positive fare, at least one passenger and a positive
# distance. A NaN distance compares False against 0 and is removed as well.
# No upper bound is applied to passenger_count; add one here (for example
# `& (df['passenger_count'] <= 6)`) if domain knowledge calls for it.
valid_rows = (
    (df['fare_amount'] > 0)
    & (df['passenger_count'] > 0)
    & (df['distance'] > 0)
)

# Filter, then drop any row that still has a NaN in any column. Reassigning
# (rather than dropna(inplace=True)) avoids mutating a filtered frame in place.
df = df[valid_rows].dropna()

# Nothing can be modelled if cleaning removed every row. SystemExit is raised
# instead of calling exit(), which in a Jupyter kernel requests kernel shutdown
# rather than cleanly aborting the cell.
if df.empty:
    raise SystemExit(
        "Error: DataFrame is empty after dropping rows with missing, invalid, or outlier data.\n"
        "Consider reviewing the cleaning steps or the input data."
    )

print(f"\nDataFrame size after cleaning and feature engineering: {len(df)} rows")
print("-" * 30)


# Define features (X) and target (y)
# Hour and day-of-week enter the model only through their sine/cosine encodings.
features = ['year', 'month', 'day', 'day_of_year', 'passenger_count', 'distance',
            'hour_sin', 'hour_cos', 'dayofweek_sin', 'dayofweek_cos']
target = 'fare_amount'

# Verify the model inputs exist before selecting them, naming any missing
# columns. SystemExit is raised instead of calling exit(), which in a Jupyter
# kernel requests kernel shutdown rather than cleanly aborting the cell.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    raise SystemExit(
        f"Error: Missing feature column(s) in DataFrame after preprocessing: {missing_features}\n"
        "Please check your data and preprocessing steps."
    )

if target not in df.columns:
    raise SystemExit(
        f"Error: Target column '{target}' not found in the DataFrame after preprocessing.\n"
        "Please check your data and preprocessing steps."
    )

X = df[features]
y = df[target]

# Split data into training and testing sets (80% / 20%).
# A split needs at least one row on each side, hence the minimum of 2 rows.
# SystemExit is raised instead of calling exit(), which in a Jupyter kernel
# requests kernel shutdown rather than cleanly aborting the cell.
if len(df) < 2:
    raise SystemExit("Error: Not enough data to perform a train-test split. Need at least 2 data points.")

# No manual adjustment for tiny datasets is needed: train_test_split rounds the
# test share up to a whole row, so with 2 or more rows a fraction of 0.2 always
# yields at least one test row and at least one training row. The previous
# hand-written adjustment produced the same split sizes and its fallback
# branches could never run once the length check above had passed.
test_size = 0.2

try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
except ValueError as e:
    raise SystemExit(
        f"Error during train-test split: {e}\n"
        "This might happen if the dataset is too small or test_size calculation resulted in an invalid split."
    ) from e
else:
    print(f"Data split into {len(X_train)} training and {len(X_test)} testing samples.")

# --- Feature Scaling ---
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits, so nothing about the test split leaks
# into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled arrays as DataFrames that carry the feature names as
# columns and the row index of the split they came from.
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=features, index=split.index)
    for scaled, split in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)


# Initialize and train different models
# Every regressor is fitted on the scaled training split and scored on the
# scaled test split; the dictionary key doubles as the row label in the report.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),  # Tree models are less sensitive to scaling
    "Random Forest": RandomForestRegressor(random_state=42),  # Tree models are less sensitive to scaling
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)  # Tree models are less sensitive to scaling
}

# Maps model label -> {metric name -> value}; filled in by the loop below.
results = {}

# Check if test set is empty before making predictions
if X_test_scaled_df.empty:
    print("Error: Test set is empty after splitting or scaling. Cannot evaluate models.")
else:
    # MAPE divides by the actual fare, so rows with a zero fare are left out.
    # A positional boolean array selects the same rows from both the y_test
    # Series and the NumPy prediction array, independent of index labels.
    nonzero_mask = (y_test != 0).to_numpy()
    y_test_no_zero = y_test[nonzero_mask]

    for name, model in models.items():
        print(f"Training {name}...")
        # A model cannot be fitted without training rows; skip it in that case.
        if X_train_scaled_df.empty or y_train.empty:
            print(f"Error: Training set is empty for {name}. Cannot train model.")
            continue

        # Train on scaled data and predict on the scaled test data.
        model.fit(X_train_scaled_df, y_train)
        y_pred = model.predict(X_test_scaled_df)

        # Error metrics against the (unscaled) actual fares.
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        medae = median_absolute_error(y_test, y_pred)

        # R-squared is undefined when the actual fares have no variance.
        if np.std(y_test) == 0:
            r2 = np.nan
        else:
            r2 = r2_score(y_test, y_pred)

        y_pred_no_zero = y_pred[nonzero_mask]
        if len(y_test_no_zero) > 0:
            mape = np.mean(np.abs((y_test_no_zero - y_pred_no_zero) / y_test_no_zero)) * 100
        else:
            mape = np.nan  # Cannot calculate MAPE if all actual values are zero

        results[name] = {
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "MedAE": medae,
            "R-squared": r2,
            "MAPE": mape
            }

    # --- Print comparison of results with context ---
    print("\n" + "="*60)
    print("Model Performance Comparison on Test Set")
    print("="*60)

    print("\nMetrics Explanation:")
    print("--------------------")
    print("MAE (Mean Absolute Error): Average magnitude of prediction errors ($). Lower is better.")
    print("MSE (Mean Squared Error): Average of squared prediction errors. Penalizes large errors more. Lower is better.")
    print("RMSE (Root Mean Squared Error): Square root of MSE ($). Typical magnitude of prediction errors. Lower is better.")
    print("MedAE (Median Absolute Error): Median magnitude of prediction errors ($). Robust to outliers. Lower is better.")
    # The score is not bounded below by 0: a model that does worse than always
    # predicting the mean gets a negative value on held-out data.
    print("R-squared (R²): Proportion of variance in fare explained by the model (1 is perfect; can be negative on a test set). Higher is better.")
    print("MAPE (Mean Absolute Percentage Error): Average percentage error relative to the actual fare (%). Lower is better.")
    print("-" * 60)

    if results:  # Only print results if models were evaluated
        # One row per model, one column per metric.
        results_df = pd.DataFrame(results).T
        # Render MAPE as text with a '%' sign; round(2) below then only affects
        # the remaining numeric columns.
        results_df['MAPE'] = results_df['MAPE'].apply(lambda x: f"{x:.2f}%" if not np.isnan(x) else "N/A")
        print(results_df.round(2))
    else:
        print("No models were successfully trained and evaluated.")

    print("="*60)  # Bottom Separator

Columns loaded: ['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

First 5 rows of the DataFrame:
   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude 