In [6]:
import os

import pandas as pd

# Location of the raw taxi-trip CSV.
# Set the TAXI_DATA_CSV environment variable to point at your own copy of the
# file; when it is not set, the original hard-coded location is used so that
# existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "TAXI_DATA_CSV",
    "C:\\Users\\workk\\fare-sense-smart-route-fare-estimation\\taxi_data.csv",
)

# Raw, unmodified trip records; later cells derive cleaned frames from this.
data = pd.read_csv(DATA_PATH)


In [7]:
from sklearn.impute import SimpleImputer

# Column groups used by the imputation below and by later preprocessing.
numeric_features = [
    'Trip_Distance_km',
    'Passenger_Count',
    'Base_Fare',
    'Per_Km_Rate',
    'Per_Minute_Rate',
    'Trip_Duration_Minutes',
]
categorical_features = [
    'Time_of_Day',
    'Day_of_Week',
    'Traffic_Conditions',
    'Weather',
]

# Rows without a target value cannot be used for supervised training.
data_clean = data.dropna(subset=['Trip_Price'])

# Target vector, and the feature frame with the target column removed.
y = data_clean['Trip_Price']
X = data_clean.drop(columns='Trip_Price')

# Replace missing numeric values with the per-column mean.
# NOTE(review): the imputer is fitted on every remaining row, i.e. before the
# train/test split performed later in the notebook, so the means also reflect
# rows that end up in the test set.
num_imputer = SimpleImputer(strategy='mean')
X[numeric_features] = num_imputer.fit_transform(X[numeric_features])

# Missing categorical values become their own explicit 'Unknown' level.
X[categorical_features] = X[categorical_features].fillna(value='Unknown')


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Standardise the numeric columns and one-hot encode the categorical ones.
# Any column of X that is in neither list is dropped, which is the
# ColumnTransformer default and is spelled out here for the reader.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_features),
        # sparse_output=False makes the encoder return a dense array
        ('cat', OneHotEncoder(sparse_output=False), categorical_features),
    ],
    remainder='drop',
)

# Fit on the full feature frame and produce the model-ready matrix.
X_preprocessed = preprocessor.fit_transform(X)




In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): X_preprocessed comes from a preprocessor that was fitted on
# all rows, so the scaling statistics were computed with the held-out rows
# included.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    random_state=42,
    test_size=0.2,
)


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Fail with a readable message when this cell is run on a kernel where the
# train/test split has not been created yet.
try:
    X_train, y_train
except NameError:
    raise RuntimeError("Please run the cell that defines X_train and y_train before running this cell.")

# Hyper-parameter candidates: 2 * 3 * 2 * 2 * 2 = 48 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search over the grid, ranking candidates by cross-validated R^2.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='r2',
    n_jobs=-1,  # run fits in parallel on all available cores
    cv=3,       # smaller CV for speed
)
grid_search.fit(X_train, y_train)

# Keep the best-scoring forest for evaluation and persistence.
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Score the tuned forest on the held-out split.
y_pred = best_rf.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. in the same units as the target

print(
    f"Tuned Model RMSE: {rmse}",
    f"Tuned Model R² Score: {r2}",
    sep="\n",
)


Tuned Model RMSE: 24.042287437886753
Tuned Model R² Score: 0.7527009436193269


In [12]:
import joblib

# The forest was trained on imputed, scaled and one-hot-encoded features, so
# the pickled model on its own cannot score raw trip records. Persist the
# fitted preprocessing objects (and the column lists they expect) alongside it.
# At inference time: apply num_imputer to the numeric columns, fill missing
# categorical values with 'Unknown', then call preprocessor.transform before
# best_rf.predict.
joblib.dump(
    {
        'num_imputer': num_imputer,
        'preprocessor': preprocessor,
        'numeric_features': numeric_features,
        'categorical_features': categorical_features,
    },
    'tuned_random_forest_preprocessing.pkl',
)

# Model artifact (file name unchanged). Kept as the last expression so the
# cell still displays the written model path.
joblib.dump(best_rf, 'tuned_random_forest_model.pkl')


['tuned_random_forest_model.pkl']