In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the whole training CSV into memory (the file is over 5 GB);
# a smaller working sample is drawn from this frame in a later cell.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
# Draw a reproducible random subset of one million rows so the later steps
# run faster and need less memory than the full frame would.
df_sample = df.sample(n=1_000_000, random_state=42)
# Renumber the sampled rows 0..n-1 and discard the original row labels.
df_sample = df_sample.reset_index(drop=True)

In [5]:
# Drop every row that contains at least one missing value
# (for example a missing dropoff location), modifying df_sample in place.
df_sample.dropna(inplace=True)

In [6]:
# Convert 'pickup_datetime' from text to an actual datetime column so the
# .dt accessor can extract hour, day, month, etc.
# errors='coerce' turns unparseable strings into NaT instead of raising.
df_sample['pickup_datetime'] = pd.to_datetime(df_sample['pickup_datetime'], errors='coerce')

# The missing-value drop ran before this conversion, so any NaT produced here
# would otherwise flow into the time features as NaN. Remove those rows now.
df_sample.dropna(subset=['pickup_datetime'], inplace=True)

In [7]:
# Derive calendar features from the pickup timestamp. Each one is a candidate
# signal for the fare: year (fare structure may change over time), month
# (seasonality), day of month, and hour of day (rush hours).
for part in ('year', 'month', 'day', 'hour'):
    df_sample[part] = getattr(df_sample['pickup_datetime'].dt, part)

# Day of the week as an integer: Monday = 0, Sunday = 6.
df_sample['day_of_week'] = df_sample['pickup_datetime'].dt.dayofweek

In [8]:
# A function is defined to calculate the Haversine distance between two points
# This uses math to compute the shortest path "as the crow flies" between pickup and dropoff
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometers.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Haversine distance in kilometers, computed element-wise with an
        Earth radius of 6371 km.
    """
    R = 6371  # Radius of the Earth in kilometers

    # Convert latitudes and coordinate differences from degrees to radians
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    # Haversine term: squared half-chord length between the two points
    a = np.sin(delta_phi / 2) ** 2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make sqrt(1 - a) NaN; cap it at 1.
    a = np.minimum(a, 1.0)

    # Central angle between the points, then arc length on the sphere
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

In [9]:
# Compute the trip distance for all rows at once: the function receives whole
# columns and returns one value per row, stored in the new 'distance_km' column.
df_sample['distance_km'] = haversine_distance(
    lat1=df_sample['pickup_latitude'],
    lon1=df_sample['pickup_longitude'],
    lat2=df_sample['dropoff_latitude'],
    lon2=df_sample['dropoff_longitude'],
)

In [10]:
# Keep only rows that pass every sanity check below.

# Fare strictly between $0 and $500
fare_ok = df_sample['fare_amount'].gt(0) & df_sample['fare_amount'].lt(500)

# Distance strictly between 0.01 km and 100 km
distance_ok = df_sample['distance_km'].gt(0.01) & df_sample['distance_km'].lt(100)

# Passenger count from 1 to 6, both ends included
passengers_ok = df_sample['passenger_count'].between(1, 6)

df_sample = df_sample[fare_ok & distance_ok & passengers_ok]

## First Model: Linear Regression

In [11]:
# Import required ML libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [12]:
# Name the column to predict and the columns used as model inputs.
target = 'fare_amount'
features = ['distance_km', 'passenger_count', 'hour', 'day_of_week']

# y holds the target values, X the input columns (in the order listed above).
y = df_sample[target]
X = df_sample[features]

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# Fit an ordinary least-squares baseline on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [15]:
# Predict fares for the held-out rows.
y_pred = lr_model.predict(X_test)

# RMSE is the square root of the mean squared error, in the same unit as the fare.
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
print(f"Linear Regression RMSE: {rmse_lr:.2f}")

Linear Regression RMSE: 4.87


## Second Model: Random Forest Regression

In [16]:
# Import the Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Configure a random forest:
#   n_estimators - number of decision trees in the forest
#   n_jobs=-1    - use all CPU cores
#   random_state - fixed seed for repeatable results
rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

In [18]:
# Fit the forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [19]:
# Predict fares for the held-out rows with the fitted forest.
y_pred_rf = rf_model.predict(X=X_test)

In [20]:
# RMSE of the random forest on the test split.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(f"Random Forest RMSE: {rmse_rf:.2f}")

Random Forest RMSE: 4.57


In [21]:
# Report (rows, columns) of the cleaned sample.
print(f"Sample shape: {df_sample.shape}")

Sample shape: (963281, 14)


## Third Model: Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

In [23]:
# Configure gradient boosting:
#   n_estimators  - number of boosting rounds
#   learning_rate - shrinkage applied to each tree's contribution
#   max_depth     - maximum depth of each tree
#   random_state  - fixed seed for repeatable results
gbr_model = GradientBoostingRegressor(
    random_state=42,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
)

# Fit on the training split.
gbr_model.fit(X=X_train, y=y_train)

In [24]:
# Predict fares for the held-out rows with the boosted model.
y_pred_gbr = gbr_model.predict(X_test)

# RMSE of gradient boosting on the test split.
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
print(f"Gradient Boosting RMSE: {rmse_gbr:.2f}")

Gradient Boosting RMSE: 4.25


## Log Models in MLflow

In [25]:
# Import MLflow and the sklearn integration tools
import mlflow
import mlflow.sklearn

In [28]:
# Record the linear-regression run in MLflow: test RMSE, model type and the fitted model.
with mlflow.start_run(run_name="LinearRegression_v1"):
    # Test-set RMSE computed earlier
    mlflow.log_metric("rmse", rmse_lr)

    mlflow.log_param("model_type", "LinearRegression")

    # Store the fitted model; the first test row is passed as the input example
    mlflow.sklearn.log_model(lr_model, "model", input_example=X_test.iloc[[0]])



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [29]:
# Record the random-forest run in MLflow: hyperparameters, test RMSE and the fitted model.
with mlflow.start_run(run_name="RandomForest_v1"):
    # Read the value from the estimator itself so the logged parameter
    # cannot drift from what the model was actually built with.
    mlflow.log_param("n_estimators", rf_model.n_estimators)
    mlflow.log_param("model_type", "RandomForest")

    # Test-set RMSE computed earlier
    mlflow.log_metric("rmse", rmse_rf)

    # Store the fitted model; the first test row is passed as the input example
    mlflow.sklearn.log_model(rf_model, "model", input_example=X_test.iloc[[0]])



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [27]:
# One-row frame passed to MLflow as an example of the model's expected input
input_example = X_test.iloc[[0]]

# Record the gradient-boosting run in MLflow
with mlflow.start_run(run_name="GradientBoosting_v1"):

    # Log the model type, as the other runs in this notebook do
    mlflow.log_param("model_type", "GradientBoosting")

    # Log the hyperparameters, read from the estimator so the logged values
    # always match the model that was trained
    mlflow.log_param("n_estimators", gbr_model.n_estimators)
    mlflow.log_param("learning_rate", gbr_model.learning_rate)
    mlflow.log_param("max_depth", gbr_model.max_depth)

    # Log the test-set RMSE computed earlier
    mlflow.log_metric("rmse", rmse_gbr)

    # Save the fitted model together with the input example
    mlflow.sklearn.log_model(gbr_model, "model", input_example=input_example)



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

## Restart with Stratified Sampling Strategy (Smarter 1M Selection)

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Read the original training CSV in full; low_memory=False makes pandas
# process the file in one pass instead of in internal chunks.
df_v2 = pd.read_csv(filepath_or_buffer="train.csv", low_memory=False)

In [9]:
df_v2.shape[0]

55423480

In [8]:
# Remove every row that has a missing value in any column.
df_v2 = df_v2.dropna(axis=0, how='any')

In [10]:
# Keep only rides with a strictly positive fare.
df_v2 = df_v2.loc[df_v2['fare_amount'].gt(0)]

In [12]:
# Keep only rows whose passenger count is greater than 0 and at most 6;
# everything outside that range is removed.
df_v2 = df_v2[df_v2['passenger_count'].gt(0) & df_v2['passenger_count'].le(6)]

In [13]:
df_v2.shape[0]

55224509

In [16]:
# Discard rides with a fare above 1000.
df_v2 = df_v2.loc[df_v2['fare_amount'].le(1000)]

In [29]:
# Drop rows in which any of the four coordinates is exactly 0
# (a zero latitude or longitude cannot be a location in NYC).
df_v2 = df_v2[
    df_v2['pickup_latitude'].ne(0)
    & df_v2['pickup_longitude'].ne(0)
    & df_v2['dropoff_latitude'].ne(0)
    & df_v2['dropoff_longitude'].ne(0)
]

In [30]:
# Keep only rows where both longitudes are negative (western hemisphere);
# zero or positive longitudes cannot be in NYC.
df_v2 = df_v2[df_v2['pickup_longitude'].lt(0) & df_v2['dropoff_longitude'].lt(0)]

In [26]:
df_v2.shape[0]

54094981

In [31]:
# Keep only rows where both latitudes are positive (northern hemisphere);
# zero or negative latitudes cannot be in NYC.
df_v2 = df_v2[df_v2['pickup_latitude'].gt(0) & df_v2['dropoff_latitude'].gt(0)]

In [32]:
df_v2.shape[0]

54093898

In [33]:
# Bounding box used for the NYC area in this notebook:
#   Latitude:   39 to 41 (North)
#   Longitude: -75 to -72 (West)

# A row is outside the box when it is NOT the case that all four coordinates
# fall inside it (both ends of each range count as inside).
outside_bounds = df_v2[
    ~(
        df_v2['pickup_latitude'].between(39, 41)
        & df_v2['dropoff_latitude'].between(39, 41)
        & df_v2['pickup_longitude'].between(-75, -72)
        & df_v2['dropoff_longitude'].between(-75, -72)
    )
]

# Show the offending rows for manual inspection
outside_bounds

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1483,2009-08-07 21:49:13.0000003,165.0,2009-08-07 21:49:13 UTC,-73.633363,41.032490,-73.633671,41.032598,1
1684,2009-05-02 19:01:01.0000002,2.5,2009-05-02 19:01:01 UTC,-73.137393,41.366138,-73.996510,40.745320,2
3075,2009-01-06 10:53:36.0000004,6.9,2009-01-06 10:53:36 UTC,-73.137393,41.366138,-73.974598,40.751311,1
4487,2009-08-26 07:43:16.0000001,4.9,2009-08-26 07:43:16 UTC,-73.137393,41.366138,-73.957685,40.765644,1
5648,2010-04-28 19:06:13.0000003,7.7,2010-04-28 19:06:13 UTC,-73.137393,41.366138,-73.137393,41.366138,1
...,...,...,...,...,...,...,...,...
55419982,2012-06-06 23:41:00.00000020,12.1,2012-06-06 23:41:00 UTC,-73.958153,40.760655,-7.583332,40.751807,6
55420949,2012-05-01 14:13:00.000000172,11.7,2012-05-01 14:13:00 UTC,-73.981765,40.758052,-0.016667,40.771392,1
55421389,2010-10-15 17:50:00.00000051,2.5,2010-10-15 17:50:00 UTC,-73.937350,40.758262,-1.220690,40.758257,1
55422205,2010-06-15 20:23:51.0000002,4.5,2010-06-15 20:23:51 UTC,-73.137393,41.366138,-73.969837,40.785663,1


In [34]:
# (min, max) bounds of the bounding box used for the NYC area
valid_lat_range = (39, 41)        # latitude, degrees north
valid_lon_range = (-75, -72)      # longitude, degrees (negative = west)

# Keep a row only if all four coordinates lie inside the box (ends included)
df_v2 = df_v2[
    df_v2['pickup_latitude'].between(left=valid_lat_range[0], right=valid_lat_range[1])
    & df_v2['dropoff_latitude'].between(left=valid_lat_range[0], right=valid_lat_range[1])
    & df_v2['pickup_longitude'].between(left=valid_lon_range[0], right=valid_lon_range[1])
    & df_v2['dropoff_longitude'].between(left=valid_lon_range[0], right=valid_lon_range[1])
]

In [35]:
df_v2.shape[0]

54042460

In [36]:
# Create fare bins and draw a stratified sample.
# Fares are grouped into six levels (cheap to expensive). The last bin is
# open-ended (upper edge = infinity), so the bin edges stay strictly
# increasing even if the largest fare in the data is 100 or less.
df_v2['fare_bin'] = pd.cut(
    df_v2['fare_amount'],
    bins=[0, 10, 20, 30, 50, 100, float('inf')],
    labels=[1, 2, 3, 4, 5, 6]
)

# Pick 1 million rows, stratified by fare bin: the sample keeps the same share
# of each fare level as the full data (it does not equalise the bins), so
# expensive rides stay represented in proportion.
split = StratifiedShuffleSplit(n_splits=1, test_size=1_000_000, random_state=42)

# n_splits=1 yields exactly one (rest, sample) pair of positional indices;
# only the "test" side, sized 1,000,000, is used as the sample.
_, sample_index = next(split.split(df_v2, df_v2['fare_bin']))
df_sample = df_v2.iloc[sample_index].drop(columns=['fare_bin'])

In [38]:
df_sample.shape[0]

1000000

In [39]:
# Extract time features (hour, day, month, ...)

# Convert pickup time from text to datetime format
df_sample['pickup_datetime'] = pd.to_datetime(df_sample['pickup_datetime'])

# Break it into parts (hour, day, etc.)
df_sample['hour'] = df_sample['pickup_datetime'].dt.hour
df_sample['day'] = df_sample['pickup_datetime'].dt.day
df_sample['month'] = df_sample['pickup_datetime'].dt.month
df_sample['year'] = df_sample['pickup_datetime'].dt.year
df_sample['day_of_week'] = df_sample['pickup_datetime'].dt.dayofweek  # Monday = 0

# Flag peak hours (7-9 and 17-19) with 1, all other hours with 0.
# isin() checks the whole column at once instead of calling a Python lambda
# per row; the cast keeps the 0/1 integer encoding.
df_sample['is_peak_hour'] = df_sample['hour'].isin([7, 8, 9, 17, 18, 19]).astype('int64')

In [42]:
# The raw timestamp is no longer needed once the time features exist.
df_sample = df_sample.drop(columns=['pickup_datetime'])

In [44]:
# Remove the 'key' identifier column; it is not used as a model input.
df_sample = df_sample.drop(columns=['key'])

In [47]:
import numpy as np

# Function to calculate Haversine distance (shortest distance over the Earth’s surface)
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometers.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Haversine distance in kilometers, computed element-wise with an
        Earth radius of 6371 km.
    """
    # Earth radius in kilometers
    R = 6371

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine term: squared half-chord length between the two points
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2

    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make arcsin(sqrt(a)) NaN; cap it at 1.
    a = np.minimum(a, 1.0)

    # Central angle between the two points
    c = 2 * np.arcsin(np.sqrt(a))

    # Return distance in kilometers
    return R * c

# Store the pickup-to-dropoff distance of every ride in a new 'distance_km'
# column; distance is expected to be a strong driver of the fare.
df_sample['distance_km'] = haversine_distance(
    lat1=df_sample['pickup_latitude'],
    lon1=df_sample['pickup_longitude'],
    lat2=df_sample['dropoff_latitude'],
    lon2=df_sample['dropoff_longitude'],
)

In [49]:
# Print summary statistics of the computed distances (label, Series method).
for label, stat in (
    ("Max distance:", "max"),
    ("Min distance:", "min"),
    ("Average distance:", "mean"),
    ("Standard deviation:", "std"),
):
    print(label, getattr(df_sample['distance_km'], stat)())

Max distance: 161.6861920323465
Min distance: 0.0
Average distance: 3.329978634647537
Standard deviation: 3.8405344120985685


In [51]:
# Keep only rides with a strictly positive distance. A distance of exactly 0
# means pickup and dropoff coordinates are identical, which is treated here
# as unusable for training.
df_sample = df_sample.loc[df_sample['distance_km'].gt(0)]

In [52]:
df_sample

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,year,day_of_week,is_peak_hour,distance_km
41732521,14.0,-73.991501,40.749802,-73.973831,40.757240,6,10,26,3,2015,3,0,1.702764
18934690,6.1,-74.004452,40.751948,-73.985082,40.746818,1,19,21,12,2010,1,1,1.728529
26844282,12.5,-73.997728,40.756262,-73.952148,40.811237,1,0,23,3,2012,4,0,7.217697
46649761,14.1,-74.000882,40.725652,-74.006125,40.744935,1,17,28,6,2012,3,1,2.189205
11108668,8.1,-73.989998,40.741774,-74.008457,40.749600,1,19,28,8,2012,1,1,1.781969
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28800003,9.0,-73.961902,40.780060,-73.985408,40.764010,1,20,6,12,2012,3,0,2.665185
38858097,5.5,-73.987373,40.770921,-73.978158,40.785999,1,20,28,11,2013,3,0,1.847437
30762041,4.5,-73.951018,40.777660,-73.955997,40.784836,1,21,22,8,2010,6,0,0.901358
11785642,5.7,-73.988015,40.754617,-73.982093,40.769722,1,12,24,7,2011,6,0,1.752090


In [54]:
# Select the model inputs for predicting the fare amount.
#
# Kept:
#   - passenger_count
#   - pickup / dropoff coordinates (location information)
#   - distance_km (trip length)
#   - hour (time of day)
#   - is_peak_hour (0/1 flag for the rush-hour window)
#
# Left out: year, month, day, day_of_week (judged to add little here).
final_features = [
    'passenger_count',
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
    'distance_km',
    'hour',
    'is_peak_hour',
]

# Label vector (target_fare) and feature matrix (fare_features)
target_fare = df_sample['fare_amount']
fare_features = df_sample[final_features]

## First Model: Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [58]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    fare_features, target_fare, test_size=0.2, random_state=42
)
X_train, X_test, Y_train, Y_test = split_parts

In [59]:
# Create an ordinary least-squares model and fit it on the training split.
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=Y_train)

In [60]:
# Predict fares for the held-out rows.
Y_pred = linear_model.predict(X_test)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
print(f"Linear Regression RMSE: {rmse:.2f}")

Linear Regression RMSE: 5.89


## Second Model: Random Forest Regression

In [63]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: an ensemble of decision trees.
#   n_estimators - number of trees
#   max_depth    - cap on tree depth, to limit overfitting
#   random_state - fixed seed for reproducibility
#   n_jobs=-1    - use all CPU cores
rf_model = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    max_depth=10,
    n_estimators=100,
)

# Fit on the training split, then predict the held-out rows.
rf_model.fit(X=X_train, y=Y_train)
Y_pred_rf = rf_model.predict(X_test)

# RMSE on the test split
mse_rf = mean_squared_error(Y_test, Y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(f"Random Forest RMSE: {rmse_rf:.2f}")

Random Forest RMSE: 4.43


## Third Model: Gradient Boost Regression

In [64]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with fixed, untuned settings.
#   n_estimators  - number of boosting stages
#   learning_rate - shrinkage applied to each stage's contribution
#   max_depth     - depth of each tree
#   random_state  - fixed seed for reproducibility
gbr_model = GradientBoostingRegressor(
    random_state=42,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
)

# Fit on the training split, then predict the held-out rows.
gbr_model.fit(X=X_train, y=Y_train)
Y_pred_gbr = gbr_model.predict(X_test)

# RMSE on the test split
mse_gbr = mean_squared_error(Y_test, Y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)
print(f"Gradient Boosting RMSE: {rmse_gbr:.2f}")

Gradient Boosting RMSE: 4.36


## Gradient Boost Regression with Tuned Parameters

In [65]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform

# Hyperparameter search space.
# scipy's randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = {
    # Number of boosting rounds (trees)
    "n_estimators": sp_randint(50, 200),
    # Shrinkage applied to each tree's contribution
    "learning_rate": uniform(0.01, 0.15),
    # Maximum depth of each tree
    "max_depth": sp_randint(3, 8),
    # Minimum number of samples required to split a node
    "min_samples_split": sp_randint(2, 10),
    # Minimum number of samples required at a leaf
    "min_samples_leaf": sp_randint(1, 6),
    # Fraction of rows used to fit each tree (0.6-0.9)
    "subsample": uniform(0.6, 0.3),
}

# Base estimator whose hyperparameters are tuned
gbr_base = GradientBoostingRegressor(random_state=42)

# Randomized search: 20 sampled configurations, each scored by 2-fold
# cross-validated negative RMSE (higher score = lower RMSE), run on all
# CPU cores with verbose logging and a fixed seed.
gbr_search = RandomizedSearchCV(
    gbr_base,
    param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split
gbr_search.fit(X_train, Y_train)

# Estimator with the best cross-validated score found by the search
best_gbr = gbr_search.best_estimator_

# Predict the held-out rows with the best estimator
Y_pred_best = best_gbr.predict(X_test)

# RMSE on the test split
mse_best = mean_squared_error(Y_test, Y_pred_best)
rmse_best = np.sqrt(mse_best)
print(f"Tuned Gradient Boosting RMSE: {rmse_best:.2f}")

Fitting 2 folds for each of 20 candidates, totalling 40 fits
Tuned Gradient Boosting RMSE: 4.32


## Fourth Model: XGBoost Regressor

In [70]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# XGBoost regressor with fixed settings.
#   n_estimators  - number of trees
#   max_depth     - depth of each tree
#   learning_rate - step size per tree
#   n_jobs=-1     - use all CPU cores
#   random_state  - fixed seed for reproducibility
xgb_model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
)

# Fit on the training split, then predict the held-out rows.
xgb_model.fit(X_train, Y_train)
Y_pred_xgb = xgb_model.predict(X_test)

# RMSE on the test split
mse_xgb = mean_squared_error(Y_test, Y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f"XGBoost RMSE: {rmse_xgb:.2f}")

XGBoost RMSE: 4.39


In [72]:
import mlflow
import mlflow.sklearn

def _log_model_run(run_name, model, rmse_value, param_names=()):
    """Log one fitted model to MLflow as its own run.

    Parameters
    ----------
    run_name : str
        Name of the MLflow run; also logged as the ``model_type`` parameter.
    model : estimator
        Fitted estimator exposing ``get_params()``.
    rmse_value : float
        Test-set RMSE, logged as the ``rmse`` metric.
    param_names : iterable of str or None
        Hyperparameters to log. Values are read from ``model.get_params()``
        so they always match the estimator. ``None`` logs every parameter.
    """
    with mlflow.start_run(run_name=run_name):
        # The first five test rows serve as the input example
        mlflow.sklearn.log_model(model, "model", input_example=X_test[:5])
        mlflow.log_param("model_type", run_name)

        fitted_params = model.get_params()
        if param_names is None:
            mlflow.log_params(fitted_params)
        else:
            for param_name in param_names:
                mlflow.log_param(param_name, fitted_params[param_name])

        mlflow.log_metric("rmse", rmse_value)


# Log every model trained on the stratified sample, one MLflow run per model.
# Run order matches the order in which the models were trained above.
_log_model_run("LinearRegression", linear_model, rmse)
_log_model_run("RandomForest", rf_model, rmse_rf,
               param_names=("n_estimators", "max_depth"))
_log_model_run("GradientBoosting", gbr_model, rmse_gbr,
               param_names=("n_estimators", "max_depth", "learning_rate"))
# The tuned model logs its full parameter set, since the search chose the values
_log_model_run("Tuned_GradientBoosting", best_gbr, rmse_best, param_names=None)
_log_model_run("XGBoost", xgb_model, rmse_xgb,
               param_names=("n_estimators", "max_depth", "learning_rate"))



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]