In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the cleaned car dataset from disk
cleaned_df = pd.read_csv('../data/cleaned_car_data.csv')

# The 'price' column is the prediction target; every other column is a feature
y = cleaned_df['price']
X = cleaned_df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each split
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (86392, 19)
Testing data shape: (21599, 19)


In [7]:
# Feature scaling: the scaler's statistics are learned from the training split
# only and then reused unchanged for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: show the first five scaled training rows
print("First few rows of the scaled training data:")
print(X_train_scaled[:5])

First few rows of the scaled training data:
[[ 0.9171049  -1.06487922  0.4380831  -1.8997133   2.4206718   3.05252791
  -0.21935334 -0.24096514 -0.44371169 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361 -1.1391218   1.83331115 -0.16062932
   0.88124322]
 [-1.41870071  0.30215763 -2.09510711  1.04440611 -1.20361943 -0.32759733
  -0.21935334 -0.24096514  2.25371569 -0.21594036 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [ 0.9171049  -0.96480263  0.35364342 -0.75735782 -0.84119031 -0.32759733
  -0.21935334 -0.24096514 -0.44371169  4.63090817 -0.36700329 -0.24611195
  -0.25485438 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [ 0.9171049  -0.99095472  0.35364342  0.30141068 -1.20361943 -0.32759733
  -0.21935334 -0.24096514 -0.44371169 -0.21594036 -0.36700329 -0.24611195
   3.92380939 -0.37469737 -0.39570361  0.87786925 -0.54546114 -0.16062932
   0.88124322]
 [-0.951

In [10]:
import warnings
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

import os

# Silence UserWarnings raised from these two modules only; other warnings
# remain visible.
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='_distutils_hack')

# Default tracking-server host. This value is environment-specific, so the
# MLFLOW_TRACKING_URI environment variable (when set) takes precedence and the
# notebook can be pointed at another server without editing this cell.
EC2_PUBLIC_DNS='ec2-16-16-217-131.eu-north-1.compute.amazonaws.com'
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", f"http://{EC2_PUBLIC_DNS}:5000")
)

# All runs started below are grouped under this experiment
mlflow.set_experiment("car_price_prediction_vanilla_models")

def train_and_evaluate(model, model_name):
    """Fit ``model``, score it on the test split, and record the run in MLflow.

    Reads the notebook-level ``X_train_scaled``, ``y_train``, ``X_test_scaled``
    and ``y_test``. Inside a single MLflow run named ``model_name`` it logs the
    model name as a parameter, the test RMSE as a metric, and the fitted model
    itself, then prints the RMSE.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        model_name: Label used as the run name, the value of the ``model``
            parameter, and the artifact path of the logged model.
    """
    with mlflow.start_run(run_name=model_name):
        # Fit on the scaled training features, then score the held-out split
        model.fit(X_train_scaled, y_train)
        rmse = root_mean_squared_error(y_test, model.predict(X_test_scaled))

        # Record what was trained and how well it did
        mlflow.log_param("model", model_name)
        mlflow.log_metric("rmse", rmse)

        # Store the fitted estimator with the run
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} - RMSE: {rmse:.4f}")

# Baseline estimators, all seeded with the same random_state
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Train, score and log each baseline in turn (one MLflow run per model)
for estimator, run_label in (
    (rf_model, "RandomForestRegressor"),
    (gb_model, "GradientBoostingRegressor"),
    (xgb_model, "XGBRegressor"),
):
    train_and_evaluate(estimator, run_label)


RandomForestRegressor - RMSE: 2423.3161
GradientBoostingRegressor - RMSE: 3323.8886
XGBRegressor - RMSE: 2426.0349


### Recommendations:

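- **XGBoost** and **Random Forest** are effectively tied on test RMSE (2426.03 vs. 2423.32, with Random Forest marginally ahead), and both clearly outperform Gradient Boosting (3323.89). RMSE is the only metric logged so far, so no ranking beyond that can be claimed yet.
- **Random Forest** is therefore an equally strong performer and a good alternative if you need a model that's potentially easier to tune and interpret.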

- Consider further hyperparameter tuning for XGBoost and Random Forest to possibly improve performance even more.


In [14]:
# `xgb` is otherwise only imported in a later cell; import it here so this cell
# also runs on a fresh kernel executed top to bottom.
import xgboost as xgb

# Base estimator handed to the grid search
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')


In [15]:
# GridSearchCV and param_grid are otherwise only defined in a later cell;
# define them here so this cell also runs on a fresh kernel executed in order.
from sklearn.model_selection import GridSearchCV

# Search space: 3 values for each of 6 hyperparameters = 729 candidates
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# 5-fold cross-validated exhaustive search scored by negative MSE,
# using every available core
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1, verbose=1)


In [16]:
# Run the cross-validated grid search on the training split.
# Note: this fits on the unscaled X_train, not X_train_scaled.
# The recorded output reports 5 folds x 729 candidates = 3645 fits, so this is
# likely a long-running cell.
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [1]:
# Pull the winning estimator and its hyperparameters out of the finished search.
# `grid_search` must already exist in the current kernel (i.e. the fit cell has
# been run in this session); otherwise this raises NameError.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)


NameError: name 'grid_search' is not defined

In [None]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

# Score the tuned model on the held-out test split
y_pred_best = best_model.predict(X_test)
print("Best XGBoost MAE:", mean_absolute_error(y_test, y_pred_best))
# root_mean_squared_error replaces mean_squared_error(..., squared=False):
# the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6,
# and the baseline cell of this notebook already uses the dedicated function.
print("Best XGBoost RMSE:", root_mean_squared_error(y_test, y_pred_best))
print("Best XGBoost R² Score:", r2_score(y_test, y_pred_best))


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

# Define parameter grid: 3 values for each of 6 hyperparameters = 729 candidates
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Set up GridSearchCV: 5-fold CV scored by negative MSE, using all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1, verbose=1)

# Fit Grid Search (on the unscaled training split)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Evaluate the best model on the held-out test split
y_pred_best = best_model.predict(X_test)
print("Best XGBoost MAE:", mean_absolute_error(y_test, y_pred_best))
# root_mean_squared_error replaces mean_squared_error(..., squared=False):
# the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
print("Best XGBoost RMSE:", root_mean_squared_error(y_test, y_pred_best))
print("Best XGBoost R² Score:", r2_score(y_test, y_pred_best))


In [None]:
import pickle

# Serialize the tuned XGBoost estimator to a pickle file in the working directory
with open('xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [None]:
# Restore the pickled estimator from disk.
# Only unpickle files you created yourself: unpickling can run arbitrary code.
with open('xgboost_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the test features with the restored model
y_pred_loaded = loaded_model.predict(X_test)


In [None]:
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.svm import LinearSVR

# Let MLflow capture estimator parameters and training metrics automatically
mlflow.sklearn.autolog()

# NOTE(review): no cell in this notebook writes this file; it is logged only
# when it actually exists so a missing artifact does not abort the run.
preprocessor_path = Path("models/preprocessor.b")

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Both splits come from the single CSV loaded at the top of this
        # notebook, so both path parameters point at that file.
        mlflow.log_param("train-data-path", "../data/cleaned_car_data.csv")
        mlflow.log_param("valid-data-path", "../data/cleaned_car_data.csv")
        if preprocessor_path.exists():
            mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

        # Default-configured estimator, fitted on the unscaled training split
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        # Evaluate on the held-out test split: this notebook only creates
        # train/test splits, so there is no separate X_val / y_val.
        y_pred = mlmodel.predict(X_test)
        # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)

In [None]:
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

# Define your parameters and model
# NOTE(review): these hard-coded values rebind `best_params`, replacing whatever
# the grid search stored under that name — confirm this is intentional.
best_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 100
}

# Start MLflow run; binding it to `run` avoids a later active_run() lookup
with mlflow.start_run() as run:
    # Initialize and train the model on the unscaled training split
    xgb_model = xgb.XGBRegressor(**best_params)
    xgb_model.fit(X_train, y_train)

    # Make predictions on the held-out test split
    y_pred = xgb_model.predict(X_test)

    # Log parameters
    mlflow.log_params(best_params)

    # Log metrics
    mlflow.log_metric("MAE", mean_absolute_error(y_test, y_pred))
    # root_mean_squared_error replaces mean_squared_error(..., squared=False):
    # the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6.
    mlflow.log_metric("RMSE", root_mean_squared_error(y_test, y_pred))
    mlflow.log_metric("R2", r2_score(y_test, y_pred))

    # Log the model using the XGBoost flavor
    mlflow.xgboost.log_model(xgb_model, "model")

    # Print the run ID
    print("Run ID:", run.info.run_id)
