In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the cleaned car data from disk
cleaned_df = pd.read_csv('../data/cleaned_car_data.csv')

# The 'price' column is the regression target; every other column is a feature
y = cleaned_df['price']
X = cleaned_df.drop('price', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of both feature partitions
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (86392, 19)
Testing data shape: (21599, 19)


In [10]:
import os
import warnings

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from xgboost import XGBRegressor

# Suppress specific warnings
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='_distutils_hack')

# Tracking server location.
# The EC2 public DNS name below is only the default: such names can change when the
# instance is stopped and restarted, and an explicit set_tracking_uri() call takes
# precedence over the environment. Reading MLFLOW_TRACKING_URI here lets the notebook
# be pointed at another server without editing this cell.
EC2_PUBLIC_DNS = 'ec2-16-16-217-131.eu-north-1.compute.amazonaws.com'
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", f"http://{EC2_PUBLIC_DNS}:5000")
)

# Set the MLflow experiment
mlflow.set_experiment("car_price_prediction_vanilla_models")

def train_and_evaluate(model, model_name):
    """Fit ``model``, score it on the test split, and record the run in MLflow.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``; the cell that creates the split must have been
    run first.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place.
        model_name: Label used as the MLflow run name, as the value of the
            ``"model"`` parameter, and as the second argument to
            ``mlflow.sklearn.log_model``.

    Returns:
        The RMSE of the predictions on the test split, so callers can compare
        models programmatically instead of reading the printed line.
    """
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)

        # Predict on test set
        y_pred = model.predict(X_test)

        # Calculate RMSE
        rmse = root_mean_squared_error(y_test, y_pred)

        # Log parameters and metrics
        mlflow.log_param("model", model_name)
        mlflow.log_metric("rmse", rmse)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} - RMSE: {rmse:.4f}")

    # Returned after the run has been closed; the value is the same one logged above.
    return rmse

# Baseline estimators: only the seed (and the XGBoost objective) is set explicitly
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Send each estimator through the same train / evaluate / log routine, in this order
for run_label, estimator in (
    ("RandomForestRegressor", rf_model),
    ("GradientBoostingRegressor", gb_model),
    ("XGBRegressor", xgb_model),
):
    train_and_evaluate(estimator, run_label)


RandomForestRegressor - RMSE: 2423.3161
GradientBoostingRegressor - RMSE: 3323.8886
XGBRegressor - RMSE: 2426.0349


### Recommendations:

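- **XGBoost** and **Random Forest** are effectively tied on test RMSE, the only metric computed above: Random Forest scores 2423.32 and XGBoost 2426.03 (a difference of about 0.1%), while Gradient Boosting trails at 3323.89. On this single split XGBoost is therefore not more accurate than Random Forest; either is a reasonable choice.
- **Random Forest** has the lowest RMSE by a small margin and is a good option if you need a model that's potentially easier to tune and interpret.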

- Consider further hyperparameter tuning for XGBoost and Random Forest to possibly improve performance even more.
