![snap](https://lever-client-logos.s3.amazonaws.com/2bd4cdf9-37f2-497f-9096-c2793296a75f-1568844229943.png)


# Part 2 : Pricing Prediction, ML & API

Author : Youenn PATAT

Here we will train several ML models, log them to MLflow, and then compare them to choose the best one to use for our API. The models will be the following:

* Multiple Linear Regression
* Ridge (if overfitting)
* Lasso (if overfitting)
* Random Forest
* AdaBoost
* XGBoost

## 📌 1 - Data Introduction & exploration (EDA)

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,  OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

import os
import mlflow
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
# Default MLflow tracking server. setdefault() keeps any APP_URI that was
# already provided by the shell or by the .env file read by load_dotenv(),
# instead of unconditionally overwriting it with the hard-coded URL.
os.environ.setdefault("APP_URI", "https://hyraxuna-mlflow-tracking.hf.space")

In [5]:
# Remote CSV holding the pricing dataset.
DATA_URL = "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_pricing_project.csv"

# The first CSV column is used as the DataFrame index, not as a feature.
data = pd.read_csv(DATA_URL, index_col=0)
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [6]:
n_rows = data.shape[0]
print("Number of lines:", n_rows)

# Summary statistics for every column, numeric and non-numeric alike.
print("Statistics :")
display(data.describe(include="all"))

# Share of missing values per column, expressed as a percentage of the rows.
print("percentage of missing values :")
missing_pct = 100 * data.isnull().sum() / n_rows
display(missing_pct)

Number of lines: 4843
Statistics :


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,28,,,4,10,8,2,2,2,2,2,2,2,
top,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,,140962.8,128.98823,,,,,,,,,,,121.214536
std,,60196.74,38.99336,,,,,,,,,,,33.568268
min,,-64.0,0.0,,,,,,,,,,,10.0
25%,,102913.5,100.0,,,,,,,,,,,104.0
50%,,141080.0,120.0,,,,,,,,,,,119.0
75%,,175195.5,135.0,,,,,,,,,,,136.0


percentage of missing values :


model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

There are no missing values, but there are some aberrant values (e.g. a negative mileage), so we will clean the data a little.

In [7]:
# Keep only rows whose mileage is plausible: between 0 and 800,000,
# both bounds included.
plausible_mileage = data["mileage"].between(0, 800000.0)
data = data[plausible_mileage]


In [8]:
# Re-inspect the dataset after the mileage filter.
print("Number of lines:", len(data))

print("Statistics :")
summary = data.describe(include="all")
display(summary)

Number of lines: 4841
Statistics :


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4841,4841.0,4841.0,4841,4841,4841,4841,4841,4841,4841,4841,4841,4841,4841.0
unique,28,,,4,10,8,2,2,2,2,2,2,2,
top,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,968,,,4639,1631,1606,2660,3838,3863,3880,2611,3672,4512,
mean,,140814.403222,128.975418,,,,,,,,,,,121.200372
std,,58893.227773,38.970348,,,,,,,,,,,33.481423
min,,476.0,0.0,,,,,,,,,,,10.0
25%,,102943.0,100.0,,,,,,,,,,,104.0
50%,,141080.0,120.0,,,,,,,,,,,119.0
75%,,175174.0,135.0,,,,,,,,,,,136.0


In [9]:
# Preview the first rows of the filtered dataset.
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [10]:
# Scatter-plot matrix of the dataset for a quick bivariate overview.
fig = px.scatter_matrix(data)

# x=0.5 places the title at the horizontal centre of the figure.
plot_title = go.layout.Title(text="Bivariate analysis", x=0.5)
fig.update_layout(
    title=plot_title,
    showlegend=False,
    autosize=False,
    height=2000,
    width=2000,
)
fig.show()

## 📌 2 - Preprocessing

* Separating target and features

In [11]:
print("Separating labels from features...")
target_variable = "rental_price_per_day"

# Y is the column to predict; X is every remaining column.
Y = data[target_variable]
X = data.drop(columns=[target_variable])

print("...Done.")
print()

# Quick look at both parts.
print("Y : ")
print(Y.head())
print()
print("X :")
print(X.head())

Separating labels from features...
...Done.

Y : 
0    106
1    264
2    101
3    158
4    183
Name: rental_price_per_day, dtype: int64

X :
  model_key  mileage  engine_power    fuel paint_color     car_type  \
0   Citroën   140411           100  diesel       black  convertible   
1   Citroën    13929           317  petrol        grey  convertible   
2   Citroën   183297           120  diesel       white  convertible   
3   Citroën   128035           135  diesel         red  convertible   
4   Citroën    97097           160  diesel      silver  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
1                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True     

* Prepare preprocessing pipeline

In [12]:
# Classify each feature column from the textual name of its dtype: a name
# containing "float" or "int" is numeric; every other column (strings,
# booleans, ...) is treated as categorical.
dtype_names = [(column, str(dtype)) for column, dtype in X.dtypes.items()]

numeric_features = [
    column for column, name in dtype_names if "float" in name or "int" in name
]
categorical_features = [
    column
    for column, name in dtype_names
    if not ("float" in name or "int" in name)
]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)
     

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [13]:
# Pipeline for numeric features: standardise each column.
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
    ]
)

# Pipeline for categorical features: one-hot encoding.
# - drop="first" removes one column per feature to avoid creating perfectly
#   correlated dummy columns.
# - handle_unknown="ignore" makes a category that was absent from the fitted
#   data encode as all zeros instead of raising at transform time. This matters
#   because the fitted preprocessor is reused on data it has never seen
#   (the test split, and later any input it is applied to after unpickling).
categorical_transformer = Pipeline(
    steps=[
        (
            "encoder",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
        ),
    ]
)

# ColumnTransformer routes each group of columns to its own pipeline and
# concatenates the results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

* Separating train and test

In [14]:
print("Dividing into train and test sets...")
# 30 % of the rows are held out for testing; the fixed seed keeps the split
# identical from one execution to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
print("...Done.")

Dividing into train and test sets...
...Done.


* Apply preprocessing

In [15]:
# Train set: fit the preprocessor and transform in a single step.
# X_train is rebound to the transformed matrix.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print("...Done.")
print(X_train[:5])
print()

# Test set: transform only, reusing what was fitted on the train set.
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print("...Done.")
print(X_test[:5, :])

Performing preprocessings on train set...
     model_key  mileage  engine_power    fuel paint_color    car_type  \
3738       BMW   132937            85  diesel       black  subcompact   
418    Renault   164633            85  diesel       black      estate   
1697   Peugeot   145890           100  diesel       black      estate   
988    Peugeot   108380           100  diesel       brown      estate   
1062   Renault   148128           135  diesel        grey      estate   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
3738                      False     True                 False          False   
418                        True     True                 False          False   
1697                       True     True                  True          False   
988                       False     True                 False          False   
1062                       True     True                 False           True   

      has_getaround_connect  has

In [46]:
import pickle

# Serialise the preprocessor to disk (presumably so it can be reloaded at
# prediction time — confirm against the API code).
with open("preprocessor.pkl", "wb") as pickle_file:
    pickle.dump(preprocessor, pickle_file)

## 📌 3 - Models

### 🟣 - Linear Regression

In [47]:
# Baseline model: plain linear regression on the preprocessed features.
print("Training model...")
linear_regression = LinearRegression()
linear_regression.fit(X_train, Y_train)
print("...Done")

Training model...
...Done


In [48]:
# Compare the fit on seen (train) and unseen (test) data.
train_r2 = linear_regression.score(X_train, Y_train)
print("R2 score on training set : ", train_r2)

test_r2 = linear_regression.score(X_test, Y_test)
print("R2 score on test set : ", test_r2)

R2 score on training set :  0.7219610785582187
R2 score on test set :  0.6923980140377068


In [49]:
# 5-fold cross-validation on the training set, using the estimator's default
# scorer (so the "accuracy" in the message below is really that score).
scores = cross_val_score(linear_regression, X_train, Y_train, cv=5)
avg, std = scores.mean(), scores.std()
print(f"Cross-validated accuracy : {avg}\nstandard deviation : {std}")

# Score each split once and reuse the value in the messages.
test_r2 = linear_regression.score(X_test, Y_test)
print("R2 score on test set is finally: ", test_r2 + std, "or ", test_r2 - std)

train_r2 = linear_regression.score(X_train, Y_train)
print("and that's under the R2 score on training set :", train_r2)

Cross-validated accuracy : 0.7022015903001093
standard deviation : 0.034752299691016945
R2 score on test set is finally:  0.7271503137287237 or  0.65764571434669
and that's under the R2 score on training set : 0.7219610785582187


The model seems to overfit slightly, so we will try Ridge and Lasso regularization next.

In [53]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME = "getaround-project"
run_name = "linear_regression_baseline"

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment, then fetch its metadata
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name=run_name):

    # Instantiate and fit the model
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, Y_train)

    # Evaluate on the held-out test set
    predicted_qualities = linear_regression.predict(X_test)
    r2 = linear_regression.score(X_test, Y_test)

    # Print results
    print("LinearRegression model")
    print(f"R2: {r2}")



LinearRegression model
R2: 0.6923980140377068
🏃 View run linear_regression_baseline at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/405ce27f213c4eec8b5ad72cf25297c0
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3


#### Ridge

In [25]:
print("Grid search...")

# Candidate regularisation strengths to try.
params = {
    "alpha": [
        0.0001, 0.001, 0.0011, 0.0012, 0.0013, 0.0015, 0.002,
        0.01, 0.05, 0.1, 0.5, 1, 5, 10,
    ]
}
regressor = Ridge()

# Search over the grid with 10-fold cross-validation.
best_ridge = GridSearchCV(regressor, param_grid=params, cv=10)
best_ridge.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 1}
Best R2 score :  0.7141139610516515


In [None]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME = "getaround-project"
run_name = "ridge_grid"

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment, then fetch its metadata
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name=run_name):

    # Candidate regularisation strengths to try
    params = {
        "alpha": [
            0.0001, 0.001, 0.0011, 0.0012, 0.0013, 0.0015, 0.002,
            0.01, 0.05, 0.1, 0.5, 1, 5, 10,
        ]
    }

    # Instantiate the model and search the grid with 10-fold cross-validation
    regressor = Ridge()
    best_ridge = GridSearchCV(regressor, param_grid=params, cv=10)
    best_ridge.fit(X_train, Y_train)

    # Evaluate the selected model on the held-out test set
    predicted_qualities = best_ridge.predict(X_test)
    r2 = best_ridge.score(X_test, Y_test)

    # Print results
    print("ridge model")
    print(f"R2: {r2}")

2025/03/23 14:24:44 INFO mlflow.sklearn.utils: Logging the 5 best runs, 9 runs will be omitted.


🏃 View run gaudy-mouse-122 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/07eac53a88594e56b75732cb6c699044
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run adaptable-wren-156 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/71342ec9cdab4d3c8aa0ff919f481eb2
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run industrious-loon-954 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/89c1c3cfae66436fb47446fadc7db6aa
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run unleashed-loon-434 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/90e90229c8f544389bb39220afe3aa27
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run gaudy-doe-482 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/ef7aeb8737b94860a9ce8dd4e51693b9
🧪 View experiment at: https://hyraxu

🏃 View run dapper-colt-17 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/41aa6f22645848bf955defbb5dafdc1f
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run traveling-sloth-925 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/6b1c18f2283e4d079219800b7757f5e7
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run debonair-fawn-507 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/09f9c16ed1264ab595945a804146699f
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run bustling-shrike-292 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/445f252ee30c4362852248b367f998c9
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
🏃 View run wistful-lamb-551 at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/3653940051274d899dc098519883c61b
🧪 View experiment at: https://hyrax

#### Lasso

In [26]:
print("Grid search...")

# Candidate regularisation strengths to try.
params = {
    "alpha": [
        0.005, 0.01, 0.1, 0.5, 0.8, 1, 2, 3, 5, 10,
        20, 25, 26, 27, 28, 30, 32, 35, 40,
    ],
}
regressor = Lasso()

# Search over the grid with 10-fold cross-validation.
best_lasso = GridSearchCV(regressor, param_grid=params, cv=10)
best_lasso.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.005}
Best R2 score :  0.7131319960672331


In [55]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME = "getaround-project"
run_name = "lasso_grid"

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment, then fetch its metadata
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name=run_name):

    # Candidate regularisation strengths to try
    params = {
        "alpha": [
            0.005, 0.01, 0.1, 0.5, 0.8, 1, 2, 3, 5, 10,
            20, 25, 26, 27, 28, 30, 32, 35, 40,
        ],
    }

    # Instantiate the model and search the grid with 10-fold cross-validation
    regressor = Lasso()
    best_lasso = GridSearchCV(regressor, param_grid=params, cv=10)
    best_lasso.fit(X_train, Y_train)

    # Evaluate the selected model on the held-out test set
    predicted_qualities = best_lasso.predict(X_test)
    r2 = best_lasso.score(X_test, Y_test)

    # Print results
    print("Lasso model")
    print(f"R2: {r2}")

2025/03/23 14:25:16 INFO mlflow.sklearn.utils: Logging the 5 best runs, 14 runs will be omitted.


Lasso model
R2: 0.6907629100720292
🏃 View run lasso_grid at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/671f7cb92f89480396dd25aeace146cb
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3


### 🟣 - Random Forest

In [27]:
# Random forest with out-of-bag scoring enabled.
# random_state is fixed (same seed as the train/test split) so that the grid
# search selects the same model and reports the same scores on every run;
# without it the bootstrap sampling differs between executions.
rf = RandomForestRegressor(oob_score=True, random_state=0)

# Hyperparameter grid: tree depth, split/leaf size constraints, forest size.
params = {'max_depth' : [5, 7, 10],
         'min_samples_split' : [3, 5, 7, 10],
         'min_samples_leaf': [2, 5, 10],
         'n_estimators': [50, 80, 100, 150],}

print("Grid search...")

# Exhaustive search with 5-fold cross-validation, ranked by R2.
grid =  GridSearchCV(estimator=rf, param_grid= params, cv = 5, scoring='r2')
grid.fit(X_train, Y_train)

print("...Done.")
print("Best hyperparameters : ", grid.best_params_)
print("Best validation score : ", grid.best_score_)
print()
# grid.score() evaluates the best estimator found by the search.
print("score on training set : ", grid.score(X_train, Y_train))
print("score on test set : ", grid.score(X_test, Y_test))

Grid search...
...Done.
Best hyperparameters :  {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 80}
Best validation score :  0.7498131591616308

score on training set :  0.8659785790312162
score on test set :  0.6988046855886725


In [56]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME="getaround-project"

run_name = 'randomforest_grid'

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# Fetch the experiment's metadata
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name = run_name):

    # Instantiate the model.
    # random_state is fixed (same seed as the train/test split) so the logged
    # run is reproducible; without it the bootstrap sampling, and therefore
    # the selected model and its R2, differ between executions.
    rf = RandomForestRegressor(oob_score=True, random_state=0)
    params = {'max_depth' : [5, 7, 10],
            'min_samples_split' : [3, 5, 7, 10],
            'min_samples_leaf': [2, 5, 10],
            'n_estimators': [50, 80, 100, 150],}

    # Exhaustive search with 5-fold cross-validation, ranked by R2
    grid =  GridSearchCV(estimator=rf, param_grid= params, cv = 5, scoring='r2')
    grid.fit(X_train, Y_train)

    # Evaluate the selected model on the held-out test set
    predicted_qualities = grid.predict(X_test)
    r2 = grid.score(X_test, Y_test)

    # Print results 
    print("RandomForest model")
    print("R2: {}".format(r2))

2025/03/23 14:42:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, 139 runs will be omitted.


RandomForest model
R2: 0.7219914197852075
🏃 View run randomforest_grid at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/98de0d148adc49cd8636bee7ab82c59e
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3


### 🟣 - AdaBoost

In [28]:
print("Grid search...")

# AdaBoost built on linear-regression base learners.
# random_state is fixed (same seed as the train/test split) so the search is
# reproducible; without it the boosting resampling differs between executions.
lr = LinearRegression()
regressor_adaboost = AdaBoostRegressor(estimator=lr, random_state=0)

param_grid = {
    'n_estimators': [10, 15, 20, 30, 40, 50, 100],         # Number of estimators (weak learners)
    'learning_rate': [0.01, 0.1, 1.0],    # Learning rate
}

print(param_grid)
# Exhaustive search with 10-fold cross-validation, ranked by R2.
gridsearch_ada = GridSearchCV(
    regressor_adaboost, param_grid=param_grid, cv=10, verbose = 1, scoring='r2')  
gridsearch_ada.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_ada.best_params_)
print("Best validation score : ", gridsearch_ada.best_score_)
print()
# score() evaluates the best estimator found by the search.
print("score on training set : ", gridsearch_ada.score(X_train, Y_train))
print("score on test set : ", gridsearch_ada.score(X_test, Y_test))

Grid search...
{'n_estimators': [10, 15, 20, 30, 40, 50, 100], 'learning_rate': [0.01, 0.1, 1.0]}
Fitting 10 folds for each of 21 candidates, totalling 210 fits
...Done.
Best hyperparameters :  {'learning_rate': 0.01, 'n_estimators': 20}
Best validation score :  0.7127546499783961

score on training set :  0.7232947853404637
score on test set :  0.6716005039507367


In [57]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME="getaround-project"

run_name = 'adaboost_grid'

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment
mlflow.set_experiment(EXPERIMENT_NAME)

# Fetch the experiment's metadata
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name = run_name):

    # Instantiate the model: AdaBoost built on linear-regression base learners.
    # random_state is fixed (same seed as the train/test split) so the logged
    # run is reproducible; without it the boosting resampling, and therefore
    # the selected model and its R2, differ between executions.
    lr = LinearRegression()
    regressor_adaboost = AdaBoostRegressor(estimator=lr, random_state=0)

    param_grid = {
        'n_estimators': [10, 15, 20, 30, 40, 50, 100],         # Number of estimators (weak learners)
        'learning_rate': [0.01, 0.1, 1.0],    # Learning rate
    }

    print(param_grid)
    # Exhaustive search with 10-fold cross-validation, ranked by R2
    gridsearch_ada = GridSearchCV(
        regressor_adaboost, param_grid=param_grid, cv=10, verbose = 1, scoring='r2')  
    gridsearch_ada.fit(X_train, Y_train)

    # Evaluate the selected model on the held-out test set
    predicted_qualities = gridsearch_ada.predict(X_test)
    r2 = gridsearch_ada.score(X_test, Y_test)

    # Print results 
    print("AdaGrid model")
    print("R2: {}".format(r2))



{'n_estimators': [10, 15, 20, 30, 40, 50, 100], 'learning_rate': [0.01, 0.1, 1.0]}
Fitting 10 folds for each of 21 candidates, totalling 210 fits


2025/03/23 14:44:48 INFO mlflow.sklearn.utils: Logging the 5 best runs, 16 runs will be omitted.


AdaGrid model
R2: 0.6915719760279171
🏃 View run adaboost_grid at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/74defaa036cf419e97ebd29875bb6cf1
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3


### 🟣 - XGBoost

In [16]:
# Hyperparameter search for the XGBoost regressor
print("Grid search...")
xgboost = XGBRegressor(reg_lambda=1)

# Values to be tested for each hyperparameter
param_grid = {
    # Number of estimators (trees)
    "n_estimators": [50, 100, 150],
    # Learning rate
    "learning_rate": [0.01, 0.1, 1.0],
    # Maximum depth of trees
    "max_depth": [3, 5, 10],
    # Minimum sum of instance weight (hessian) needed in a child
    "min_child_weight": [1, 2, 4],
    # Minimum loss reduction required to make a further partition on a leaf node
    "gamma": [0, 0.1, 0.2],
    # Fraction of samples used for fitting the trees
    "subsample": [0.8, 0.9, 1.0],
    # Fraction of features used for fitting the trees
    "colsample_bytree": [0.1, 0.3, 0.5, 0.7],
}
print(param_grid)

# Exhaustive search with 5-fold cross-validation, ranked by R2
gridsearch_xgboost = GridSearchCV(
    xgboost,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    scoring="r2",
)
gridsearch_xgboost.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_xgboost.best_params_)
print("Best validation score : ", gridsearch_xgboost.best_score_)
print()

# score() evaluates the best estimator found by the search
train_score = gridsearch_xgboost.score(X_train, Y_train)
print("score on training set : ", train_score)
test_score = gridsearch_xgboost.score(X_test, Y_test)
print("score on test set : ", test_score)


Grid search...
{'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 10], 'min_child_weight': [1, 2, 4], 'gamma': [0, 0.1, 0.2], 'subsample': [0.8, 0.9, 1.0], 'colsample_bytree': [0.1, 0.3, 0.5, 0.7]}
Fitting 5 folds for each of 2916 candidates, totalling 14580 fits
...Done.
Best hyperparameters :  {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 150, 'subsample': 0.9}
Best validation score :  0.7634634852409363

score on training set :  0.8552883267402649
score on test set :  0.7565619945526123


In [58]:
# Names under which this training run is recorded in MLflow
EXPERIMENT_NAME = "getaround-project"
run_name = "xgboost_grid"

# Point MLflow at the tracking server whose URI is stored in APP_URI
mlflow.set_tracking_uri(os.environ["APP_URI"])

# Select the experiment, then fetch its metadata
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Turn on MLflow's automatic logging for scikit-learn
mlflow.sklearn.autolog()


with mlflow.start_run(run_name=run_name):

    # Instantiate the model
    xgboost = XGBRegressor(reg_lambda=1)

    # Values to be tested for each hyperparameter.
    # NOTE(review): subsample and colsample_bytree list 7 values each here,
    # which makes this grid much larger than a 3- or 4-value grid would be —
    # confirm the wider search is intended.
    param_grid = {
        # Number of estimators (trees)
        "n_estimators": [50, 100, 150],
        # Learning rate
        "learning_rate": [0.01, 0.1, 1.0],
        # Maximum depth of trees
        "max_depth": [3, 5, 10],
        # Minimum sum of instance weight (hessian) needed in a child
        "min_child_weight": [1, 2, 4],
        # Minimum loss reduction required to make a further partition on a leaf node
        "gamma": [0, 0.1, 0.2],
        # Fraction of samples used for fitting the trees
        "subsample": [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0],
        # Fraction of features used for fitting the trees
        "colsample_bytree": [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0],
    }
    print(param_grid)

    # Exhaustive search with 5-fold cross-validation, ranked by R2
    gridsearch_xgboost = GridSearchCV(
        xgboost,
        param_grid=param_grid,
        cv=5,
        verbose=1,
        scoring="r2",
    )
    gridsearch_xgboost.fit(X_train, Y_train)

    # Evaluate the selected model on the held-out test set
    predicted_qualities = gridsearch_xgboost.predict(X_test)
    r2 = gridsearch_xgboost.score(X_test, Y_test)

    # Print results
    print("XGBoostGrid model")
    print(f"R2: {r2}")



{'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 10], 'min_child_weight': [1, 2, 4], 'gamma': [0, 0.1, 0.2], 'subsample': [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0], 'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1.0]}
Fitting 5 folds for each of 11907 candidates, totalling 59535 fits


2025/03/23 16:31:29 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11902 runs will be omitted.


XGBoostGrid model
R2: 0.7565619945526123
🏃 View run xgboost_grid at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3/runs/c2037b0c2c9e4c629a02b7b8a7eb2642
🧪 View experiment at: https://hyraxuna-mlflow-tracking.hf.space/#/experiments/3
