# Imports

In [None]:
import pandas as pd
import time
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, r2_score

# Dataset exploration

In [2]:
# Load the Getaround pricing CSV from the local project folder and preview
# its first rows (shown as the cell output).
pricing_csv = r"G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\CERTIF_PROJECTS\ML_Engineer_Certification_Projects\07_DEPLOYMENT_Getaround\src\get_around_pricing_project.csv"
data_price = pd.read_csv(pricing_csv)
data_price.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [11]:
# Remove the "Unnamed: 0" column (it appears to be a leftover index column
# from the CSV export), then print the column / dtype summary.
data_price = data_price.drop(columns=["Unnamed: 0"])
data_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4843 non-null   object
 1   mileage                    4843 non-null   int64 
 2   engine_power               4843 non-null   int64 
 3   fuel                       4843 non-null   object
 4   paint_color                4843 non-null   object
 5   car_type                   4843 non-null   object
 6   private_parking_available  4843 non-null   bool  
 7   has_gps                    4843 non-null   bool  
 8   has_air_conditioning       4843 non-null   bool  
 9   automatic_car              4843 non-null   bool  
 10  has_getaround_connect      4843 non-null   bool  
 11  has_speed_regulator        4843 non-null   bool  
 12  winter_tires               4843 non-null   bool  
 13  rental_price_per_day       4843 non-null   int64 
dtypes: bool(

# Training model : Gradient boosting
### Model choice: because there are many categorical variables, we need a model that captures the complexity of the data to achieve finer predictions (linear regression and other ensemble models were also tried; this is the one with the best scores and the least overfitting)

In [None]:
# Train a gradient-boosting price regressor and track the run with MLflow.
#
# Steps: point MLflow at the remote tracking server, select the experiment,
# load the pricing CSV, split train/test, build a preprocessing + regressor
# pipeline, fit it inside an MLflow run, log the fitted pipeline, and print
# the train/test R^2 scores plus the elapsed wall-clock time.
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")

# Set your variables for your environment
EXPERIMENT_NAME = "getaround_model_test_1"
DATA_PATH = r"G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\CERTIF_PROJECTS\ML_Engineer_Certification_Projects\07_DEPLOYMENT_Getaround\src\get_around_pricing_project.csv"

mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # Fail here with a clear message instead of hitting an AttributeError on
    # `experiment.experiment_id` further down, after the data has been loaded.
    raise RuntimeError(f"Experiment '{EXPERIMENT_NAME}' does not exist.")
print("Experiment ID:", experiment.experiment_id)
print("Artifact Location:", experiment.artifact_location)

# Start experiment time tracking
start_time = time.time()
# Autologging is enabled without model logging; the fitted pipeline is
# logged explicitly inside the run below.
mlflow.sklearn.autolog(log_models=False)

# Load dataset
data_price = pd.read_csv(DATA_PATH)
data_price = data_price.drop(columns="Unnamed: 0")

# Preprocessing: separate the target from the features, hold out 20 % for test
target_variable = "rental_price_per_day"
X = data_price.drop(target_variable, axis=1)
Y = data_price.loc[:, target_variable]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Float/int columns are scaled; every other column (object and bool dtypes)
# is one-hot encoded.
numeric_features = list(X.select_dtypes(include=['float', 'int']).columns)
categorical_features = list(X.select_dtypes(exclude=['float', 'int']).columns)

# Check
print("Found numeric features ", numeric_features)
print("Found categorical features ", categorical_features)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ]
)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor())
])
print("preprocess OK")

# Start training. The `with` block ends the MLflow run on exit, so no
# explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run(experiment_id=experiment.experiment_id):
    model.fit(X_train, Y_train)
    print("model trained")
    # Not used within this cell; kept in the namespace in case later cells
    # rely on it.
    X_train_pred = model.predict(X_train)
    # NOTE(review): the artifact path is "XGBoost" although the regressor is a
    # GradientBoostingRegressor. Left unchanged because anything loading the
    # model by this path would break -- confirm before renaming.
    mlflow.sklearn.log_model(model, "XGBoost")

# Pipeline.score delegates to the regressor's score (R^2 for regressors).
print('train_score', model.score(X_train, Y_train))
print('test_score', model.score(X_test, Y_test))

# Report the wall-clock time measured since start_time.
print(f"Elapsed time: {time.time() - start_time:.1f} s")