In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
# Export model
import os
import joblib
import re

In [61]:
#* Data set
df = pd.read_csv("../data/cleaned_set.csv")


#* PARMAS
NEW_VERSION=False

# Encodage des variables categorielles

In [62]:
categorical_cols = ['transmission', 'fuel_type', 'brand', 'model', 'origin', 'condition', 'first_owner']

# Initialize encoder
encoder = OneHotEncoder(drop='first', sparse_output=False,handle_unknown='ignore')

# Fit and transform only the target columns
encoded_array = encoder.fit_transform(df[categorical_cols])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols), index=df.index)

# Drop the original categorical columns and add encoded ones
df = df.drop(columns=categorical_cols)
df = pd.concat([df, encoded_df], axis=1)

for column in df.columns:
    print(column)


price
model_year
mileage
number_of_doors
tax_horsepower
abs
airbags
multimedia
backup_camera
air_conditioning
esp
aluminum_wheels
speed_limiter
onboard_computer
parking_sensors
cruise_control
leather_seats
navigation_gps
sunroof
remote_central_locking
power_windows
price_log
car_age
transmission_Manuelle
fuel_type_Electrique
fuel_type_Essence
fuel_type_Hybride
fuel_type_LPG
brand_Alfa Romeo
brand_Audi
brand_Autres
brand_BMW
brand_Bentley
brand_Cadillac
brand_Changan
brand_Chery
brand_Chevrolet
brand_Chrysler
brand_Citroen
brand_Cupra
brand_DFSK
brand_DS
brand_Dacia
brand_Daewoo
brand_Daihatsu
brand_Dodge
brand_Fiat
brand_Ford
brand_Foton
brand_GMC
brand_GWM Motors
brand_Geely
brand_Honda
brand_Hyundai
brand_Infiniti
brand_Isuzu
brand_Iveco
brand_Jaguar
brand_Jeep
brand_Kia
brand_Lancia
brand_Land Rover
brand_Lexus
brand_Lincoln
brand_MG
brand_Mahindra
brand_Maserati
brand_Mazda
brand_Mercedes-Benz
brand_Mini
brand_Mitsubishi
brand_Nissan
brand_Opel
brand_Peugeot
brand_Porsche
brand_Ren

# Normalization

In [63]:

scaler = StandardScaler()
df[['price', 'mileage', 'tax_horsepower', 'car_age', 'number_of_doors']] = scaler.fit_transform(
    df[['price', 'mileage', 'tax_horsepower', 'car_age', 'number_of_doors']]
)


# Separation des donnees (train | test) + Entrainement du modele

In [64]:
X = df.drop("price", axis=1)
y = df["price"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


def transformData(raw_input):
    # Step 2: Encode categorical columns
    categorical_cols = ['transmission', 'fuel_type', 'brand', 'model', 'origin', 'condition', 'first_owner']
    encoded_input = encoder.transform(raw_input[categorical_cols])
    encoded_df = pd.DataFrame(encoded_input, columns=encoder.get_feature_names_out(categorical_cols))

    # Step 3: Drop original categorical columns and add encoded ones
    raw_input = raw_input.drop(columns=categorical_cols)
    final_input = pd.concat([raw_input.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

    # Step 4: Scale numerical columns
    numerical_cols = ['mileage', 'tax_horsepower', 'number_of_doors']
    final_input[numerical_cols] = scaler.transform(final_input[numerical_cols])

    # Optional: If `car_age` is part of the training set, you might need to compute and scale it as well
    # final_input["car_age"] = 2024 - final_input["model_year"]
    # final_input["car_age"] = scaler.transform(final_input[["car_age"]])

    # Step 5: Drop unused or extra columns (like 'model_year' if not in training)
    if "model_year" in final_input.columns and "model_year" not in X.columns:
        final_input = final_input.drop(columns=["model_year"])
    
    return final_input


#* my own test
my_custom_df = pd.DataFrame([{
    "model_year": 2020,
    "transmission": "automatic",
    "fuel_type": "diesel",
    "mileage": 45000.5,
    "brand": "Toyota",
    "model": "Corolla",
    "number_of_doors": 4,
    "origin": "Germany",
    "first_owner": True,
    "tax_horsepower": 7.5,
    "condition": "used",
    "abs": True,
    "airbags": 6,
    "multimedia": True,
    "backup_camera": True,
    "air_conditioning": True,
    "esp": True,
    "aluminum_wheels": True,
    "speed_limiter": False,
    "onboard_computer": True,
    "parking_sensors": True,
    "cruise_control": True,
    "leather_seats": False,
    "navigation_gps": True,
    "sunroof": False,
    "remote_central_locking": True,
    "power_windows": True
}])

car_price= model.predict(transformData(my_custom_df))
car_price


ValueError: y contains previously unseen labels: np.True_

In [None]:
# Directory to save models
if NEW_VERSION:
    
    MODEL_DIR = "."
    os.makedirs(MODEL_DIR, exist_ok=True)

    def get_next_model_version():
        existing_models = [f for f in os.listdir(MODEL_DIR) if re.match(r"v\d+\.pkl", f)]
        versions = [int(re.findall(r'\d+', fname)[0]) for fname in existing_models]
        next_version = max(versions) + 1 if versions else 1
        return f"v{next_version}.pkl"


    # Save with version
    model_filename = get_next_model_version()
    joblib.dump(model, os.path.join(MODEL_DIR, model_filename))
    print(f"Model saved as {model_filename}")


# Evaluation du modele 

In [None]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 0.024829234850177018
R2 Score: 0.9358914727430943
