In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
import numpy as np
# Export model
import os
import joblib
import re

In [30]:
#* Data set
df = pd.read_csv("../data/cleaned_set.csv")


#* PARMAS
NEW_VERSION=True

ENCODER_SCALER_URL="./lib"
SAVE_ENCODER_SCALER=False


# Encodage des variables categorielles

In [31]:
categorical_cols = [
    'transmission',
    'fuel_type',
    'brand',
    'model',
    'origin',
    'condition',
]

# Initialize encoder with sparse=False to get a NumPy array
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

# Fit and transform
df_catcol_encoded = encoder.fit_transform(df[categorical_cols])

# Create DataFrame with proper column names
df_encoded = pd.DataFrame(df_catcol_encoded, 
                          columns=encoder.get_feature_names_out(categorical_cols),
                          index=df.index)  # keep index aligned with original df

# Optionally, concatenate back with the original dataframe (without the original categorical columns)
df = pd.concat([df.drop(columns=categorical_cols), df_encoded], axis=1)

# Normalization

In [32]:

scaler = StandardScaler()
df[['mileage', 'tax_horsepower', 'car_age', 'number_of_doors']] = scaler.fit_transform(
    df[['mileage', 'tax_horsepower', 'car_age', 'number_of_doors']]
)


In [33]:
if SAVE_ENCODER_SCALER:
    joblib.dump(encoder, 'lib/encoder.pkl')
    joblib.dump(scaler, 'lib/scaler.pkl')

# Separation des donnees (train | test) + Entrainement du modele

In [34]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)


#* Custom test input
my_custom_df = pd.DataFrame([{
    "model_year": 2020,
    "mileage": 45000.5,
    "number_of_doors": 4,
    "first_owner": True,
    "tax_horsepower": 7.5,
    "abs": True,
    "airbags": 6,
    "multimedia": True,
    "backup_camera": True,
    "air_conditioning": True,
    "esp": True,
    "aluminum_wheels": True,
    "speed_limiter": False,
    "onboard_computer": True,
    "parking_sensors": True,
    "cruise_control": False,
    "leather_seats": False,
    "navigation_gps": True,
    "sunroof": False,
    "remote_central_locking": True,
    "power_windows": True,
    "car_age": 30,
    "transmission": "somehting ejse",
    "fuel_type": "diesel",
    "brand": "pegeut",
    "model": "Corolla",
    "origin": "ww maroc",
    "condition": "used",
}])

# Fit and transform
cstm_catcol_encoded = encoder.transform(my_custom_df[categorical_cols])

# Create DataFrame with proper column names
cstm_encoded = pd.DataFrame(cstm_catcol_encoded, 
                          columns=encoder.get_feature_names_out(categorical_cols),
                          index=my_custom_df.index)  # keep index aligned with original df

# Optionally, concatenate back with the original dataframe (without the original categorical columns)
my_custom_df = pd.concat([my_custom_df.drop(columns=categorical_cols), cstm_encoded], axis=1)

my_custom_df[['mileage', 'tax_horsepower', 'car_age', 'number_of_doors']] = scaler.fit_transform(
    my_custom_df[['mileage', 'tax_horsepower', 'car_age', 'number_of_doors']]
)

# # Predict
car_price = model.predict(my_custom_df)
print("Predicted car price:", car_price[0])


Predicted car price: 194758.1350634396


In [35]:
# Directory to save models
if NEW_VERSION:
    
    MODEL_DIR = "."
    os.makedirs(MODEL_DIR, exist_ok=True)

    def get_next_model_version():
        existing_models = [f for f in os.listdir(MODEL_DIR) if re.match(r"v\d+\.pkl", f)]
        versions = [int(re.findall(r'\d+', fname)[0]) for fname in existing_models]
        next_version = max(versions) + 1 if versions else 1
        return f"v{next_version}.pkl"


    # Save with version
    model_filename = get_next_model_version()
    joblib.dump(model, os.path.join(MODEL_DIR, model_filename))
    print(f"Model saved as {model_filename}")


Model saved as v1.pkl


# Evaluation du modele 

In [36]:
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 28994.073414207658
R2 Score: 0.7998427865196754
