In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Data preprocessing
# Clean price column
data['prix'] = data['prix'].str.replace('€', '').str.replace(' ', '').astype(float)

# Clean kilometrage column
data['kilometrage'] = data['kilometrage'].str.replace(' km', '').str.replace(' ', '').astype(float)

# Clean puissance column
data['puissance'] = data['puissance'].str.extract('(\d+)').astype(float)

# Clean emission_CO2 column
data['emission_CO2'] = data['emission_CO2'].str.extract('(\d+)').astype(float)

# Clean annee_mise_en_circulation
data['annee_mise_en_circulation'] = pd.to_datetime(data['annee_mise_en_circulation'], format='%d.%m.%Y').dt.year


  data['puissance'] = data['puissance'].str.extract('(\d+)').astype(float)
  data['emission_CO2'] = data['emission_CO2'].str.extract('(\d+)').astype(float)
  data['puissance'] = data['puissance'].str.extract('(\d+)').astype(float)
  data['emission_CO2'] = data['emission_CO2'].str.extract('(\d+)').astype(float)


ValueError: could not convert string to float: '13\u202f190'

In [None]:
# Group rare car models (appearing less than 10 times)
model_counts = data['modele'].value_counts()
rare_models = model_counts[model_counts < 10].index
data['modele'] = data['modele'].apply(lambda x: 'Other' if x in rare_models else x)

In [None]:
# Define features for the model
numeric_features = ['kilometrage', 'puissance', 'nb_porte', 'nb_place', 
                   'emission_CO2', 'annee_mise_en_circulation']
categorical_features = ['modele', 'carburant', 'transmission', 'classe_vehicule', 
                      'couleur', 'sellerie', 'crit_air']

In [None]:
# Create preprocessing steps
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Create full pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Prepare X and y
X = data.drop(['prix', 'scraped_at', 'finition', 'url_annonce', 
               'classe_emission', 'usage_commerciale_anterieure', 
               'nb_ancien_proprietaire'], axis=1)
y = data['prix']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:,.2f}')
print(f'R² Score: {r2:.4f}')

# Calculate feature importances
feature_names = (numeric_features + 
                [f"{feature}__{val}" for feature, vals in 
                 model.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot'].get_feature_names_out(categorical_features)
                 for val in vals])

importances = pd.DataFrame(
    {'feature': feature_names,
     'importance': model.named_steps['regressor'].feature_importances_}
).sort_values('importance', ascending=False)

print("\nTop 10 most important features:")
print(importances.head(10))