In [1]:
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [2]:
def new_target_price(row, inter_confiance=10):
    """
    Pick the target price for one sale from its 'mmr' and 'sellingprice' values.

    The selling price is expressed as a percentage of 'mmr'. When that
    percentage lies within ``inter_confiance`` points of 100 (bounds included),
    'mmr' is returned; otherwise 'sellingprice' is returned.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            'mmr' and 'sellingprice' entries.
        inter_confiance: Half-width of the accepted band around 100 %,
            in percentage points.

    Returns:
        row['mmr'] when the ratio is inside the band, else row['sellingprice'].
    """
    mmr, selling_price = row['mmr'], row['sellingprice']
    # A zero 'mmr' makes the ratio undefined (ZeroDivisionError for Python
    # numbers, inf/NaN plus a warning for NumPy scalars): keep the selling price.
    if mmr == 0:
        return selling_price
    ratio_pct = (selling_price / mmr) * 100
    lower_bound, upper_bound = 100 - inter_confiance, 100 + inter_confiance
    return mmr if lower_bound <= ratio_pct <= upper_bound else selling_price

def cut_outlier(data, name_column, outlier_min=0, outlier_max=0):
    """Return the rows of `data` whose `name_column` value lies in [outlier_min, outlier_max]."""
    # Series.between is inclusive on both ends by default.
    in_range = data[name_column].between(outlier_min, outlier_max)
    return data[in_range]

def cut_categorical(data, name_column, value_min=0):
    """Return the rows of `data` whose `name_column` value occurs at least `value_min` times."""
    occurrences = data[name_column].value_counts()
    frequent_values = occurrences[occurrences >= value_min].index
    keep_mask = data[name_column].isin(frequent_values)
    return data[keep_mask]

def map_condition(x):
    """
    Bucket a numeric condition score into three classes.

    Returns 0 for 0 <= x < 1.5, 1 for 1.5 <= x < 3.5 and 2 for x >= 3.5.
    Any other input (negative or NaN) yields None.
    """
    # Thresholds are tested from the highest down, so each branch only needs
    # its lower bound.
    if x >= 3.5:
        return 2
    if x >= 1.5:
        return 1
    if x >= 0:
        return 0
    return None

In [3]:
# Load the data: skip malformed lines, drop the columns that are not used,
# then drop every row that still has a missing value.
data = (
    pd.read_csv("<your/path>", on_bad_lines='skip')
    .drop(columns=['vin', 'trim', 'seller', 'saledate'])
    .dropna()
)

# Map 'condition' to its class (see map_condition).
data['new_condition'] = data['condition'].apply(map_condition)

# Apply new_target_price row by row with a 5-point band.
data['new_price'] = data.apply(new_target_price, axis=1, inter_confiance=5)

# Drop the source columns that the derived columns replace.
data = data.drop(columns=['mmr', 'sellingprice', 'condition'])

In [4]:
# Keep only rows whose 'new_price' and 'odometer' fall inside the accepted ranges.
# The explicit .copy() makes data_filtered an independent frame: the column
# assignments below would otherwise write into a filtered slice of `data`,
# which pandas reports as SettingWithCopyWarning.
data_filtered = cut_outlier(data, name_column='new_price', outlier_min=1000, outlier_max=60000)
data_filtered = cut_outlier(data_filtered, name_column='odometer', outlier_min=0, outlier_max=300000).copy()

# Simplify 'color' and 'interior': any value outside the listed ones becomes 'other'.
data_filtered['color'] = data_filtered['color'].where(
    data_filtered['color'].isin(['black', 'gray', 'white', 'silver', 'blue', 'red']),
    'other',
)
data_filtered['interior'] = data_filtered['interior'].where(
    data_filtered['interior'].isin(['black', 'gray', 'beige', 'tan']),
    'other',
)

# Normalise 'state', 'body', 'model' and 'make' to lower case so that values
# differing only by case are counted as the same category.
data_filtered['state'] = data_filtered['state'].str.lower()
data_filtered['body'] = data_filtered['body'].str.lower()
data_filtered['model'] = data_filtered['model'].str.lower()
data_filtered['make'] = data_filtered['make'].str.lower()

# Keep only 'body' values seen at least 50 times and 'model' values seen at least 100 times.
data_filtered = cut_categorical(data_filtered, name_column='body', value_min=50)
data_filtered = cut_categorical(data_filtered, name_column='model', value_min=100)

In [5]:
# Split the prepared frame into the target y ('new_price') and the features X
# (every other column), both re-indexed from 0.
y = data_filtered['new_price'].reset_index(drop=True)
X = data_filtered.drop(columns=['new_price']).reset_index(drop=True)

In [6]:
# Column groups handled by the preprocessor.
numerical_features = ['year', 'new_condition', 'odometer']
categorical_features = ['make', 'model', 'body', 'transmission', 'state', 'color', 'interior']

# Numerical preprocessing: standardise each numerical column.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical preprocessing: one-hot encoding. No category is dropped: with
# handle_unknown='ignore' a category unseen during fit is encoded as all zeros,
# and drop="first" would encode the first category of each feature the same
# way, making the two indistinguishable at prediction time.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both pipelines, each applied to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

In [7]:
# Full pipeline: preprocessing followed by the XGBoost regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        n_estimators=500,
        max_depth=10,
        learning_rate=0.1,
        n_jobs=-1,
    )),
])

# Fit on the whole prepared dataset.
pipeline.fit(X, y)

# Save the fitted pipeline (preprocessing + model) as a single pickle file.
with open('xgboost.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)