In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import warnings
warnings.simplefilter("ignore")

# Load CSV files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Drop the 'id' column as it's not useful
df_train = df_train.drop(columns=['id'])
id_test = df_test['id']
df_test = df_test.drop(columns=['id'])

# Separate numerical and categorical columns
categorical_columns = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
numerical_columns = df_train.select_dtypes(exclude=['object']).columns.tolist()
numerical_columns.remove('price')  # Don't include target

# Step 1: Handle missing values before feature engineering

# KNN Imputer for both numerical and categorical features (categoricals should be encoded later)
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(df_train[numerical_columns])
X_test_imputed = knn_imputer.transform(df_test[numerical_columns])

# Reintegrating the imputed numerical data back into the DataFrame
df_train[numerical_columns] = X_train_imputed
df_test[numerical_columns] = X_test_imputed

# Step 2: Feature engineering (after handling missing values)

# Add car age and drop model_year
df_train['car_age'] = 2024 - df_train['model_year']
df_test['car_age'] = 2024 - df_test['model_year']
df_train.drop('model_year', axis=1, inplace=True)
df_test.drop('model_year', axis=1, inplace=True)

# Target encoding for 'model' to avoid high cardinality
mean_price_per_model = df_train.groupby('model')['price'].mean()
df_train['model'] = df_train['model'].map(mean_price_per_model)
df_test['model'] = df_test['model'].map(mean_price_per_model)

# Extract engine power and cylinder
df_train['engine_power'] = df_train['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)
df_test['engine_power'] = df_test['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)

df_train['engine_cylinder'] = df_train['engine'].str.extract(r'(\d+\.?\d*)L').astype(float)
df_test['engine_cylinder'] = df_test['engine'].str.extract(r'(\d+\.?\d*)L').astype(float)

# Remove the 'engine' column after extraction
df_train.drop('engine', axis=1, inplace=True)
df_test.drop('engine', axis=1, inplace=True)

# Regroup rare colors in 'ext_col' and 'int_col'
color_threshold = 100
ext_color_counts = df_train['ext_col'].value_counts()
rare_ext_colors = ext_color_counts[ext_color_counts < color_threshold].index
df_train['ext_col'] = df_train['ext_col'].replace(rare_ext_colors, 'Other')
df_test['ext_col'] = df_test['ext_col'].replace(rare_ext_colors, 'Other')

int_color_counts = df_train['int_col'].value_counts()
rare_int_colors = int_color_counts[int_color_counts < color_threshold].index
df_train['int_col'] = df_train['int_col'].replace(rare_int_colors, 'Other')
df_test['int_col'] = df_test['int_col'].replace(rare_int_colors, 'Other')

# Step 3: Drop any newly created missing values from feature engineering
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# OneHotEncode only for categorical columns with low cardinality
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Apply OneHotEncoder to categorical columns
encoded_train = pd.DataFrame(encoder.fit_transform(df_train[categorical_columns]))
encoded_test = pd.DataFrame(encoder.transform(df_test[categorical_columns]))

# Add column names to encoded data
encoded_train.columns = encoder.get_feature_names_out(categorical_columns)
encoded_test.columns = encoder.get_feature_names_out(categorical_columns)

# Reset index to align encoded data with original data
encoded_train.index = df_train.index
encoded_test.index = df_test.index

# Concatenate encoded categorical columns with numerical columns
X_train = pd.concat([df_train[numerical_columns], encoded_train], axis=1)
X_test = pd.concat([df_test[numerical_columns], encoded_test], axis=1)

# Align the columns between train and test sets to avoid issues with different columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Apply RobustScaler to numerical columns
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Target variable
y_train = df_train['price']

# Apply Box-Cox transformation to target variable (price)
y_train_boxcox, lambda_ = boxcox(y_train)  # No need to add 1 if prices are strictly positive

# Train-test split for evaluation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train_boxcox, test_size=0.2, random_state=42)

KeyError: "['model_year'] not in index"

In [None]:
# Example: Using a simple model like Ridge to evaluate
from sklearn.linear_model import Ridge

ridge_model = Ridge()
ridge_model.fit(X_train_split, y_train_split)
y_pred_boxcox = ridge_model.predict(X_val)

# Inverse Box-Cox transformation to get predictions back to the original scale
y_pred = inv_boxcox(y_pred_boxcox, lambda_)

# RMSE evaluation in the original scale
rmse = np.sqrt(mean_squared_error(inv_boxcox(y_val, lambda_), y_pred))
print(f'Ridge Model RMSE: {rmse}')

In [6]:
# Load CSV files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Drop the 'id' column as it's not useful
df_train = df_train.drop(columns=['id'])
id_test = df_test['id']
df_test = df_test.drop(columns=['id'])

# Add car age and drop model_year
df_train['car_age'] = 2024 - df_train['model_year']
df_test['car_age'] = 2024 - df_test['model_year']
df_train.drop('model_year', axis=1, inplace=True)
df_test.drop('model_year', axis=1, inplace=True)

# Rebuild the numerical columns list without 'model_year'
numerical_columns = df_train.select_dtypes(exclude=['object']).columns.tolist()
numerical_columns.remove('price')  # Don't include the target

# Separate categorical columns for encoding
categorical_columns = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

# Handle missing values before feature engineering
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(df_train[numerical_columns])
X_test_imputed = knn_imputer.transform(df_test[numerical_columns])

# Reintegrating the imputed numerical data back into the DataFrame
df_train[numerical_columns] = X_train_imputed
df_test[numerical_columns] = X_test_imputed

# OneHotEncode only for categorical columns with low cardinality
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Apply OneHotEncoder to categorical columns in train and test
encoded_train = pd.DataFrame(encoder.fit_transform(df_train[categorical_columns]))
encoded_test = pd.DataFrame(encoder.transform(df_test[categorical_columns]))

# Add column names to encoded data
encoded_train.columns = encoder.get_feature_names_out(categorical_columns)
encoded_test.columns = encoder.get_feature_names_out(categorical_columns)

# Reset index to align encoded data with original data
encoded_train.index = df_train.index
encoded_test.index = df_test.index

# Concatenate encoded categorical columns with numerical columns
X_train = pd.concat([df_train[numerical_columns], encoded_train], axis=1)
X_test = pd.concat([df_test[numerical_columns], encoded_test], axis=1)

# Align the columns between train and test sets to avoid issues with different columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Apply RobustScaler to numerical columns
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Target variable
y_train = df_train['price']

# Apply Box-Cox transformation to target variable (price)
y_train_boxcox, lambda_ = boxcox(y_train)  # No need to add 1 if prices are strictly positive

# Train-test split for evaluation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_scaled, y_train_boxcox, test_size=0.2, random_state=42)

In [7]:
# Import necessary regressors
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Create a dictionary of models
models = {
    "Lasso Regression": Lasso(),
    "ElasticNet": ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(silent=True),  # Silent=True to suppress verbose output
}

# Loop to train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train_split, y_train_split)
    
    # Make predictions on validation set
    y_pred_boxcox = model.predict(X_val)
    
    # Inverse Box-Cox transformation to get predictions back to original scale
    y_pred = inv_boxcox(y_pred_boxcox, lambda_)
    
    # Calculate RMSE on validation set
    rmse = np.sqrt(mean_squared_error(inv_boxcox(y_val, lambda_), y_pred))
    
    print(f"{name} - RMSE: {rmse}")


Lasso Regression - RMSE: 76007.05061082898
ElasticNet - RMSE: 76007.05061082898
Random Forest - RMSE: 69636.59025929742
Gradient Boosting - RMSE: 69912.58285707333
XGBoost - RMSE: 69295.07711191328
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 425
[LightGBM] [Info] Start training from score 8.698397
LightGBM - RMSE: 69319.42656435727
CatBoost - RMSE: 69232.44833854595


In [8]:
model=  CatBoostRegressor(silent=True)
model.fit(X_train_split, y_train_split)
y_pred_transformed = model.predict( X_test)
    
 # Inverser la transformation Box-Cox
y_pred = inv_boxcox(y_pred_transformed, lambda_)

In [10]:
## Création du fichier de soumission
submission_df = pd.DataFrame({'id': id_test, 'price': y_pred})
submission_df.to_csv('submission_nv.csv', index=False)