<a href="https://www.kaggle.com/code/kunrittyhe/used-car-prices-autogluon-full-train?scriptVersionId=198567310" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install autogluon.tabular[all]

Collecting autogluon.tabular[all]
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting scipy<1.13,>=1.5.4 (from autogluon.tabular[all])
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.tabular[all])
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting autogluon.core==1.1.1 (from autogluon.tabular[all])
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon.tabular[all])
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting torch<2.4,>=2.2 (from autogluon.tabular[all])
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting au

# Imports #

In [2]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [3]:
# Read the competition train/test tables, using the `id` column as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/playground-series-s4e9/train.csv",
        "/kaggle/input/playground-series-s4e9/test.csv",
    )
)

In [4]:
# Display the raw training frame (pandas truncates the middle rows).
df_train

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,27500
188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,30000
188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,7-Speed A/T,White,Black,None reported,Yes,86900
188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,1-Speed Automatic,Daytona Gray Pearl Effect,Black,None reported,,84900


# Preprocessing #

In [5]:
def clean(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Two columns are normalised:

    * ``fuel_type``  -- "Plug-In Hybrid" is folded into "Hybrid".
    * ``clean_title`` -- missing values become "No" so the feature can be
      treated as binary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``fuel_type`` and ``clean_title`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    # assign() builds a new frame, so the caller's frame is never written to
    # (the earlier version stored the columns back into its argument).
    return df.assign(
        fuel_type=df["fuel_type"].replace("Plug-In Hybrid", "Hybrid"),
        clean_title=df["clean_title"].fillna("No"),  # To treat feature as binary
    )

Extract horsepower, liters, and cylinders from `engine` feature.

Derive three boolean flags from the `transmission` feature: automatic, manual, and dct (intended as dual-clutch transmission; the flag is set whenever the text contains "Dual").

In [6]:
def extract_features(df):
    """Derive engine and transmission features; the input frame is not modified.

    Added columns:

    * ``engine_horsepower`` (float) -- number preceding "HP" in ``engine``.
    * ``engine_liters`` (float) -- number preceding "L" in ``engine``.
    * ``engine_cylinders`` -- digits from "<n> Cylinder" or "V<n>" in
      ``engine``; kept as the extracted text, not cast to a number.
    * ``transmission_automatic`` / ``transmission_manual`` /
      ``transimssion_dct`` -- case-insensitive pattern matches on
      ``transmission``.

    The ``engine`` column is dropped; ``transmission`` is kept. Rows where a
    pattern does not match get NaN in the corresponding engine column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``engine`` and ``transmission``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns described above.
    """
    # Copy first: the column assignments below would otherwise land in the
    # caller's frame before `drop` creates a new one, leaving it half-mutated.
    df = df.copy()
    engine = df["engine"]

    #Engine features
    df["engine_horsepower"] = engine.str.extract(r'(\d+\.\d+|\d+)\s*HP', expand=False).astype(float)
    df["engine_liters"] = engine.str.extract(r'(\d+\.\d+|\d+)\s*L', expand=False).astype(float)
    # Two alternatives -> two capture groups; at most one is filled per row.
    df_cylinders = engine.str.extract(r'(\d+)\s*Cylinder|V(\d+)', expand=False)
    df['engine_cylinders'] = df_cylinders[0].fillna(df_cylinders[1]) #Combine both regex searches

    df = df.drop("engine", axis=1)

    #Transmission features
    transmission = df["transmission"]
    df["transmission_automatic"] = transmission.str.contains(r'Automatic|A/T|AT', case=False)
    df["transmission_manual"] = transmission.str.contains(r'Manual|M/T|MT', case=False)
    # NOTE: the column name is misspelled ("transimssion"); kept as-is so the
    # output schema stays the same for anything already relying on it.
    df["transimssion_dct"] = transmission.str.contains('Dual', case=False)

    return df

Bundle feature extraction and cleaning into one pipeline

In [7]:
def preprocess(df_train, df_test):
    """Clean and featurise train and test together, then split them back apart.

    Both frames are stacked row-wise, passed through ``clean`` and
    ``extract_features``, and separated again. Because of the concatenation,
    a column present in only one frame comes back in both, filled with NaN
    where it was absent.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw frames accepted by ``clean`` and ``extract_features``.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        Processed ``(train, test)`` frames, each with the same rows and row
        order as its input.
    """
    n_train = len(df_train)
    df = pd.concat([df_train, df_test], axis=0)

    df = clean(df)
    df = extract_features(df)

    # Split by position rather than by index label: if the two frames share
    # any index labels, `.loc[index]` returns the matching rows of BOTH frames.
    # Positional slicing is safe because neither step adds, drops or reorders
    # rows. `.copy()` hands back independent frames instead of slices of `df`.
    train_out = df.iloc[:n_train].copy()
    test_out = df.iloc[n_train:].copy()
    return train_out, test_out

In [8]:
# Run the shared cleaning / feature-extraction pipeline on both frames.
# NOTE: this rebinds df_train/df_test to the processed frames, so the cell is
# not re-runnable on its own -- a second run raises KeyError because `engine`
# has already been dropped. Re-run the data-loading cell first.
df_train, df_test = preprocess(df_train, df_test)

In [9]:
# Inspect the first processed rows, including the newly derived columns.
df_train.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,engine_horsepower,engine_liters,engine_cylinders,transmission_automatic,transmission_manual,transimssion_dct
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,A/T,Yellow,Gray,None reported,Yes,4200.0,172.0,1.6,4,True,False,False
1,Lincoln,LS V8,2002,143250,Gasoline,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999.0,252.0,3.9,8,True,False,False
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,A/T,Blue,Gray,None reported,Yes,13900.0,320.0,5.3,8,True,False,False
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000.0,420.0,5.0,8,False,False,True
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,7-Speed A/T,Black,Beige,None reported,Yes,97500.0,208.0,2.0,4,True,False,False


# Modeling #

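This version trains on the full training set: no separate holdout split is created and no random seed is set in this notebook. Validation scores come from AutoGluon's internal bagging (8 folds with the `best_quality` preset).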

In [10]:
# Wrap both processed frames in AutoGluon's TabularDataset type.
ds_train, ds_test = map(TabularDataset, (df_train, df_test))


In [11]:
# Fixed hyperparameters for the XGBoost models (the 'XGB' entry of `params`).
xgb_params = dict(
    max_depth=6,
    min_child_weight=7,
    colsample_bytree=0.21577008076093662,
    subsample=0.7812835287449484,
    learning_rate=0.01459299700503753,
    max_leaves=44,
    n_estimators=1287,
    reg_alpha=0.017095752487029176,
    reg_lambda=98.93396229323028,
)

# Fixed hyperparameters for the CatBoost models (the 'CAT' entry of `params`).
cat_params = dict(
    iterations=1040,
    depth=8,
    learning_rate=0.01812522069947833,
    l2_leaf_reg=8.217612632114935,
    bagging_temperature=0.5059198086110822,
    border_count=225,
)

In [12]:
# Model families handed to AutoGluon, keyed by its model codes. An empty dict
# supplies no overrides for that family; CAT and XGB use the dicts defined
# in the previous cell.
params = dict(
    NN_TORCH={},
    GBM={},
    CAT=cat_params,
    XGB=xgb_params,
    FASTAI={},
    RF={},
    XT={},
)

In [13]:
# Fit the AutoGluon predictor for `price`, scored by RMSE.
# Dynamic stacking is disabled since it was done on this dataset in other notebooks.
model = TabularPredictor(label="price", eval_metric="rmse")
model = model.fit(
    ds_train,
    presets="best_quality",
    hyperparameters=params,
    dynamic_stacking=False,
    num_stack_levels=2,
    time_limit=11 * 3600,  # 11 hours, expressed in seconds
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240927_200305"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 27 20:43:36 UTC 2024
CPU Count:          4
Memory Avail:       30.09 GB / 31.36 GB (96.0%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 39600s
AutoGluon will save models to "AutogluonModels/ag-20240927_200305"
Train Data Rows:    188533
Train Data Columns: 16
Label Column:       price
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (2954083.0, 2000.0, 43878.01618, 78819.52225)
	If 'regression' is not the correct problem_type, please manu

In [14]:
# Show the table of fitted models with their validation scores and timings.
model.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L4,-72491.605252,root_mean_squared_error,132.049196,15161.081434,0.003395,0.581589,4,True,24
1,WeightedEnsemble_L3,-72501.0617,root_mean_squared_error,103.712546,12727.735759,0.003355,0.263159,3,True,16
2,XGBoost_BAG_L2,-72515.605417,root_mean_squared_error,83.421678,10199.806556,12.011695,2178.14603,2,True,14
3,CatBoost_BAG_L2,-72526.478443,root_mean_squared_error,73.180012,9061.787819,1.770029,1040.127293,2,True,11
4,WeightedEnsemble_L2,-72539.058062,root_mean_squared_error,71.413731,8021.988696,0.003748,0.32817,2,True,8
5,CatBoost_BAG_L3,-72543.139501,root_mean_squared_error,117.104889,14250.275686,1.487482,977.67261,3,True,19
6,XGBoost_BAG_L3,-72552.240738,root_mean_squared_error,123.996563,14853.911237,8.379156,1581.30816,3,True,22
7,XGBoost_BAG_L1,-72643.965113,root_mean_squared_error,40.930191,4227.12165,40.930191,4227.12165,1,True,6
8,CatBoost_BAG_L1,-72743.252917,root_mean_squared_error,2.855224,1709.336748,2.855224,1709.336748,1,True,3
9,NeuralNetFastAI_BAG_L3,-72819.597349,root_mean_squared_error,122.363147,14093.239527,6.74574,820.636451,3,True,21


In [15]:
# Predict `price` for every row of the processed test set.
predictions = model.predict(ds_test)

In [16]:
# Display the predicted prices (a Series indexed by `id`).
predictions

id
188533    17650.380859
188534    76827.187500
188535    55289.433594
188536    27333.322266
188537    29800.039062
              ...     
314218    27425.103516
314219    48893.347656
314220    21419.726562
314221    17070.837891
314222    36715.890625
Name: price, Length: 125690, dtype: float32

In [17]:
# Write the submission file; the `id` index is saved as the first column.
predictions.to_csv("submission.csv")