<a href="https://www.kaggle.com/code/kunrittyhe/used-car-prices-autogluon-full-train?scriptVersionId=198219439" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install autogluon.tabular[all]

# Imports #

In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = "/kaggle/input/playground-series-s4e9"

# Both splits are indexed by their `id` column.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv", index_col="id")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv", index_col="id")

In [None]:
# Preview only the first rows of the raw training frame.
df_train.head()

# Preprocessing #

In [None]:
def clean(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Two columns are normalised:

    * ``fuel_type``: the value "Plug-In Hybrid" is folded into "Hybrid".
    * ``clean_title``: missing values are filled with "No" so the feature can
      be treated as binary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``fuel_type`` and ``clean_title`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order.
    """
    # assign() builds a new frame, so the caller's data is never modified and
    # the function gives the same result however often it is re-run.
    return df.assign(
        fuel_type=df["fuel_type"].replace("Plug-In Hybrid", "Hybrid"),
        clean_title=df["clean_title"].fillna("No"),
    )

Extract horsepower, liters, and cylinders from `engine` feature.

Extract automatic, manual, and DCT (dual-clutch transmission) indicator flags from the `transmission` feature.

In [None]:
def extract_features(df):
    """Derive engine and transmission features; the input frame is not modified.

    From the free-text ``engine`` column (which is dropped from the result):

    * ``engine_horsepower`` -- number preceding "HP", as float.
    * ``engine_liters`` -- number preceding "L", as float.
    * ``engine_cylinders`` -- integer preceding "Cylinder", or following "V"
      (e.g. "V6"), as float so that it is numeric like the other two engine
      features.

    From the ``transmission`` column (which is kept):

    * ``transmission_automatic`` / ``transmission_manual`` /
      ``transmission_dct`` -- boolean flags from case-insensitive pattern
      matches. A missing transmission yields ``False`` for all three.

    Rows where a pattern does not match get NaN for the engine features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing string columns ``engine`` and ``transmission``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``engine`` and with the six derived columns appended
        in the order listed above.
    """
    engine = df["engine"]
    transmission = df["transmission"]

    # Two capture groups, one per spelling; exactly one of them is filled for
    # a matching row, so they are merged with fillna below.
    cylinder_groups = engine.str.extract(r'(\d+)\s*Cylinder|V(\d+)')

    # drop() + assign() both return new frames, so the caller's frame is never
    # touched (the previous version added the engine columns to it in place).
    return df.drop(columns="engine").assign(
        # Engine features
        engine_horsepower=engine.str.extract(
            r'(\d+\.\d+|\d+)\s*HP', expand=False
        ).astype(float),
        engine_liters=engine.str.extract(
            r'(\d+\.\d+|\d+)\s*L', expand=False
        ).astype(float),
        engine_cylinders=cylinder_groups[0].fillna(cylinder_groups[1]).astype(float),
        # Transmission features. na=False keeps the flags strictly boolean
        # instead of producing NaN for a missing transmission.
        # NOTE(review): 'AT' / 'MT' match case-insensitively anywhere in the
        # string -- confirm this is loose on purpose.
        transmission_automatic=transmission.str.contains(
            r'Automatic|A/T|AT', case=False, na=False
        ),
        transmission_manual=transmission.str.contains(
            r'Manual|M/T|MT', case=False, na=False
        ),
        # NOTE(review): only the word "Dual" is matched; the abbreviation
        # "DCT" itself is not -- confirm whether it should be.
        transmission_dct=transmission.str.contains('Dual', case=False, na=False),
    )

Bundle feature extraction and cleaning into one pipeline

In [None]:
def preprocess(df_train, df_test):
    """Apply ``clean`` and ``extract_features`` to both splits.

    Each split is processed on its own. Both steps only perform element-wise
    column operations, so concatenating the splits first is unnecessary, and
    doing so had two side effects that are avoided here:

    * columns present in only one split (the ``price`` label) were added to
      the other split as all-NaN, and upcast to float in the split that had
      them;
    * splitting the combined frame back with ``.loc[index]`` returns duplicated
      rows whenever the two splits share an index label.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw training and test frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with cleaning and feature extraction applied; the
        input frames are left unmodified.
    """
    def _run(frame):
        # Work on a copy so the caller's frame stays untouched regardless of
        # whether the individual steps modify their argument.
        return extract_features(clean(frame.copy()))

    return _run(df_train), _run(df_test)

In [None]:
# Rebind both names to their preprocessed versions.
# Not re-runnable as-is: extract_features drops the `engine` column it reads,
# so running this cell a second time on the already-processed frames fails.
df_train, df_test = preprocess(df_train, df_test)

In [None]:
# Preview the first rows of the preprocessed training frame.
df_train.head()

# Modeling #

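First, create a holdout set for validation, using a fixed random seed so that out-of-fold (OOF) predictions stay consistent across notebooks. (Note: the cells below do not actually perform this split — the full training frame is passed to AutoGluon.)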

In [None]:
# Wrap the preprocessed train and test frames as AutoGluon datasets.
ds_train, ds_test = (TabularDataset(frame) for frame in (df_train, df_test))


In [None]:
# Fixed XGBoost hyperparameters (presumably from an earlier search -- TODO confirm).
xgb_params = dict(
    max_depth=6,
    min_child_weight=7,
    colsample_bytree=0.21577008076093662,
    subsample=0.7812835287449484,
    learning_rate=0.01459299700503753,
    max_leaves=44,
    n_estimators=1287,
    reg_alpha=0.017095752487029176,
    reg_lambda=98.93396229323028,
)

# Fixed CatBoost hyperparameters (presumably from an earlier search -- TODO confirm).
cat_params = dict(
    iterations=1040,
    depth=8,
    learning_rate=0.01812522069947833,
    l2_leaf_reg=8.217612632114935,
    bagging_temperature=0.5059198086110822,
    border_count=225,
)

In [None]:
# Model families handed to AutoGluon via `hyperparameters`. An empty dict
# supplies no overrides for that family; CAT and XGB use the fixed settings
# defined earlier in the notebook.
params = dict(
    NN_TORCH={},
    GBM={},
    CAT=cat_params,
    XGB=xgb_params,
    FASTAI={},
    RF={},
    XT={},
)

In [None]:
# Dynamic stacking is switched off here because it was already run on this
# dataset in other notebooks.
model = TabularPredictor(eval_metric="rmse", label="price").fit(
    train_data=ds_train,
    hyperparameters=params,
    presets="best_quality",
    num_stack_levels=1,
    dynamic_stacking=False,
    time_limit=11 * 60 * 60,  # 11 hours, expressed in seconds
)

In [None]:
# Display the fitted predictor's model leaderboard.
model.leaderboard()

In [None]:
# Predict the label for the test dataset with the fitted predictor.
predictions = model.predict(ds_test)

In [None]:
# Display the test-set predictions.
predictions

In [None]:
# Write the predictions to submission.csv in the working directory
# (the index is written too, as is the pandas default).
predictions.to_csv("submission.csv")