<a href="https://www.kaggle.com/code/kunrittyhe/used-car-prices-catboost?scriptVersionId=198442543" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install autogluon.features

In [None]:
import numpy as np 
import pandas as pd 
import optuna

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator #Does autogluon's feature engineering

In [None]:
# Load the competition train/test tables, using the "id" column as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/playground-series-s4e9/train.csv",
        "/kaggle/input/playground-series-s4e9/test.csv",
    )
)

In [None]:
def clean(df):
    """Normalise two categorical columns in place and return the same frame.

    * ``fuel_type``: the value "Plug-In Hybrid" is folded into "Hybrid".
    * ``clean_title``: missing values are replaced by "No".

    The input DataFrame is mutated; the returned object is that same frame.
    """
    fuel_type = df["fuel_type"]
    clean_title = df["clean_title"]

    df["fuel_type"] = fuel_type.replace({"Plug-In Hybrid": "Hybrid"})
    # Filling the gaps with "No" lets the feature be treated as binary.
    df["clean_title"] = clean_title.fillna(value="No")
    return df
    
def extract_features(df):
    """Derive engine and transmission features in place and return the frame.

    Reads the free-text ``engine`` column and adds three float columns
    (NaN where the pattern is not found):

    * ``engine_horsepower`` - number preceding "HP"
    * ``engine_liters``     - number preceding "L"
    * ``engine_cylinders``  - number preceding "Cylinder", else the digits
      following a "V" (e.g. "V6")

    The ``engine`` column is then dropped. A boolean column
    ``transimssion_dct`` (name kept as-is, including its spelling, so existing
    consumers keep working) flags rows whose ``transmission`` text contains
    "dual", case-insensitively; missing transmissions are flagged False.

    The input DataFrame is mutated; the returned object is that same frame.
    """
    engine = df["engine"]

    # Engine features. expand=False returns a Series for single-group patterns.
    df["engine_horsepower"] = engine.str.extract(
        r'(\d+\.\d+|\d+)\s*HP', expand=False
    ).astype(float)
    df["engine_liters"] = engine.str.extract(
        r'(\d+\.\d+|\d+)\s*L', expand=False
    ).astype(float)

    # Two capture groups -> two columns; whichever alternative matched is set.
    cylinders = engine.str.extract(r'(\d+)\s*Cylinder|V(\d+)')
    # Cast to float like the other engine features. Left as strings, the count
    # would later be label-encoded in lexicographic order ("10" < "4") with
    # "nan" as its own category instead of being imputed.
    df["engine_cylinders"] = cylinders[0].fillna(cylinders[1]).astype(float)

    df.drop(columns="engine", inplace=True)

    # Transmission features. na=False keeps the column strictly boolean even
    # when the transmission text is missing.
    df["transimssion_dct"] = df["transmission"].str.contains(
        'Dual', case=False, na=False
    )

    return df

def preprocess(df_train, df_test):
    """Clean and feature-engineer train and test together, then split them back.

    The two frames are stacked (train rows first), passed through ``clean`` and
    ``extract_features``, and split again by position.

    The split is positional rather than by index label: a label lookup
    (``df.loc[df_train.index]``) returns every row carrying a given label, so
    it duplicates or mixes rows whenever ids repeat or train and test share
    an id.

    Returns:
        (train_part, test_part) with the same row counts and order as the
        inputs. Both parts carry the union of the input columns, so a column
        present in only one input appears as NaN in the other part.
    """
    n_train = len(df_train)
    df = pd.concat([df_train, df_test], axis=0)

    df = clean(df)
    df = extract_features(df)

    # .copy() hands back independent frames, as the label-based lookup did.
    return df.iloc[:n_train].copy(), df.iloc[n_train:].copy()


df_train, df_test = preprocess(df_train, df_test)

In [None]:
# Autogluon preprocessing (preview only: the result is displayed, it does not
# replace df_train, so the model below is still trained on the frames
# produced by the cells of this notebook).
# The target column is excluded so the generator is fitted on features only.
# NOTE(review): to apply the fitted generator to the test set, call
# autogluon_pipeline.transform(...) on df_test with any "price" column removed.
autogluon_pipeline = AutoMLPipelineFeatureGenerator()
autogluon_features = autogluon_pipeline.fit_transform(df_train.drop(columns="price"))
autogluon_features.head()

In [None]:
def encode_and_impute(df_train, df_test):
    """Label-encode text columns and impute missing values on train+test jointly.

    Steps, applied to the row-wise concatenation of both frames (train first):

    1. every ``category``/``object`` column is cast to ``str`` and replaced by
       integer codes from a ``LabelEncoder`` fitted on that column;
    2. all columns are passed through a ``SimpleImputer`` using the
       ``most_frequent`` strategy;
    3. the result is split back by position, and ``price`` is removed from
       the test part.

    Returns:
        (train_part, test_part)
    """
    n_train = len(df_train)
    combined = pd.concat([df_train, df_test], axis=0)

    text_columns = combined.select_dtypes(include=["category", "object"]).columns
    for name in text_columns:
        as_text = combined[name].astype(str)
        combined[name] = LabelEncoder().fit_transform(as_text)

    mode_imputer = SimpleImputer(strategy='most_frequent')
    combined[combined.columns] = mode_imputer.fit_transform(combined)

    train_part = combined.iloc[:n_train]
    test_part = combined.iloc[n_train:].drop("price", axis=1)
    return train_part, test_part


df_train, df_test = encode_and_impute(df_train, df_test)

In [None]:
def score(df, model=None):
    """Return the 5-fold cross-validated RMSE of ``model`` on ``df``.

    Args:
        df: frame holding the features plus the target column ``price``.
            It is not modified.
        model: estimator handed to ``cross_val_predict``. When omitted, a
            fresh ``CatBoostRegressor()`` is created for this call. The
            previous default was a single instance built at function
            definition time and shared by every call.

    Returns:
        Root mean squared error between ``price`` and the out-of-fold
        predictions.
    """
    if model is None:
        model = CatBoostRegressor()

    X = df.copy()
    y = X.pop("price")
    preds = cross_val_predict(model, X, y, cv=5, n_jobs=-1)
    rmse = np.sqrt(mean_squared_error(y, preds))
    return rmse

def objective(trial):
    """Optuna objective: cross-validated RMSE of one sampled CatBoost config.

    Samples the tunable hyper-parameters from ``trial``, builds a
    ``CatBoostRegressor`` with them (silent, RMSE loss) and returns the value
    of ``score`` on the module-level ``df_train``.
    """
    sampled = {
        "iterations": trial.suggest_int('iterations', 500, 2000),
        "depth": trial.suggest_int('depth', 4, 10),
        "learning_rate": trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
        "bagging_temperature": trial.suggest_float('bagging_temperature', 0.0, 1.0),
        "border_count": trial.suggest_int('border_count', 32, 255),
    }
    fixed = {"verbose": 0, "loss_function": "RMSE"}

    candidate = CatBoostRegressor(**sampled, **fixed)
    return score(df_train, candidate)

# Hyper-parameter search. The sampler is seeded so that the sequence of
# suggested configurations is repeatable between runs (the wall-clock timeout
# can still cut the search at a different trial).
N_TRIALS = 200
TIMEOUT_SECONDS = 11.5 * 3600  # 11.5 hours
OPTUNA_SEED = 42

study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT_SECONDS)

In [None]:
# Refit on the full training set with the best hyper-parameters found by the
# study (progress logged every 100 iterations), then persist the model.
y_train = df_train["price"]
X_train = df_train.drop("price", axis=1)

model = CatBoostRegressor(verbose=100, **study.best_params)
model.fit(X_train, y_train)

model.save_model('catboost_model.cbm')


In [None]:
# Predict the test set and label the values with the test index.
preds = pd.Series(model.predict(df_test), index=df_test.index, name="price")
preds

In [None]:
# Write the predictions (index + "price" column) as the submission file.
preds.to_csv(path_or_buf="submission.csv")