<a href="https://www.kaggle.com/code/kunrittyhe/used-car-prices-catboost?scriptVersionId=198442543" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install autogluon.features

In [40]:
import numpy as np 
import pandas as pd 
import optuna

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator #Does autogluon's feature engineering

In [41]:
df_train = pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv", index_col="id")
df_test = pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv", index_col="id")

In [42]:
def clean(df):
    df["fuel_type"] = df["fuel_type"].replace("Plug-In Hybrid", "Hybrid")
    df["clean_title"] = df["clean_title"].fillna("No") #To treat feature as binary
    return df
    
def extract_features(df):
    #Engine features
    df["engine_horsepower"] = df["engine"].str.extract(r'(\d+\.\d+|\d+)\s*HP').astype(float)
    df["engine_liters"] = df["engine"].str.extract(r'(\d+\.\d+|\d+)\s*L').astype(float)
    df_cylinders = df['engine'].str.extract(r'(\d+)\s*Cylinder|V(\d+)', expand=False)
    df['engine_cylinders'] = df_cylinders[0].fillna(df_cylinders[1]) #Combine both regex searches
    
    df.drop("engine", axis=1, inplace=True)
    
    #Transmission features
    df["transimssion_dct"] = df["transmission"].str.contains('Dual', case=False)
    
    return df

def preprocess(df_train, df_test):
    df = pd.concat([df_train, df_test], axis=0)
    
    df = clean(df)
    df = extract_features(df)    
    
    df_train = df.loc[df_train.index]
    df_test = df.loc[df_test.index]
    return df_train, df_test

df_train, df_test = preprocess(df_train, df_test)

In [43]:
#Autogluon preprocessing
autogluon_pipeline = AutoMLPipelineFeatureGenerator()
autogluon_pipeline.fit_transform(df_train)
#autogluon_pipeline.transform(df_test)

Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    30157.81 MB
	Train Data (Original)  Memory Usage: 113.89 MB (0.4% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('bool', [])   : 1 | ['transimssion_dct']
		('float', [])  : 3 | ['price', 'engine_horsepower', 'engine_liters']
		('int', [])    : 2 | ['model_year', 'm

Unnamed: 0_level_0,model_year,milage,clean_title,price,engine_horsepower,engine_liters,transimssion_dct,brand,model,fuel_type,transmission,ext_col,int_col,accident,engine_cylinders
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,2007,213000,1,4200.0,172.0,1.6,0,31,495,3,38,312,71,2,4
1,2002,143250,1,4999.0,252.0,3.9,0,28,929,3,38,263,10,1,7
2,2002,136731,1,13900.0,320.0,5.3,0,9,1574,2,38,38,71,2,7
3,2017,19500,1,45000.0,420.0,5.0,1,16,758,3,49,29,14,2,7
4,2021,7388,1,97500.0,208.0,2.0,0,36,1076,3,23,29,10,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,2017,49000,1,27500.0,420.0,6.2,1,8,604,3,49,304,10,2,7
188529,2018,28600,1,30000.0,385.0,3.0,0,36,206,3,31,304,14,1,6
188530,2021,13650,1,86900.0,469.0,4.0,0,36,223,3,23,304,14,2,7
188531,2022,13895,0,84900.0,,3.0,0,3,1470,3,1,82,14,2,0


In [45]:
def encode_and_impute(df_train, df_test):
    df = pd.concat([df_train, df_test], axis=0)
    
    label_encoders = {}
    for col in df.select_dtypes(include=["category", "object"]).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    
    imputer = SimpleImputer(strategy='most_frequent')
    df[df.columns] = imputer.fit_transform(df)
    
    df_train = df.iloc[:len(df_train)]
    df_test = df.iloc[len(df_train):].drop("price", axis=1)
    
    return df_train, df_test

df_train, df_test = encode_and_impute(df_train, df_test)

In [56]:
def score(df, model=CatBoostRegressor()):
    X = df.copy()
    y = X.pop("price")
    preds = cross_val_predict(model, X, y, cv=5, n_jobs=-1) 
    rmse = np.sqrt(mean_squared_error(y, preds))
    return rmse

def objective(trial):
    cat_params = dict(
        iterations=5000,
        early_stopping_rounds=500,
        depth=trial.suggest_int('depth', 4, 14),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.0, 1.0),
        random_strength=trial.suggest_float('random_strength', 0.5, 2.0),
        verbose=0,
        loss_function="RMSE",
        eval_metric="RMSE"
    )
    model = CatBoostRegressor(**cat_params)
    return score(df_train, model)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200, timeout=11.5*3600)

[I 2024-09-27 00:37:11,592] A new study created in memory with name: no-name-50830cd4-d1e0-47c3-a697-b368b1a1845f
[I 2024-09-27 00:40:49,767] Trial 0 finished with value: 72927.99541884384 and parameters: {'iterations': 807, 'depth': 10, 'learning_rate': 0.008381356994061135, 'l2_leaf_reg': 0.09448486170297987, 'bagging_temperature': 0.5869940721118078, 'border_count': 202}. Best is trial 0 with value: 72927.99541884384.


In [59]:
model = CatBoostRegressor(**study.best_params, verbose=100)
X_train = df_train.drop("price", axis=1)
y_train = df_train["price"]
model.fit(X_train, y_train)

model.save_model('catboost_model.cbm')


0:	learn: 78723.0108583	total: 65ms	remaining: 52.4s
100:	learn: 73723.6344794	total: 6.07s	remaining: 42.4s
200:	learn: 71986.9528371	total: 12.3s	remaining: 37s
300:	learn: 70984.8239832	total: 18.5s	remaining: 31.1s
400:	learn: 70272.0878961	total: 24.5s	remaining: 24.8s
500:	learn: 69641.4884715	total: 30.4s	remaining: 18.6s
600:	learn: 69050.3427454	total: 36.5s	remaining: 12.5s
700:	learn: 68461.2957858	total: 42.4s	remaining: 6.41s
800:	learn: 67913.0738435	total: 48.8s	remaining: 366ms
806:	learn: 67886.8153215	total: 49.2s	remaining: 0us


In [60]:
preds = model.predict(df_test)
preds = pd.Series(preds, index=df_test.index, name="price")
preds

id
188533    16527.424399
188534    75408.506828
188535    53117.701111
188536    25001.755683
188537    28930.847307
              ...     
314218    27612.121729
314219    45177.894779
314220    22713.542053
314221    17397.765966
314222    37653.946978
Name: price, Length: 125690, dtype: float64

In [None]:
preds.to_csv("submission.csv")