In [None]:
!pip install autogluon.features

In [None]:
import numpy as np 
import pandas as pd 
import optuna

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator #Does autogluon's feature engineering

In [None]:
#Read both competition files, using the "id" column as the index of each frame
df_train, df_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/playground-series-s4e9/train.csv",
        "/kaggle/input/playground-series-s4e9/test.csv",
    )
)

In [None]:
def clean(df):
    """Normalise two categorical columns of ``df`` in place.

    * ``fuel_type``: the value "Plug-In Hybrid" is merged into "Hybrid".
    * ``clean_title``: missing values become "No".

    The frame is modified in place and nothing is returned.
    """
    merged_fuel = df["fuel_type"].replace({"Plug-In Hybrid": "Hybrid"})
    df["fuel_type"] = merged_fuel

    #Missing titles are filled with "No" so the feature can be treated as binary
    filled_title = df["clean_title"].fillna(value="No")
    df["clean_title"] = filled_title
    
def extract_features(df):
    """Derive engine and transmission features from text columns, in place.

    Added columns:

    * ``engine_horsepower`` (float): number preceding "HP" in ``engine``.
    * ``engine_liters`` (float): number preceding "L" in ``engine``.
    * ``engine_cylinders`` (float): number preceding "Cylinder", or following
      "V" (as in "V6"), in ``engine``.
    * ``transimssion_dct`` (bool): whether ``transmission`` contains "dual"
      (case-insensitive). The spelling of the column name is kept as is so
      that existing references to it keep working.

    Rows where a pattern is not found get NaN in the numeric columns.
    The ``engine`` column is dropped; ``df`` is modified in place and nothing
    is returned.
    """
    engine = df["engine"]

    #Engine features
    df["engine_horsepower"] = engine.str.extract(r'(\d+\.\d+|\d+)\s*HP', expand=False).astype(float)
    df["engine_liters"] = engine.str.extract(r'(\d+\.\d+|\d+)\s*L', expand=False).astype(float)

    #Two capture groups -> two columns; only one of them is filled for a given row
    cylinders = engine.str.extract(r'(\d+)\s*Cylinder|V(\d+)')
    #Cast to float like the other engine features; the raw matches are strings,
    #which would otherwise be handled as categories ("10" < "4") downstream
    df["engine_cylinders"] = cylinders[0].fillna(cylinders[1]).astype(float)

    df.drop("engine", axis=1, inplace=True)

    #Transmission features
    #na=False keeps the flag strictly boolean when the transmission is missing
    df["transimssion_dct"] = df["transmission"].str.contains('Dual', case=False, na=False)

def preprocess(df_train, df_test):
    """Apply ``clean`` and ``extract_features`` to train and test together.

    Both frames are stacked (train rows first), processed as one frame, and
    split back into two frames.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw train and test frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` after processing. Both carry the union of the
        input columns, so a column present in only one input is NaN in the
        other output.
    """
    n_train = len(df_train)
    df = pd.concat([df_train, df_test], axis=0)

    clean(df)
    extract_features(df)

    #Split by position instead of by index label: a label lookup would return
    #rows from both inputs whenever an id appears in train and test
    df_train = df.iloc[:n_train].copy()
    df_test = df.iloc[n_train:].copy()
    return df_train, df_test

#Rebind both frames to their cleaned, feature-extracted versions
df_train, df_test = preprocess(df_train, df_test)

In [None]:
RANDOM_SEED = 31415 #Seed passed to train_test_split below
#Holdout set: 10% of the rows go to df_holdout and df_train is rebound to the rest,
#so re-running this cell alone splits the already reduced df_train again
df_train, df_holdout = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
#Autogluon preprocessing
#fit_transform/transform return new frames instead of modifying their input,
#so the results are kept; otherwise the generator has no effect on the data
autogluon_pipeline = AutoMLPipelineFeatureGenerator()

#Fit on the training features only; the target is held out of the generator
train_features = autogluon_pipeline.fit_transform(df_train.drop(columns="price"))
df_train = train_features.join(df_train["price"])

holdout_features = autogluon_pipeline.transform(df_holdout.drop(columns="price"))
df_holdout = holdout_features.join(df_holdout["price"])

#df_test can carry a price column from the earlier train/test concat; it is not a feature
df_test = autogluon_pipeline.transform(df_test.drop(columns="price", errors="ignore"))

df_holdout.head()

In [None]:
def encode_and_impute(df_train, df_holdout, df_test):
    """Label-encode categorical columns and impute missing feature values.

    The three frames are stacked (train, holdout, test), processed together
    and split back by position.

    * Every ``category``/``object`` column is cast to ``str`` and replaced by
      integer codes from a ``LabelEncoder`` fitted on all rows, so a missing
      value becomes its own category.
    * Remaining missing feature values are filled with the most frequent
      value of the column, learned from the training rows only so that the
      holdout and test rows do not influence the fill values. This assumes
      every feature has at least one observed value in the training rows.
    * The ``price`` column is the target and is left untouched.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_holdout, df_test)``; ``price`` is removed from the
        returned test frame.
    """
    #Record the split sizes up front so the slicing never depends on rebound names
    n_train, n_holdout = len(df_train), len(df_holdout)
    df = pd.concat([df_train, df_holdout, df_test], axis=0)

    #Encoders are fitted on all rows so every category in any split gets a code
    for col in df.select_dtypes(include=["category", "object"]).columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    feature_cols = [col for col in df.columns if col != "price"]
    #Fall back to all rows only when there is no training row to learn from
    fit_rows = df.iloc[:n_train] if n_train else df
    imputer = SimpleImputer(strategy='most_frequent')
    imputer.fit(fit_rows[feature_cols])
    df[feature_cols] = imputer.transform(df[feature_cols])

    df_train = df.iloc[:n_train]
    df_holdout = df.iloc[n_train:n_train + n_holdout]
    df_test = df.iloc[n_train + n_holdout:].drop("price", axis=1, errors="ignore")

    return df_train, df_holdout, df_test

#Rebind the three splits to their label-encoded, imputed versions
df_train, df_holdout, df_test = encode_and_impute(df_train, df_holdout, df_test)


In [None]:
def score(df, model=None):
    """Return the 5-fold cross-validated RMSE of ``model`` on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column ``price``. Not modified.
    model : estimator, optional
        Estimator handed to ``cross_val_predict``. A fresh ``XGBRegressor()``
        is created when omitted.

    Returns
    -------
    float
        Square root of the mean squared error between ``price`` and the
        out-of-fold predictions.
    """
    #An XGBRegressor() default in the signature is built once, at definition
    #time, and shared by every call; build it per call instead
    if model is None:
        model = XGBRegressor()

    X = df.drop("price", axis=1)
    y = df["price"]
    preds = cross_val_predict(model, X, y, cv=5, n_jobs=-1)
    return np.sqrt(mean_squared_error(y, preds))

def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and score them.

    The sampled configuration is evaluated with ``score`` on ``df_train`` and
    the resulting cross-validated RMSE is returned.
    """
    sampled_params = {
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 5e-3, 1e-1, log=True),
        "max_leaves": trial.suggest_int("max_leaves", 10, 50),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        #Fixed settings, not part of the search
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
    }
    candidate = XGBRegressor(n_jobs=-1, **sampled_params)
    return score(df_train, candidate)

#Seed the sampler so repeated runs propose the same sequence of hyperparameters
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
#Stops after 200 trials or 11.5 hours, whichever comes first
study.optimize(objective, n_trials=200, timeout=11.5*3600)

In [None]:
#Fit on the training split with the best hyperparameters found by the study
y_train = df_train["price"]
X_train = df_train.drop("price", axis=1)
model = XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
model.save_model("xgboost_model.json")

#Score the fitted model on the holdout split
y_holdout = df_holdout["price"]
X_holdout = df_holdout.drop("price", axis=1)
preds = model.predict(X_holdout)
rmse = np.sqrt(mean_squared_error(y_holdout, preds))

print(f"Holdout RMSE: {rmse}")

In [None]:
#Predict the test rows and label the predictions with the test index
preds = pd.Series(model.predict(df_test), index=df_test.index, name="price")
preds

In [None]:
#Write the predictions, together with their index, to submission.csv
preds.to_csv("submission.csv")