# Benchmarking with fe_engine v1

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import category_encoders as ce

In this notebook, my goal is to benchmark multiple regression models on the dataset while doing only the minimum data cleaning required.


In [2]:
# Read the competition train/test splits; the "id" column becomes each frame's index.
og_train, og_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "/kaggle/input/playground-series-s4e9/train.csv",
        "/kaggle/input/playground-series-s4e9/test.csv",
    )
)

In [3]:
# Pattern used to split the free-text "engine" description into components.
# Every component is optional, so the pattern always matches; components that
# are absent yield a missing value for their group. Capture groups, in order:
#    1. number immediately followed by "HP"
#    2. number followed by "L" or " Liter"
#    3. cylinder layout token: optional "Straight", then one of "V<n>"
#       (optionally followed by "Cylinder"), "<n> Cylinder", "I<n>", "H<n>",
#       "Flat<n>" / "Flat <n>", or "W<n>"
#    4. integer immediately followed by "V"
#    5. one of DDI, GDI, PFI, MPFI, SIDI, PDI
#    6. one of OHV, DOHC, SOHC
#    7. one of Turbo, Twin Turbo, Supercharged, TFSI, T/C, TSI
#    8. number followed by "Ah" or "AH"
#    9. number between "FR" and "kW"
#   10. number between "RR" and "kW"
#   11. integer inside parentheses, optionally followed by "V"
#   12. whatever text remains after the recognised components
# A literal "Engine" word between groups 7 and 8 is consumed but not captured.
engine_re = r'(?:(\d+\.?\d*)HP)? ?(?:(\d+\.?\d*)(?:L| Liter))? ?((?:Straight)? ?(?:V\d+ ?(?:Cylinder)?|\d+ Cylinder|I\d+|H\d+|Flat ?\d+|W\d+))? ?(?:(\d+)V)? ?(DDI|GDI|PFI|MPFI|SIDI|PDI)? ?(OHV|DOHC|SOHC)? ?(Turbo|Twin Turbo|Supercharged|TFSI|T\/C|TSI)? ?(?:Engine)? ?(?:(\d+\.?\d*) ?A[hH])?[ \/]*(?:FR ?(\d+\.?\d*)kW)?[ \/]*(?:RR ?(\d+\.?\d*)kW)?[ \/]?(?:\((\d+)V?\))?(.*)'

In [4]:
def extract_engine_features(df):
    """Replace the raw "engine" text column with structured engine features.

    The "engine" column is parsed with the module-level ``engine_re`` pattern;
    each capture group becomes one new column, and the numeric ones are cast
    to float. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "engine" string column.

    Returns
    -------
    pandas.DataFrame
        New frame with the parsed columns joined on the index and the
        original "engine" column dropped.
    """
    # One name per capture group of engine_re, in group order.
    column_names = [
        "HP", "L", "Cylinders", "Valves", "Fuel_injection", "Valve_train_design",
        "Forced_induction", "Ah", "FRKW", "RRKW", "Volts", "Engine_type",
    ]
    # Groups that capture digits only; unmatched groups become NaN after the cast.
    numeric_columns = ["HP", "L", "Valves", "Ah", "FRKW", "RRKW", "Volts"]

    parsed = df["engine"].str.extract(engine_re)
    parsed.columns = column_names
    parsed = parsed.astype({column: float for column in numeric_columns})

    return df.join(parsed).drop(columns="engine")

In [5]:
def fe_accident(df):
    """Impute and binarise the "accident" column.

    Missing values are treated as an accident when "milage" exceeds 175,000
    and as no accident otherwise; the two known labels are then mapped to
    1 / 0. Any other label maps to NaN. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "accident" and "milage" columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with "accident" encoded numerically.
    """
    reported = "At least 1 accident or damage reported"
    not_reported = "None reported"

    out = df.copy()

    # Author's observation: more than 50% of vehicles with mileage > 175k
    # have an accident on record, so unknowns above that are flagged.
    unknown_high_mileage = out["accident"].isna() & (out["milage"] > 175_000)
    out.loc[unknown_high_mileage, "accident"] = reported

    out["accident"] = (
        out["accident"]
        .fillna(not_reported)
        .map({not_reported: 0, reported: 1})
    )
    return out

In [6]:
def fe_clean_title(df):
    """Encode "clean_title" as 0/1, treating missing values as "No".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "clean_title" column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where "clean_title" is 1 for "Yes" and 0 for "No" or
        missing; the input frame is not modified.
    """
    out = df.copy()
    out["clean_title"] = out["clean_title"].fillna("No").map({"No": 0, "Yes": 1})
    return out

In [7]:
def feature_engineering(train, test):
    """Apply every feature-engineering step to both data splits.

    The steps run in a fixed order (engine parsing, accident encoding,
    clean-title encoding), each applied to train and then to test.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw splits; neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(fe_train, fe_test)`` with all steps applied.
    """
    steps = (extract_engine_features, fe_accident, fe_clean_title)

    fe_train, fe_test = train.copy(), test.copy()
    for step in steps:
        fe_train = step(fe_train)
        fe_test = step(fe_test)

    return fe_train, fe_test
    

In [8]:
# Build the engineered frames from the raw splits.
train, test = feature_engineering(og_train, og_test)

# Preview rows with no fuel type whose leftover engine text is empty or a dash.
train.loc[
    train["fuel_type"].isna() & train["Engine_type"].isin(["", '–'])
].head()

In [9]:
# Column groups consumed by the preprocessing ColumnTransformer.

# Only forwarded when the passthrough flag is enabled.
unused_cols = ["price"]

# Categoricals encoded as integer codes.
ordinal_cols = ["model", "ext_col", "int_col"]

# Numeric columns whose gaps are filled with the column mean.
num_cols_mean = ["model_year", "milage", "HP"]

# Numeric columns whose gaps are filled with 0.
num_cols_zero = ["L", "Valves", "Ah", "FRKW", "RRKW", "Volts"]

# Columns routed to the target encoder (none at the moment).
target_encode_cols = []

# Categoricals expanded into one-hot indicator columns.
one_hot_cols = [
    "fuel_type",
    "transmission",
    "Cylinders",
    "Fuel_injection",
    "Valve_train_design",
    "Forced_induction",
    "Engine_type",
]

In [10]:
# Numeric columns: fill gaps with the column mean.
num_mean_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])
# Numeric columns: fill gaps with 0.
num_zero_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0))
])

# So we keep an order in the brand encoding: brands are ranked by their mean
# price, cheapest first, and that rank becomes the ordinal code.
# NOTE: the ranking is computed once from the full training frame rather than
# inside each CV fold, so cross-validated scores see a small amount of target
# information through this column.
mean_brand_prices = train.groupby("brand")["price"].mean()
brand_sorted = mean_brand_prices.sort_values(ascending=True)
brand_sorted = brand_sorted.reset_index()
sorted_brands = brand_sorted["brand"].tolist()

# The category list is fixed up front, so a brand that is missing from it would
# make the encoder raise; encode such brands as -1 instead, the same convention
# as the other ordinal columns.
brand_pipeline = Pipeline(steps=[
    ("encoder", OrdinalEncoder(
        categories=[sorted_brands],
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    ))
])

# Categoricals -> integer codes; categories unseen at fit time become -1.
cat_ord_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder_ord", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])
# Categoricals -> dense one-hot columns; unseen categories encode as all zeros.
cat_one_hot_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder_onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
target_encode_pipeline = Pipeline(steps=[
    ("target_encoder", ce.TargetEncoder())
])

# When True, the columns in unused_cols are forwarded untouched as the last
# output columns (useful for inspecting the transformed frame).
passthrough = False

# Single transformer list shared by both modes; the passthrough entry is the
# only difference between them.
transformers = [
    ("num_mean", num_mean_pipeline, num_cols_mean),
    ("num_zero", num_zero_pipeline, num_cols_zero),
    ("target_enc", target_encode_pipeline, target_encode_cols),
    ("cat_ord", cat_ord_pipeline, ordinal_cols),
    ("cat_onehot", cat_one_hot_pipeline, one_hot_cols),
    ("brand", brand_pipeline, ["brand"]),
]
if passthrough:
    transformers.append(("passthrough", "passthrough", unused_cols))

preprocessor = ColumnTransformer(transformers=transformers)

In [11]:
if passthrough:
    # Inspection only: transform the whole training frame and label the
    # resulting columns so the output can be viewed as a DataFrame.
    preprocessed_train = preprocessor.fit_transform(train, train["price"])

    one_hot_encoder = preprocessor.named_transformers_["cat_onehot"]
    onehot_features_names = one_hot_encoder.get_feature_names_out(one_hot_cols)

    # Labels are listed in the same order as the ColumnTransformer's transformers.
    all_feature_names = [
        *num_cols_mean,
        *num_cols_zero,
        *target_encode_cols,
        *ordinal_cols,
        *onehot_features_names,
        "brand",
        *unused_cols,
    ]

    print(f"all features: {len(all_feature_names)}")
    print(f"preprocessed shape: {preprocessed_train.shape}")

    preprocessed_train_df = pd.DataFrame(preprocessed_train, columns=all_feature_names)


# Benchmarking CV

# Number of cross-validation folds used when scoring each model.
n_folds = 3

# Target vector and feature matrix (everything except the target).
y = train["price"]
X = train.drop(columns="price")

In [12]:
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor, RidgeCV, BayesianRidge, Ridge, LinearRegression, LassoCV, ElasticNetCV, LassoLarsCV, LassoLarsIC, LarsCV, Lars, SGDRegressor, RANSACRegressor, ElasticNet, Lasso, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.dummy import DummyRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

import time

# Full catalogue of candidate regressors, all with default hyper-parameters.
model_list = [
    ('AdaBoostRegressor', AdaBoostRegressor()),
    ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor()),
    ('RidgeCV', RidgeCV()),
    ('BayesianRidge', BayesianRidge()),
    ('Ridge', Ridge()),
    ('LinearRegression', LinearRegression()),
    ('TransformedTargetRegressor', TransformedTargetRegressor()),
    ('LassoCV', LassoCV()),
    ('ElasticNetCV', ElasticNetCV()),
    ('LassoLarsCV', LassoLarsCV()),
    ('LassoLarsIC', LassoLarsIC()),
    ('LarsCV', LarsCV()),
    ('Lars', Lars()),
    ('RANSACRegressor', RANSACRegressor()),
    ('ElasticNet', ElasticNet()),
    ('Lasso', Lasso()),
    ('OrthogonalMatchingPursuitCV', OrthogonalMatchingPursuitCV()),
    ('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()),
    ('DummyRegressor', DummyRegressor()),
    ('XGBRegressor', xgb.XGBRegressor()),
    ('LGBMRegressor', lgb.LGBMRegressor()),
    ('CatBoostRegressor', CatBoostRegressor(silent=True))
]

# Gradient-boosting subset that is actually benchmarked below.
reduced_model_list = [
    ('XGBRegressor', xgb.XGBRegressor()),
    ('LGBMRegressor', lgb.LGBMRegressor()),
    ('CatBoostRegressor', CatBoostRegressor(silent=True)),
    ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('HistGradientBoostingRegressor', HistGradientBoostingRegressor())
]

# One record per model that evaluated successfully. The results frame is built
# once at the end so its numeric columns get numeric dtypes instead of being
# appended row by row to an empty object-typed frame.
cv_records = []

for name, model in reduced_model_list:
    st = time.perf_counter()
    print(f"Evaluating {name}")
    try:
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

        # The scorer returns negated MSE per fold; flip the sign, take the
        # per-fold root, then average across folds.
        cv_mse = -cross_val_score(pipeline, X, y, scoring="neg_mean_squared_error", cv=n_folds)
        cv_rmse = np.sqrt(cv_mse).mean()

        time_taken = time.perf_counter() - st

        print(f"{name} took {time_taken}s")
        # Report the configured fold count rather than a hard-coded "3".
        print(f"{name} {n_folds}CV_rmse: {cv_rmse}")

        cv_records.append({"model": name, "rmse": cv_rmse, "time_taken": time_taken})
    except Exception as e:
        # Best-effort benchmark: report the failure and move on to the next model.
        print(f"{name} Failed to execute: {e}")

results_df = pd.DataFrame(cv_records, columns=["model", "rmse", "time_taken"])
results_df.sort_values(by="rmse")

# Checking train rmse vs eval rmse

# Draw one hold-out split and reuse it for every model, with a fixed seed, so
# the train/eval gap is comparable across models and reproducible across runs.
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.05, random_state=42)

for name, model in reduced_model_list:
    print(f"Evaluating {name}")
    try:
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

        pipeline.fit(X_train, y_train)

        y_train_preds = pipeline.predict(X_train)
        y_eval_preds = pipeline.predict(X_eval)

        rmse_train = np.sqrt(mean_squared_error(y_train, y_train_preds))
        rmse_eval = np.sqrt(mean_squared_error(y_eval, y_eval_preds))
    except Exception as e:
        print(f"{name} Failed to execute: {e}")
    else:
        # Report only when this model actually produced scores; otherwise the
        # prints would raise NameError or repeat the previous model's numbers.
        print(f"Train rmse: {rmse_train}")
        print(f"Eval rmse: {rmse_eval}")

# Submission

In [13]:
# Final fit uses the full training frame: target vector and feature matrix.
y = train["price"]
X = train.drop(columns="price")

In [14]:
# Submission model: default-parameter LightGBM behind the shared preprocessor.
model = lgb.LGBMRegressor()
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

In [15]:
# Fit the preprocessing + model pipeline on all training rows.
pipeline.fit(X,y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1447
[LightGBM] [Info] Number of data points in the train set: 188533, number of used features: 117
[LightGBM] [Info] Start training from score 43878.016178


In [16]:
# Predict prices for the engineered test frame with the fitted pipeline;
# the bare expression displays the prediction array as the cell output.
preds = pipeline.predict(test)
preds

array([18541.44969631, 77094.1692001 , 70835.09288285, ...,
       23626.275975  , 16560.13415213, 38013.05879199])

In [17]:
# Submission frame: one "price" prediction per test row, indexed by the test
# frame's "id" index, written to the Kaggle working directory.
submission_file = pd.DataFrame({"price": preds}, index=test.index)
submission_file.to_csv("/kaggle/working/submission.csv")