## Install all Libraries

In [None]:
# Refresh the package index, then install the chromedriver package from apt.
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the driver binary into /usr/bin, the location the scraping cell points Selenium at.
# NOTE(review): the source path depends on the runtime image — confirm it exists.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# Python packages used by the scraping, modelling and tuning cells.
!pip install selenium
!pip install catboost
!pip install --upgrade scikit-learn
!pip install optuna

In [None]:
import sys
import pandas as pd
import time
import re
from random import randint
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from sklearn.preprocessing import PowerTransformer, RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from scipy.stats import boxcox
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#model
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

#crosval
from sklearn.model_selection import cross_val_score, KFold

#Metrics
from sklearn.metrics import mean_squared_error

from google.colab import files
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

## Extracting Car Listings from Carsome

In [None]:
# Scrape the Carsome listing pages into a DataFrame, one row per listing card.
# Imported locally because the notebook's import cell does not provide it.
from selenium.common.exceptions import TimeoutException

# Set up a headless Chrome driver.
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("user-agent=Mozilla/5.0")
service = Service('/usr/bin/chromedriver')
# Pass the service explicitly so the driver binary configured above is actually used.
driver = webdriver.Chrome(service=service, options=options)

# Column accumulators: each successfully parsed card appends one value to every list.
year_brand, model, total_price, price_per_month = [], [], [], []
mileage, transmission, location, highlight, links = [], [], [], [], []

try:
    # Walk the listing pages.
    for page in range(1, 103):
        # NOTE(review): "%7Bpage%7D" is the URL-encoded literal "{page}"; only
        # `pageNo` carries the real page number — confirm the site ignores `page`.
        url = f"https://www.carsome.my/buy-car?page=%7Bpage%7D&pageNo={page}"
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "mod-b-card"))
            )
        except TimeoutException:
            # No card appeared within 10 s: skip this page but keep going.
            print(f"❌ Gagal load data mobil di page {page}")
            continue

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        car_data = soup.find_all('article', class_='mod-b-card')

        print(f"📄 Page {page}: {len(car_data)} cars found")
        if not car_data:
            break

        for store in car_data:
            try:
                # Title: year + brand + model, plus the listing link.
                title_tag = store.find('a', class_='mod-b-card__title')
                if title_tag:
                    full_text = title_tag.get_text(strip=True)
                    parts = full_text.split()
                    year = parts[0]
                    brand_model = " ".join(parts[1:])
                    link = "https://www.carsome.my" + title_tag['href']
                else:
                    year, brand_model, link = "-", "-", "-"

                # Total price: text of the last <span> in the price container.
                price_total = store.find('div', class_='mod-card__price')
                price = price_total.find_all('span')[-1].text.strip() if price_total else "-"

                # Monthly price: keep only the part before the first "/".
                ppm_tag = store.find('div', class_='mod-tooltipMonthPay')
                ppm_raw = ppm_tag.get_text(strip=True) if ppm_tag else "-"
                ppm = ppm_raw.split("/")[0].strip() if "/" in ppm_raw else ppm_raw

                # Mileage and transmission: first and second <span> of the block.
                car_other = store.find('div', class_='mod-b-card__car-other')
                spans = car_other.find_all('span') if car_other else []
                km = spans[0].text.strip() if len(spans) > 0 else "-"
                trans = spans[1].text.strip() if len(spans) > 1 else "-"

                # Location.
                loc_tag = store.find('div', class_='mod-b-card__car-location')
                loc = loc_tag.text.strip() if loc_tag else "-"

                # Highlight tag (Family Drive, Daily Drive, etc.).
                tag = store.find('div', class_='mod-car-tagging')
                span_tag = tag.find('span') if tag else None
                hl = span_tag.get_text(strip=True) if span_tag else "-"

                # Append only after every field parsed, so the lists stay aligned.
                year_brand.append(year)
                model.append(brand_model)
                total_price.append(price)
                price_per_month.append(ppm)
                mileage.append(km)
                transmission.append(trans)
                location.append(loc)
                highlight.append(hl)
                links.append(link)

            except Exception as e:
                # Best effort: report the broken card and move on to the next one.
                print(f"⚠️ Gagal ambil 1 mobil: {e}")
                continue

        time.sleep(2)  # delay between pages
finally:
    # Always release the browser, even if scraping aborts part-way.
    driver.quit()

# Assemble the DataFrame.
df = pd.DataFrame({
    'Year': year_brand,
    'Model': model,
    'Total_Price': total_price,
    'Price_per_month': price_per_month,
    'Mileage': mileage,
    'Transmission': transmission,
    'Location': location,
    'Highlight': highlight,
    'URL': links
})

# Drop rows that share a listing URL.
print(f"✅ Total data sebelum hapus duplikat: {len(df)}")
df.drop_duplicates(subset=['URL'], inplace=True)
print(f"✅ Total data setelah hapus duplikat: {len(df)}")

df.head()

In [None]:
# Write the scraped rows to CSV (without the index column).
df.to_csv('carsome_scraped_data.csv', index=False)

In [None]:
# Download the CSV through the Colab `files` helper.
files.download("carsome_scraped_data.csv")

In [None]:
# Reload the data from disk.
# NOTE(review): assumes the CSV written above ended up under /content — confirm.
df = pd.read_csv('/content/carsome_scraped_data.csv')

## Data Check

In [None]:
# Display the frame.
df

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Second read of the same file; resets df to the on-disk contents.
df = pd.read_csv('/content/carsome_scraped_data.csv')

## Data Cleaning

1. brand

In [None]:
df['Model'].head()

In [None]:
df['Brand'] = df['Model'].str.split().str[0]

In [None]:
df['Brand'].head()

2. model

In [None]:
df['Model'].head()

In [None]:
df['Model'] = df['Model'].str.split().str[1:].apply(lambda x: ' '.join(x))
df['Model'] = df['Model'].str.lower()

In [None]:
df['Model']

3. Total_Price

In [None]:
df['Total_Price'].head()

In [None]:
df['Total_Price'] = df['Total_Price'].str.replace(r'[^\d.]', '', regex=True).astype(float)
df.columns = df.columns.str.replace('Total_Price', 'Total_Price(RM)')

In [None]:
df['Total_Price(RM)'].head()

4. Mileage

In [None]:
df['Mileage'].head()

In [None]:
df['Mileage'] = df['Mileage'].str.replace(r'km', '', regex=True)
df['Mileage'] = df['Mileage'].str.replace(',', '').astype(float)
df.columns = df.columns.str.replace('Mileage', 'Mileage(km)')

In [None]:
df['Mileage(km)'].head()

5. Transmission

In [None]:
df['Transmission'].head()

In [None]:
df['Transmission'] = df['Transmission'].str.replace('Manual', 'M')
df['Transmission'] = df['Transmission'].str.replace('Automatic', 'A')

In [None]:
df['Transmission']

6. Location

In [None]:
df['Location'].head()

In [None]:
df['Location'] = df['Location'].str.lower()

In [None]:
df['Location']

7. Highlights

In [None]:
df['Highlight'].head()

In [None]:
df['Highlight'] = df['Highlight'].str.replace(r'View 360', 'View 360°', regex=True)
df['Highlight'] = df['Highlight'].str.lower()

In [None]:
df['Highlight'].head()

8. Price per Month


In [None]:
df['Price_per_month'].head()

In [None]:
df['Price_per_month'] = df['Price_per_month'].str.replace(r'[^\d]', '', regex=True)
df['Price_per_month'] = df['Price_per_month'].str.replace(',', '').astype(float)
df.columns = df.columns.str.replace('Price_per_month', 'Price_per_month(RM)')

In [None]:
df['Price_per_month(RM)'].head()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts after cleaning.
df.info()

In [None]:
# NOTE(review): the result is only displayed, not assigned, so df keeps its NaN rows here.
df.dropna()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

## Data Visualization

In [None]:
df.head()

In [None]:
sorted = ['Year', 'Brand', 'Model', 'Mileage(km)', 'Transmission', 'Highlight', 'Location', 'Price_per_month(RM)', 'Total_Price(RM)']
df = df[sorted]

In [None]:
# Display the reordered frame.
df

In [None]:
# Write the cleaned data to CSV and download it through the Colab `files` helper.
df.to_csv('Clean_Carsome_Data.csv', index=False)
files.download("Clean_Carsome_Data.csv")

In [None]:
# Keep the columns used from here on; 'Price_per_month(RM)' is not in this list.
Select_Features = ['Year', 'Brand', 'Model', 'Mileage(km)', 'Transmission', 'Highlight', 'Location', 'Total_Price(RM)']
df = df[Select_Features]

In [None]:
df.head()

In [None]:
# Distinct location values.
df['Location'].unique()

In [None]:
df.info()

In [None]:
# Split into numeric and non-numeric sub-frames.
# NOTE(review): numerical_col is reassigned to a list of column names in a later cell.
numerical_col = df.select_dtypes(include=[np.number])
categorical_col = df.select_dtypes(exclude=[np.number])

In [None]:
df['Year'].plot(
    kind='hist',
    color='pink',
    edgecolor='black',
    bins=5,
    figsize=(16,8)
)

plt.ticklabel_format(style='plain', axis='x')
plt.xlabel('Year', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.title('Distribution of Year', fontdict={'fontsize': 24})
plt.grid(True)
plt.show()

In [None]:
df.groupby('Brand').size().plot.bar(figsize=(16, 8))
plt.title('Distribution of Car Brand', fontsize=20)
plt.xlabel('Brand', fontsize=14)
plt.ylabel('Jumlah', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df.groupby('Model').size().plot.bar(figsize=(16, 8))
plt.title('Distribution of Car Model', fontsize=20)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Jumlah', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['Mileage(km)'].plot(
    kind='hist',
    color='pink',
    edgecolor='black',
    bins=5,
    figsize=(16,8)
)

plt.ticklabel_format(style='plain', axis='x')
plt.xlabel('Mileage (km)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.title('Distribution of Mileage', fontdict={'fontsize': 24})
plt.grid(True)
plt.show()

In [None]:
df['Transmission'].value_counts().plot(kind='pie', label='', legend=True, autopct='%1.1f%%', figsize=(14,10))
plt.title("Percentage of Transmission", fontdict={'fontsize': 24})
plt.show()

In [None]:
df.groupby('Location').size().plot.bar(figsize=(16, 8))
plt.title('Distribution Location', fontsize=20)
plt.xlabel('Location', fontsize=14)
plt.ylabel('Jumlah', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
df['Total_Price(RM)'].plot(
    kind='hist',
    color='pink',
    edgecolor='black',
    bins=5,
    figsize=(16,8)
)

plt.ticklabel_format(style='plain', axis='x')
plt.xlabel('Total Price (RM)', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.title('Distribution of Car Total Price', fontdict={'fontsize': 24})
plt.grid(True)
plt.show()

In [None]:
# Names of the int64/float64 columns, as a plain list.
numerical_col = list(df.select_dtypes(include=['int64', 'float64']).columns)

# One box plot per numeric column, side by side, each with its own axes.
box_options = {
    'subplots': True,
    'layout': (1, len(numerical_col)),
    'sharex': False,
    'sharey': False,
    'figsize': (20, 6),
    'color': 'blue',
}
df[numerical_col].plot(kind='box', **box_options)

plt.tight_layout()
plt.show()

## Data Transformation

1. Drop Outliers

In [None]:
def remove_outliers_iqr(df, columns):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    Columns are handled in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns. Rows whose value
    in a processed column is NaN are dropped too, because they fail the range
    check. The input frame is not modified; the filtered frame is returned.
    """
    kept = df
    for name in columns:
        q_low = kept[name].quantile(0.25)
        q_high = kept[name].quantile(0.75)
        spread = q_high - q_low
        # `between` is inclusive on both ends, so values on a fence are kept.
        inside = kept[name].between(q_low - 1.5 * spread, q_high + 1.5 * spread)
        kept = kept[inside]
    return kept

df = remove_outliers_iqr(df, numerical_col)

In [None]:
numerical_col = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Sekarang plot boxplot
df[numerical_col].plot(
    kind='box',
    subplots=True,
    layout=(1, len(numerical_col)),
    sharex=False,
    sharey=False,
    figsize=(20, 6),
    color='blue'
)

plt.tight_layout()
plt.show()

In [None]:
# Column dtypes and non-null counts after the outlier filter.
df.info()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Drop every row that still has a missing value (assigned back this time).
df = df.dropna()

In [None]:
df

# Modelling

In [None]:
# Columns routed to the numeric and categorical branches of the preprocessor.
# NOTE(review): 'Highlight' is in df but in neither list — confirm it is meant to be unused.
num_features = ['Year', 'Mileage(km)']
cat_features = ['Brand', 'Model', 'Transmission', 'Location']

In [None]:
# Features are every column except the target; the target is the total price.
X = df.drop(['Total_Price(RM)'], axis=1)
y = df['Total_Price(RM)']

In [None]:
# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Train on log1p(price); y_test stays on the original scale.
# Re-running this cell applies log1p again to the already transformed values.
y_train = np.log1p(y_train)

In [None]:
# Numeric branch: Box-Cox power transform, then mean imputation, then robust scaling.
num_transformer = Pipeline([
    ('power', PowerTransformer(method='box-cox')),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Categorical branch: ordinal codes, with categories unseen during fit encoded as -1.
cat_transformer = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Route each feature list to its branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

In [None]:
def _with_preprocessor(regressor):
    """Return a pipeline that runs the shared preprocessor, then `regressor`."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', regressor),
    ])


# One pipeline per base regressor, all seeded with 42.
pipe_RFR = _with_preprocessor(RandomForestRegressor(random_state=42))
pipe_XGB = _with_preprocessor(XGBRegressor(random_state=42))
pipe_LGBM = _with_preprocessor(LGBMRegressor(random_state=42))
pipe_CatB = _with_preprocessor(CatBoostRegressor(verbose=0, random_state=42))

In [None]:
# Random forest: fit on the training split.
pipe_RFR.fit(X_train, y_train)

In [None]:
# expm1 undoes the log1p applied to y_train, putting predictions on y_test's scale.
y_pred_RFR = np.expm1(pipe_RFR.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_RFR))
print("Test RMSE:", rmse)

In [None]:
# XGBoost: same fit / predict / score sequence.
pipe_XGB.fit(X_train, y_train)

In [None]:
y_pred_XGB = np.expm1(pipe_XGB.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_XGB))
print("Test RMSE:", rmse)

In [None]:
# LightGBM: same fit / predict / score sequence.
pipe_LGBM.fit(X_train, y_train)

In [None]:
y_pred_LGBM = np.expm1(pipe_LGBM.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_LGBM))
print("Test RMSE:", rmse)

In [None]:
# CatBoost: same fit / predict / score sequence.
pipe_CatB.fit(X_train, y_train)

In [None]:
y_pred_CatB = np.expm1(pipe_CatB.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_CatB))
print("Test RMSE:", rmse)

In [None]:
# Side-by-side test RMSE of the four baseline models.
rmse_RFR = np.sqrt(mean_squared_error(y_test, y_pred_RFR))
rmse_XGB = np.sqrt(mean_squared_error(y_test, y_pred_XGB))
rmse_LGBM = np.sqrt(mean_squared_error(y_test, y_pred_LGBM))
rmse_CatB = np.sqrt(mean_squared_error(y_test, y_pred_CatB))
print("Test RMSE RFR:", rmse_RFR)
print("Test RMSE XGB:", rmse_XGB)
print("Test RMSE LGBM:", rmse_LGBM)
print("Test RMSE CatB:", rmse_CatB)

Try building a stacking model

In [None]:
# Base learners for the stack: the four regressors with default settings and seed 42.
estimators = [
    ('rfr', RandomForestRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
    ('lgbm', LGBMRegressor(random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_state=42))
]

# Meta-learner: ordinary least squares.
meta_model = LinearRegression()

In [None]:
# passthrough=True also feeds the preprocessed features to the meta-learner;
# cv=5 sets the folds used to build the base-learner predictions it trains on.
stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

pipe_all = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stack_model)
])

In [None]:
pipe_all.fit(X_train, y_train)

In [None]:
# expm1 undoes the log1p applied to y_train.
y_pred_all = np.expm1(pipe_all.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_all))
print("Test RMSE:", rmse)

In [None]:
# Same base learners as the first stacking experiment.
estimators = [
    ('rfr', RandomForestRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
    ('lgbm', LGBMRegressor(random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_state=42))
]

# Meta-learner: Ridge with its default settings.
meta_model = Ridge()

In [None]:
stack_model1 = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

pipe_all1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stack_model1)
])

In [None]:
pipe_all1.fit(X_train, y_train)

In [None]:
# expm1 undoes the log1p applied to y_train.
y_pred_all1 = np.expm1(pipe_all1.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_all1))
print("Test RMSE:", rmse)

In [None]:
# Same base learners as the previous stacking experiments.
estimators = [
    ('rfr', RandomForestRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
    ('lgbm', LGBMRegressor(random_state=42)),
    ('cat', CatBoostRegressor(verbose=0, random_state=42))
]

# Meta-learner: Lasso with its default settings.
# NOTE(review): the default penalty may be strong relative to a log-scale target — confirm.
meta_model = Lasso()

In [None]:
stack_model2 = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

pipe_all2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stack_model2)
])

In [None]:
pipe_all2.fit(X_train, y_train)

In [None]:
# expm1 undoes the log1p applied to y_train.
y_pred_all2 = np.expm1(pipe_all2.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred_all2))
print("Test RMSE:", rmse)

## Tuning

In [None]:
'''TODO: Gunakan GridSearch untuk memperbaiki proses'''
%timeit max(range(100000))
hyperparameter_space = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
}

# Inisiasi Gridsearch
Tune_RFR = RandomizedSearchCV(pipe_RFR, hyperparameter_space,
                              n_iter=100, random_state=42,
                              scoring = 'accuracy', cv=5,
                              n_jobs=-1, refit = True,
                              verbose=2)

# Jalankan Gridsearch
Tune_RFR.fit(X_train, y_train)

In [None]:
Tune_RFR.best_params_, Tune_RFR.best_score_

In [None]:
'''TODO: Gunakan GridSearch untuk memperbaiki proses'''
%timeit max(range(100000))
hyperparameter_space = {
    'model__n_estimators': [100, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_lambda': [1, 1.5, 2],
    'model__reg_alpha': [0, 0.5, 1]
}

# Inisiasi Gridsearch
Tune_XGB = RandomizedSearchCV(pipe_XGB, hyperparameter_space,
                              n_iter=100, random_state=42,
                              scoring = 'accuracy', cv=5,
                              n_jobs=-1, refit = True,
                              verbose=2)

# Jalankan Gridsearch
Tune_XGB.fit(X_train, y_train)

In [None]:
Tune_XGB.best_params_, Tune_XGB.best_score_

In [None]:
'''TODO: Gunakan GridSearch untuk memperbaiki proses'''
%timeit max(range(100000))
hyperparameter_space = {
    'model__n_estimators': [100, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [-1, 10, 20],
    'model__num_leaves': [31, 50, 100],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_lambda': [0, 1, 2],
    'model__reg_alpha': [0, 1, 2]
}

# Inisiasi Gridsearch
Tune_LGBM = RandomizedSearchCV(pipe_LGBM, hyperparameter_space,
                              n_iter=100, random_state=42,
                              scoring = 'accuracy', cv=5,
                              n_jobs=-1, refit = True,
                              verbose=2)

# Jalankan Gridsearch
Tune_LGBM.fit(X_train, y_train)

In [None]:
Tune_LGBM.best_params_, Tune_LGBM.best_score_

In [None]:
'''TODO: Gunakan GridSearch untuk memperbaiki proses'''
%timeit max(range(100000))
hyperparameter_space = {
    'model__iterations': [300, 500, 800],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__depth': [4, 6, 10],
    'model__l2_leaf_reg': [1, 3, 5],
    'model__border_count': [32, 64, 128]
}

# Inisiasi Gridsearch
Tune_CatB = RandomizedSearchCV(pipe_CatB, hyperparameter_space,
                              n_iter=100, random_state=42,
                              scoring = 'accuracy', cv=5,
                              n_jobs=-1, refit = True,
                              verbose=2)

# Jalankan Gridsearch
Tune_CatB.fit(X_train, y_train)

In [None]:
Tune_CatB.best_params_, Tune_CatB.best_score_

In [None]:
# Recompute the baseline test RMSEs from the y_pred_* arrays produced earlier
# (these are the untuned pipelines' predictions, not the search results).
rmse_RFR = np.sqrt(mean_squared_error(y_test, y_pred_RFR))
rmse_XGB = np.sqrt(mean_squared_error(y_test, y_pred_XGB))
rmse_LGBM = np.sqrt(mean_squared_error(y_test, y_pred_LGBM))
rmse_CatB = np.sqrt(mean_squared_error(y_test, y_pred_CatB))
print("Test RMSE RFR:", rmse_RFR)
print("Test RMSE XGB:", rmse_XGB)
print("Test RMSE LGBM:", rmse_LGBM)
print("Test RMSE CatB:", rmse_CatB)

In [None]:
# Estimators with hard-coded hyper-parameters.
# NOTE(review): presumably copied from an earlier run of the searches above —
# re-derive them whenever the search setup or the data changes.
RFR = RandomForestRegressor(random_state=42, n_estimators= 100, min_samples_split= 5, min_samples_leaf= 2, max_features= 'log2', max_depth= 30)
XGB = XGBRegressor(random_state=42, subsample= 0.8, reg_lambda= 1, reg_alpha= 0.5, n_estimators= 500, max_depth= 5, learning_rate= 0.05, colsample_bytree= 1.0)
LGBM = LGBMRegressor(random_state=42, subsample= 1.0, reg_lambda= 0, reg_alpha= 0, num_leaves= 100, n_estimators= 500, max_depth= 20, learning_rate= 0.05, colsample_bytree= 0.6)
CatB = CatBoostRegressor(verbose=0, random_state=42, learning_rate= 0.01, l2_leaf_reg= 5, iterations= 800, depth= 4, border_count= 32)

In [None]:
def _tuned_pipeline(regressor):
    """Return preprocessor -> fresh RobustScaler -> `regressor` as one pipeline."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', RobustScaler()),
        ('model', regressor),
    ])


# One pipeline per tuned estimator.
# NOTE(review): unlike the baseline pipelines these add a second scaling step
# after the preprocessor — confirm that is intended.
tune_pipe_RFR = _tuned_pipeline(RFR)
tune_pipe_XGB = _tuned_pipeline(XGB)
tune_pipe_LGBM = _tuned_pipeline(LGBM)
tune_pipe_CatB = _tuned_pipeline(CatB)

In [None]:
tune_pipe_RFR.fit(X_train, y_train)
tune_pipe_XGB.fit(X_train, y_train)
tune_pipe_LGBM.fit(X_train, y_train)
tune_pipe_CatB.fit(X_train, y_train)

In [None]:
tune_pred_RFR = tune_pipe_RFR.predict(X_test)
tune_pred_XGB = tune_pipe_XGB.predict(X_test)
tune_pred_LGBM = tune_pipe_LGBM.predict(X_test)
tune_pred_CatB = tune_pipe_CatB.predict(X_test)

In [None]:
# Baseline test RMSEs (from y_pred_*) ...
rmse_RFR = np.sqrt(mean_squared_error(y_test, y_pred_RFR))
rmse_XGB = np.sqrt(mean_squared_error(y_test, y_pred_XGB))
rmse_LGBM = np.sqrt(mean_squared_error(y_test, y_pred_LGBM))
rmse_CatB = np.sqrt(mean_squared_error(y_test, y_pred_CatB))
# ... versus the tuned pipelines' test RMSEs (from tune_pred_*).
# Both sets of predictions must be on the same scale as y_test for this to be meaningful.
rmse_RFR_final = np.sqrt(mean_squared_error(y_test, tune_pred_RFR))
rmse_XGB_final = np.sqrt(mean_squared_error(y_test, tune_pred_XGB))
rmse_LGBM_final = np.sqrt(mean_squared_error(y_test, tune_pred_LGBM))
rmse_CatB_final = np.sqrt(mean_squared_error(y_test, tune_pred_CatB))
print("Test RMSE RFR:", rmse_RFR)
print("Test RMSE XGB:", rmse_XGB)
print("Test RMSE LGBM:", rmse_LGBM)
print("Test RMSE CatB:", rmse_CatB)
print("Test RMSE RFR final:", rmse_RFR_final)
print("Test RMSE XGB final:", rmse_XGB_final)
print("Test RMSE LGBM final:", rmse_LGBM_final)
print("Test RMSE CatB final:", rmse_CatB_final)

In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold CV RMSE of a stacked ensemble.

    Draws one value per hyper-parameter from fixed categorical grids for the
    four base learners and for the Ridge meta-learner's alpha, builds the
    preprocessing + stacking pipeline and scores it on X_train / y_train.

    Returns the mean RMSE across folds (the scorer yields negated RMSE, so
    the sign is flipped back).
    """
    def sample(prefix, grid):
        # Parameters are registered as '<prefix>__<name>', in grid order.
        return {
            key: trial.suggest_categorical(f'{prefix}__{key}', choices)
            for key, choices in grid.items()
        }

    # Search grids for the base learners.
    rfr_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    }
    xgb_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_lambda': [1, 1.5, 2],
        'reg_alpha': [0, 0.5, 1],
    }
    lgbm_grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [-1, 10, 20],
        'num_leaves': [31, 50, 100],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_lambda': [0, 1, 2],
        'reg_alpha': [0, 1, 2],
    }
    cat_grid = {
        'iterations': [300, 500, 800],
        'learning_rate': [0.01, 0.05, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5],
        'border_count': [32, 64, 128],
    }

    # Draw in a fixed order: rfr, xgb, lgbm, cat, then the meta-learner alpha.
    rfr_params = sample('rfr', rfr_grid)
    xgb_params = sample('xgb', xgb_grid)
    lgbm_params = sample('lgbm', lgbm_grid)
    cat_params = sample('cat', cat_grid)
    ridge_alpha = trial.suggest_categorical('final_alpha', [0.1, 0.5, 1.0])

    # Stack the four seeded base learners under a Ridge meta-learner.
    base_learners = [
        ('rfr', RandomForestRegressor(**rfr_params, random_state=42)),
        ('xgb', XGBRegressor(**xgb_params, random_state=42)),
        ('lgbm', LGBMRegressor(**lgbm_params, random_state=42)),
        ('cat', CatBoostRegressor(**cat_params, verbose=0, random_state=42)),
    ]
    stacked = StackingRegressor(
        estimators=base_learners,
        final_estimator=Ridge(alpha=ridge_alpha),
        n_jobs=-1,
    )

    # Full pipeline; `preprocessor` must already be defined in the session.
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', stacked),
    ])

    # Shuffled 3-fold cross-validation with a fixed seed.
    folds = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        candidate, X_train, y_train,
        scoring='neg_root_mean_squared_error', cv=folds, n_jobs=-1,
    )

    return -np.mean(fold_scores)


In [None]:
# Minimise the objective with a seeded TPE sampler; the run stops after
# 500 trials or 8100 seconds, whichever comes first.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=500, timeout=8100)

print("Best trial:")
print(study.best_trial)

In [None]:
best_params = study.best_trial.params


def _best_for(prefix):
    """Return the best-trial parameters named '<prefix>__<name>', keyed by <name>."""
    tag = f'{prefix}__'
    return {
        name[len(tag):]: value
        for name, value in best_params.items()
        if name.startswith(tag)
    }


# Base learners rebuilt from the best trial, each seeded with 42.
rfr_best = RandomForestRegressor(**_best_for('rfr'), random_state=42)
xgb_best = XGBRegressor(**_best_for('xgb'), random_state=42)
lgbm_best = LGBMRegressor(**_best_for('lgbm'), random_state=42)
cat_best = CatBoostRegressor(**_best_for('cat'), verbose=0, random_state=42)

# Stack them under a Ridge meta-learner using the best alpha.
best_learners = [
    ('rfr', rfr_best),
    ('xgb', xgb_best),
    ('lgbm', lgbm_best),
    ('cat', cat_best),
]
stack_best = StackingRegressor(
    estimators=best_learners,
    final_estimator=Ridge(alpha=best_params['final_alpha']),
)

# Final pipeline: shared preprocessor followed by the stack.
final_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', stack_best),
])

# Fit on the training split; fit() returns the pipeline itself.
final_pipe_fit = final_pipe.fit(X_train, y_train)

In [None]:
# expm1 undoes the log1p applied to y_train, putting predictions on y_test's scale.
tune_pred_final_pipe = np.expm1(final_pipe.predict(X_test))

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, tune_pred_final_pipe))
print("Test RMSE:", rmse)

In [None]:
'''TODO: Silahkan simpan model yang kamu miliki'''
import pickle
# Save the best model with pickle.
# The pickled pipeline predicts log1p(price); callers must apply expm1 to its output.
pklname = "best_regression.pkl"

with open(pklname, 'wb') as file:
    pickle.dump(final_pipe_fit, file)

# Download the pickle through the Colab `files` helper.
files.download(pklname)

In [None]:
# Inspect the distinct values of each column in the final frame.
df.head()

In [None]:
# Relies on the builtin `sorted`; this fails if that name was rebound earlier in the session.
sorted(df['Year'].unique())

In [None]:
df['Brand'].unique()

In [None]:
df['Model'].unique()

In [None]:
# Distinct mileage values, ascending.
sorted(df['Mileage(km)'].unique())

In [None]:
# Distinct mileage values, descending.
sorted(df['Mileage(km)'].unique(), reverse=True)

In [None]:
df['Highlight'].unique()

In [None]:
df['Location'].unique()