In [28]:
import os
from collections import Counter

import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from rich.console import Console
from rich.table import Table
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from xgboost import XGBClassifier
from xgboost import XGBRegressor

In [None]:
# Project root used as the working directory for every relative path below.
# Set the SML_PROJECT_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
PROJECT_DIR = os.environ.get(
    'SML_PROJECT_DIR',
    r'C:\SML_Projects\SML_airplane_price_project',
)
os.chdir(PROJECT_DIR)

# Output folder for the exported result tables; no error if it already exists.
os.makedirs('results', exist_ok=True)

In [30]:
# Load the preprocessed dataset; the path is relative to the current working directory.
df = pd.read_csv("data/preprocessed/preprocessed_dataset.csv")

In [31]:
# Print column names, non-null counts and dtypes as a quick sanity check of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12377 entries, 0 to 12376
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Model                        12377 non-null  float64
 1   Year_of_Manufacture          12377 non-null  float64
 2   Number_of_Engines            12377 non-null  float64
 3   Engine_Type                  12377 non-null  float64
 4   Capacity                     12377 non-null  float64
 5   Range_(km)                   12377 non-null  int64  
 6   Fuel_Consumption_(L/hour)    12377 non-null  float64
 7   Hourly_Maintenance_Cost_($)  12377 non-null  float64
 8   Age                          12377 non-null  float64
 9   Sales_Region                 12377 non-null  float64
 10  Price_($)                    12377 non-null  float64
 11  Company                      12377 non-null  float64
 12  Age_Group                    12377 non-null  float64
 13  HMC_per_person  

In [32]:
# 'Range_(km)' is the prediction target; every other column is used as a feature.
target_column = 'Range_(km)'
y = df[target_column]
x = df.drop(columns=[target_column])

In [33]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [34]:
# 3-fold shuffled cross-validation splitter with a fixed seed; reused for the
# hyper-parameter search and the cross-validation scores further down.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

In [35]:
# Count how often each target value occurs in the training split.
# NOTE(review): the recorded output shows only six distinct Range_(km) values,
# yet the models below are regressors scored with R2/MAE - confirm that a
# regression framing is intended for this target.
print("Original train class distribution:", Counter(y_train))

Original train class distribution: Counter({5700: 1710, 14800: 1678, 6300: 1646, 3000: 1632, 15600: 1621, 1285: 1614})


In [36]:
# Seed shared by every stochastic estimator so that reported scores are
# reproducible; it matches the seed used for the train/test split, the KFold
# splitter and the Bayesian search.
RANDOM_STATE = 42

# Base learners used by both the Voting and the Stacking ensembles below.
# The 'lr' / 'rf' / 'et' labels are the prefixes used for the nested
# hyper-parameter names in `search_spaces`.
base_estimators = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('et', ExtraTreesRegressor(random_state=RANDOM_STATE))
]

# Candidate regressors keyed by display name. The keys must match the keys of
# `search_spaces`, which is indexed by the same names during tuning.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesRegressor(random_state=RANDOM_STATE),
    'Hist Gradient Boosting': HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(),
    'XGBoost': XGBRegressor(random_state=RANDOM_STATE),
    'LGBMRegressor': LGBMRegressor(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostRegressor(random_state=RANDOM_STATE),
    'Bagging': BaggingRegressor(random_state=RANDOM_STATE),
    'Voting': VotingRegressor(
        estimators=base_estimators
    ),
    'Stacking': StackingRegressor(
        estimators=base_estimators,
        final_estimator=Ridge()
    ),
    'Bagged KNN': BaggingRegressor(
        estimator=KNeighborsRegressor(),
        n_estimators=10,
        random_state=RANDOM_STATE
    ),
    'Bagged DT': BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=10,
        random_state=RANDOM_STATE
    ),
}

In [37]:
def _log_real(low, high):
    """Return a continuous dimension sampled log-uniformly between low and high."""
    return Real(low, high, prior='log-uniform')


def _on_off():
    """Return a fresh True/False categorical dimension."""
    return Categorical([True, False])


def _boosted_tree_space():
    """Space used as-is by both 'Gradient Boosting' and 'XGBoost'."""
    return {
        'n_estimators': Integer(50, 200),
        'learning_rate': _log_real(0.01, 0.2),
        'max_depth': Integer(2, 10),
    }


def _bagging_space():
    """Space used as-is by both 'Bagging' and 'Bagged DT'."""
    return {
        'n_estimators': Integer(10, 100),
        'max_samples': Real(0.3, 1.0),
        'max_features': Real(0.5, 1.0),
    }


def _ensemble_base_space():
    """Nested space for the lr / rf / et base learners of Voting and Stacking."""
    return {
        # Linear Regression
        'lr__fit_intercept': _on_off(),
        'lr__positive': _on_off(),

        # Random Forest
        'rf__n_estimators': Integer(50, 300),
        'rf__max_depth': Integer(2, 20),
        'rf__max_features': Categorical(['sqrt', 'log2', None]),

        # Extra Trees
        'et__n_estimators': Integer(50, 300),
        'et__max_depth': Integer(2, 20),
        'et__min_samples_split': Integer(2, 20),
    }


# Hyper-parameter search space per model, keyed by the same display names as
# the `models` dict. Each helper call builds new dimension objects, so no
# dimension instance is shared between two models.
search_spaces = {
    'Linear Regression': {
        'fit_intercept': _on_off(),
        'positive': _on_off(),
    },
    'Lasso': {
        'alpha': _log_real(0.0001, 1.0),
    },
    'Ridge': {
        'alpha': _log_real(0.1, 10.0),
    },
    'ElasticNet': {
        'alpha': _log_real(0.0001, 1.0),
        'l1_ratio': Real(0.0, 1.0),
    },
    'Decision Tree': {
        'max_depth': Integer(2, 20),
        'min_samples_split': Integer(2, 20),
    },
    'Extra Trees': {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(2, 20),
        'min_samples_split': Integer(2, 20),
    },
    'Random Forest': {
        'n_estimators': Integer(100, 500),
        'max_depth': Integer(3, 30),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'max_features': Categorical(['sqrt', 'log2', None]),
    },
    'Bagging': _bagging_space(),
    'Gradient Boosting': _boosted_tree_space(),
    'Hist Gradient Boosting': {
        'learning_rate': _log_real(0.01, 0.2),
        'max_depth': Integer(2, 10),
        'max_leaf_nodes': Integer(15, 255),
    },
    'AdaBoost': {
        'n_estimators': Integer(50, 200),
        'learning_rate': _log_real(0.01, 0.5),
    },
    'KNN': {
        'n_neighbors': Integer(3, 30),
        'weights': Categorical(['uniform', 'distance']),
    },
    'SVR': {
        'C': _log_real(0.1, 10),
        'epsilon': _log_real(0.001, 1.0),
        'kernel': Categorical(['rbf', 'linear', 'poly']),
    },
    'XGBoost': _boosted_tree_space(),
    'LGBMRegressor': {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(-1, 20),
        'learning_rate': _log_real(0.01, 0.2),
    },
    'Voting': _ensemble_base_space(),
    'Stacking': {
        # Base models
        **_ensemble_base_space(),

        # Final Estimator (Ridge)
        'final_estimator__alpha': _log_real(0.1, 10.0),
    },
    'Bagged KNN': {
        'n_estimators': Integer(10, 100),
        'max_samples': Real(0.3, 1.0),
    },
    'Bagged DT': _bagging_space(),
}

In [38]:
# Tune every model with Bayesian optimisation and collect one row per model:
# [name, hold-out R2, hold-out MAE, CV mean R2, CV std R2].
results = []

# Fail fast: a model without a search space would otherwise only surface as a
# KeyError after all the models before it have already been tuned.
missing_spaces = [name for name in models if name not in search_spaces]
if missing_spaces:
    raise KeyError(f"No search space defined for: {missing_spaces}")

for name, model in models.items():
    print(f"Training {name}...")

    search_space = search_spaces[name]

    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=search_space,
        n_iter=20,
        scoring='r2',
        cv=kf,
        n_jobs=-1,
        random_state=42,
        verbose=0
    )

    bayes_search.fit(x_train, y_train)

    # Winning configuration, refitted on the whole training split by the search.
    best_model = bayes_search.best_estimator_
    y_pred = best_model.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # The search already scored the winning configuration with R2 on the same
    # `kf` folds, so read its fold mean/std from cv_results_ instead of
    # refitting the model once per fold with cross_val_score.
    best_idx = bayes_search.best_index_
    cv_mean = float(bayes_search.cv_results_['mean_test_score'][best_idx])
    cv_std = float(bayes_search.cv_results_['std_test_score'][best_idx])

    results.append([name, r2, mae, cv_mean, cv_std])

Training Linear Regression...




Training Lasso...


  model = cd_fast.enet_coordinate_descent(


Training Ridge...
Training ElasticNet...


  model = cd_fast.enet_coordinate_descent(


Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training Extra Trees...
Training Hist Gradient Boosting...
Training SVR...
Training KNN...




Training XGBoost...
Training LGBMRegressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1667
[LightGBM] [Info] Number of data points in the train set: 9901, number of used features: 19
[LightGBM] [Info] Start training from score 7798.079992
Training AdaBoost...
Training Bagging...
Training Voting...
Training Stacking...
Training Bagged KNN...
Training Bagged DT...


In [39]:
console = Console()

# Rank the result rows by hold-out R2 (position 1 of each row), best first.
results_sorted = sorted(results, key=lambda entry: entry[1], reverse=True)
best_model, worst_model = results_sorted[0], results_sorted[-1]

table = Table(title="Bayesian Optimization", show_lines=True)
for header in ("Algorithm", "R2", "MAE"):
    table.add_column(header)
for header in ("K-Fold mean", "K-Fold std"):
    table.add_column(header, justify="right")

for row in results_sorted:
    algo, r2, mae, kmean, kstd = row

    # Top row is rendered in green, bottom row in red, everything else plain.
    if row == best_model:
        open_tag, close_tag = "[bold green]", "[/bold green]"
    elif row == worst_model:
        open_tag, close_tag = "[bold red]", "[/bold red]"
    else:
        open_tag, close_tag = "", ""

    # Algorithm name as-is, the four metrics rounded to two decimals.
    cells = [algo] + [f"{value:.2f}" for value in (r2, mae, kmean, kstd)]
    table.add_row(*[f"{open_tag}{cell}{close_tag}" for cell in cells])

console.print(table)

In [40]:
# Render the table on a recording console so it can be exported as plain text.
temp_console = Console(record=True)
temp_console.print(table)
text = temp_console.export_text()
# The file is opened in append mode ('a'): each run of this cell adds another
# copy of the table to results/Tuning.txt instead of replacing its content.
with open('results/Tuning.txt', 'a', encoding='utf-8') as f:
    f.write(text)