In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost
import re
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone

In [2]:
col_names = []
with open('../data/Faults27x7_var','r') as f:
    for line in f:
        col_names.append(line.strip())

In [3]:
train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
org_data = pd.read_csv('../data/Faults.NNA', delimiter='\s', engine='python', names=col_names)

In [4]:
train_org.columns[-7:]

Index(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'],
      dtype='object')

In [5]:
# logistic regression
X = train_org.drop(['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'], axis=1)
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scratch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

In [6]:
ys = [pastry, z_scratch, k_scratch, stains, dirtiness, bumps, other_faults]

In [7]:
class KMeansTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        
    def fit(self,X, y=None):
        self.kmeans.fit(X)
        return self
        
    def transform(self,X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]

# hpt-lightboost

In [8]:
def objective_lgb_pastry(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,pastry, scoring='roc_auc', cv= cvo))
    return score

In [9]:
study_lgb_pastry = optuna.create_study(direction='maximize')

[I 2024-03-02 22:57:00,269] A new study created in memory with name: no-name-83be5e04-f5e0-4fa4-94fc-c01f400822c5


In [10]:
study_lgb_pastry.optimize(objective_lgb_pastry, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 22:57:04,292] Trial 7 finished with value: 0.8677375179883757 and parameters: {'n_estimators': 131, 'max_depth': 2, 'num_leaves': 65, 'learning_rate': 0.18992983655884602, 'min_child_samples': 78, 'min_child_weight': 8.375115816685609, 'subsample': 0.36928838365991523, 'colsample_bylevel': 0.638166020021052, 'reg_alpha': 0.02791921462835604, 'reg_lambda': 0.09081629435333059}. Best is trial 7 with value: 0.8677375179883757.
[I 2024-03-02 22:57:05,716] Trial 5 finished with value: 0.8595925639015298 and parameters: {'n_estimators': 167, 'max_depth': 4, 'num_leaves': 9, 'learning_rate': 0.2513803862399797, 'min_child_samples': 263, 'min_child_weight': 8.583972867454758, 'subsample': 0.6000989126606076, 'colsample_bylevel': 0.4269514201510317, 'reg_alpha': 0.06212133240416262, 'reg_lambda': 0.05895987108520275}. Best is trial 7 with value: 0.8677375179883757.
[I 2024-03-02 22:57:09,610] Trial 2 finished with value: 0.859420597003125 and parameters: {'n_estimators': 339, 'max

In [None]:
study_lgb_pastry.best_params

{'n_estimators': 425,
 'max_depth': 4,
 'num_leaves': 69,
 'learning_rate': 0.028816633040589602,
 'min_child_samples': 323,
 'min_child_weight': 7.176040100770706,
 'subsample': 0.7955545344583981,
 'colsample_bylevel': 0.33715604989321935,
 'reg_alpha': 0.03990243199123561,
 'reg_lambda': 0.03653401534415128}

In [None]:
pastry_params = study_lgb_pastry.best_params

pastry_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**pastry_params))
    ]
)
pastry_m.fit(X,pastry)
pastry_pred = pastry_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_z_scratch(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,z_scratch, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_z_scratch = optuna.create_study(direction='maximize')

[I 2024-03-02 07:01:59,380] A new study created in memory with name: no-name-81ed7621-c831-4a29-bfd2-49f4d391af64


In [None]:
study_lgb_z_scratch.optimize(objective_lgb_z_scratch, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 07:02:09,802] Trial 1 finished with value: 0.9584551847010173 and parameters: {'n_estimators': 184, 'max_depth': 6, 'num_leaves': 47, 'learning_rate': 0.08394921755985905, 'min_child_samples': 300, 'min_child_weight': 7.32643322066261, 'subsample': 0.5875227968149113, 'colsample_bylevel': 0.5024628928169316, 'reg_alpha': 0.04763119481686654, 'reg_lambda': 0.05919765412867862}. Best is trial 1 with value: 0.9584551847010173.
[I 2024-03-02 07:02:10,711] Trial 11 finished with value: 0.951104282619594 and parameters: {'n_estimators': 134, 'max_depth': 29, 'num_leaves': 99, 'learning_rate': 0.1710484752466628, 'min_child_samples': 269, 'min_child_weight': 9.218867667556239, 'subsample': 0.45838719900634584, 'colsample_bylevel': 0.6969927662800506, 'reg_alpha': 0.042146098448882695, 'reg_lambda': 0.05759229882383174}. Best is trial 1 with value: 0.9584551847010173.
[I 2024-03-02 07:02:12,895] Trial 7 finished with value: 0.9487788422971238 and parameters: {'n_estimators': 289,

In [None]:
study_lgb_z_scratch.best_params

{'n_estimators': 344,
 'max_depth': 42,
 'num_leaves': 8,
 'learning_rate': 0.029593843586703018,
 'min_child_samples': 256,
 'min_child_weight': 3.5430180970124887,
 'subsample': 0.5033661208760476,
 'colsample_bylevel': 0.4256463353505471,
 'reg_alpha': 0.09646857371589859,
 'reg_lambda': 0.09634067185683329}

In [None]:
z_scratch_params = study_lgb_z_scratch.best_params

z_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**z_scratch_params))
    ]
)
z_scratch_m.fit(X,z_scratch)
z_scratch_pred = z_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_k_scratch(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,k_scratch, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_k_scratch = optuna.create_study(direction='maximize')

[I 2024-03-02 07:22:13,172] A new study created in memory with name: no-name-7d2cb34c-e8e4-44b2-b8e8-9aa97d7e63c3


In [None]:
study_lgb_k_scratch.optimize(objective_lgb_k_scratch, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 07:22:20,240] Trial 6 finished with value: 0.9853876928154474 and parameters: {'n_estimators': 463, 'max_depth': 42, 'num_leaves': 3, 'learning_rate': 0.063588640968433, 'min_child_samples': 384, 'min_child_weight': 5.121204989711714, 'subsample': 0.692591980275645, 'colsample_bylevel': 0.6114155168329114, 'reg_alpha': 0.07273980457542496, 'reg_lambda': 0.06291858609391068}. Best is trial 6 with value: 0.9853876928154474.
[I 2024-03-02 07:22:22,420] Trial 8 finished with value: 0.9857020223912387 and parameters: {'n_estimators': 129, 'max_depth': 17, 'num_leaves': 84, 'learning_rate': 0.04240010631462436, 'min_child_samples': 355, 'min_child_weight': 0.822229619613438, 'subsample': 0.37239397449976547, 'colsample_bylevel': 0.6937572100803213, 'reg_alpha': 0.06510501106064878, 'reg_lambda': 0.04960832910958121}. Best is trial 8 with value: 0.9857020223912387.
[I 2024-03-02 07:22:23,602] Trial 2 finished with value: 0.9858838938656085 and parameters: {'n_estimators': 199, '

In [None]:
study_lgb_k_scratch.best_params

{'n_estimators': 271,
 'max_depth': 29,
 'num_leaves': 118,
 'learning_rate': 0.02560845874325205,
 'min_child_samples': 449,
 'min_child_weight': 1.802873155395419,
 'subsample': 0.7902749625886126,
 'colsample_bylevel': 0.3433292920431201,
 'reg_alpha': 0.047379430530416024,
 'reg_lambda': 0.010007517965380167}

In [None]:
k_scratch_params = study_lgb_k_scratch.best_params

k_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**k_scratch_params))
    ]
)
k_scratch_m.fit(X,k_scratch)
k_scratch_pred = k_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_stains(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,stains, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_stains = optuna.create_study(direction='maximize')

[I 2024-03-02 07:42:42,938] A new study created in memory with name: no-name-82481f58-5b0e-4ae5-bb70-e194abd12edc


In [None]:
study_lgb_stains.optimize(objective_lgb_stains, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 07:42:49,054] Trial 10 finished with value: 0.9923650177040934 and parameters: {'n_estimators': 211, 'max_depth': 41, 'num_leaves': 3, 'learning_rate': 0.11457827592735238, 'min_child_samples': 400, 'min_child_weight': 2.6788384107798637, 'subsample': 0.6271934580462543, 'colsample_bylevel': 0.4644740073328141, 'reg_alpha': 0.04585039093178693, 'reg_lambda': 0.022013796575868518}. Best is trial 10 with value: 0.9923650177040934.
[I 2024-03-02 07:42:49,490] Trial 4 finished with value: 0.9922440936149602 and parameters: {'n_estimators': 255, 'max_depth': 2, 'num_leaves': 38, 'learning_rate': 0.09796479426488525, 'min_child_samples': 380, 'min_child_weight': 3.2328235829724727, 'subsample': 0.37628682881016423, 'colsample_bylevel': 0.4206939731470839, 'reg_alpha': 0.06583148967140875, 'reg_lambda': 0.028603653612165904}. Best is trial 10 with value: 0.9923650177040934.
[I 2024-03-02 07:42:53,504] Trial 1 finished with value: 0.992226814525287 and parameters: {'n_estimators'

In [None]:
study_lgb_stains.best_params

{'n_estimators': 185,
 'max_depth': 57,
 'num_leaves': 6,
 'learning_rate': 0.04533891633316206,
 'min_child_samples': 443,
 'min_child_weight': 3.089195797904931,
 'subsample': 0.7284029460970833,
 'colsample_bylevel': 0.4874764997830047,
 'reg_alpha': 0.09101380589410715,
 'reg_lambda': 0.007130834940473996}

In [None]:
stains_params = study_lgb_stains.best_params

stains_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**stains_params))
    ]
)
stains_m.fit(X,stains)
stains_pred = stains_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_dirtiness(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,dirtiness, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_dirtiness = optuna.create_study(direction='maximize')

[I 2024-03-02 08:00:02,001] A new study created in memory with name: no-name-7e8eb49a-9657-4887-b171-cff65c1d32b0


In [None]:
study_lgb_dirtiness.optimize(objective_lgb_dirtiness, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 08:00:11,111] Trial 1 finished with value: 0.883400045737498 and parameters: {'n_estimators': 240, 'max_depth': 7, 'num_leaves': 23, 'learning_rate': 0.05989948067270578, 'min_child_samples': 415, 'min_child_weight': 9.07367363280566, 'subsample': 0.7737625708840743, 'colsample_bylevel': 0.4695098086952362, 'reg_alpha': 0.05152532338430903, 'reg_lambda': 0.019061592631917052}. Best is trial 1 with value: 0.883400045737498.
[I 2024-03-02 08:00:13,652] Trial 4 finished with value: 0.8668573984173807 and parameters: {'n_estimators': 134, 'max_depth': 11, 'num_leaves': 41, 'learning_rate': 0.2229318813948199, 'min_child_samples': 206, 'min_child_weight': 0.20228721268283958, 'subsample': 0.7863273330153842, 'colsample_bylevel': 0.532098254126284, 'reg_alpha': 0.04729900038669147, 'reg_lambda': 0.08364478687153196}. Best is trial 1 with value: 0.883400045737498.
[I 2024-03-02 08:00:14,485] Trial 3 finished with value: 0.8804835669916742 and parameters: {'n_estimators': 212, 'm

In [None]:
study_lgb_dirtiness.best_params

{'n_estimators': 100,
 'max_depth': 13,
 'num_leaves': 62,
 'learning_rate': 0.04095550709994935,
 'min_child_samples': 331,
 'min_child_weight': 0.1953843881250878,
 'subsample': 0.4674944721594767,
 'colsample_bylevel': 0.33072889947558426,
 'reg_alpha': 0.07067954909592045,
 'reg_lambda': 0.0017098736746246962}

In [None]:
dirtiness_params = study_lgb_dirtiness.best_params

dirtiness_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**dirtiness_params))
    ]
)
dirtiness_m.fit(X,dirtiness)
dirtiness_pred = dirtiness_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_bumps(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,bumps, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_bumps = optuna.create_study(direction='maximize')

[I 2024-03-02 08:17:07,526] A new study created in memory with name: no-name-6258cb62-5abf-4d1d-a249-4d07e2e59279


In [None]:
study_lgb_bumps.optimize(objective_lgb_bumps, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 08:17:17,087] Trial 8 finished with value: 0.7953129245466546 and parameters: {'n_estimators': 261, 'max_depth': 5, 'num_leaves': 22, 'learning_rate': 0.16508288813398286, 'min_child_samples': 139, 'min_child_weight': 7.159592477745972, 'subsample': 0.700023244524113, 'colsample_bylevel': 0.6337038154884842, 'reg_alpha': 0.08903567364239039, 'reg_lambda': 0.09962073056791823}. Best is trial 8 with value: 0.7953129245466546.
[I 2024-03-02 08:17:18,484] Trial 7 finished with value: 0.7833357301817696 and parameters: {'n_estimators': 338, 'max_depth': 4, 'num_leaves': 53, 'learning_rate': 0.2419049563340446, 'min_child_samples': 24, 'min_child_weight': 1.8785788102290328, 'subsample': 0.5400445852965277, 'colsample_bylevel': 0.6486808406774336, 'reg_alpha': 0.07375832582815689, 'reg_lambda': 0.08213139215868064}. Best is trial 8 with value: 0.7953129245466546.
[I 2024-03-02 08:17:19,954] Trial 2 finished with value: 0.8104412980610853 and parameters: {'n_estimators': 436, 'm

In [None]:
study_lgb_bumps.best_params

{'n_estimators': 450,
 'max_depth': 8,
 'num_leaves': 44,
 'learning_rate': 0.012849208203633231,
 'min_child_samples': 322,
 'min_child_weight': 1.2745300791059089,
 'subsample': 0.5869159792528273,
 'colsample_bylevel': 0.46785554903946636,
 'reg_alpha': 0.05444067251372892,
 'reg_lambda': 0.04278704991242975}

In [None]:
bumps_params = study_lgb_bumps.best_params

bumps_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**bumps_params))
    ]
)
bumps_m.fit(X,bumps)
bumps_pred = bumps_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
def objective_lgb_other_faults(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    model = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,other_faults, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_lgb_other_faults = optuna.create_study(direction='maximize')

[I 2024-03-02 08:44:59,923] A new study created in memory with name: no-name-e0d7ba40-f8e6-4d49-987a-a1252a97dbe0


In [None]:
study_lgb_other_faults.optimize(objective_lgb_other_faults, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 08:45:04,619] Trial 4 finished with value: 0.6988030904207261 and parameters: {'n_estimators': 268, 'max_depth': 51, 'num_leaves': 2, 'learning_rate': 0.07805702550725556, 'min_child_samples': 290, 'min_child_weight': 9.308488739363934, 'subsample': 0.7325589582609147, 'colsample_bylevel': 0.5112132743061855, 'reg_alpha': 0.0871740849017208, 'reg_lambda': 0.08203173364837735}. Best is trial 4 with value: 0.6988030904207261.
[I 2024-03-02 08:45:08,271] Trial 8 finished with value: 0.699209818675339 and parameters: {'n_estimators': 105, 'max_depth': 47, 'num_leaves': 48, 'learning_rate': 0.13032899204807424, 'min_child_samples': 431, 'min_child_weight': 5.072221442139856, 'subsample': 0.547739237545897, 'colsample_bylevel': 0.49911623941872507, 'reg_alpha': 0.012183696863368218, 'reg_lambda': 0.06851228870471616}. Best is trial 8 with value: 0.699209818675339.
[I 2024-03-02 08:45:08,438] Trial 7 finished with value: 0.7061166397383147 and parameters: {'n_estimators': 424, '

In [None]:
study_lgb_other_faults.best_params

{'n_estimators': 254,
 'max_depth': 7,
 'num_leaves': 18,
 'learning_rate': 0.020543880031226605,
 'min_child_samples': 378,
 'min_child_weight': 8.98045597340312,
 'subsample': 0.6398999645373245,
 'colsample_bylevel': 0.6110641328208201,
 'reg_alpha': 0.09445215428123344,
 'reg_lambda': 0.010077108649068555}

In [None]:
other_faults_params = study_lgb_other_faults.best_params

other_faults_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**other_faults_params))
    ]
)
other_faults_m.fit(X,other_faults)
other_faults_pred = other_faults_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [None]:
sub = pd.DataFrame({'id':test_org['id'].copy(), 'Pastry':pastry_pred, 'Z_Scratch':z_scratch_pred, 'K_Scatch':k_scratch_pred,
                    'Stains':stains_pred, 'Dirtiness':dirtiness_pred, 'Bumps':bumps_pred, 'Other_Faults':other_faults_pred})

In [None]:
sub.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.601224,0.001581,0.002984,0.000101,0.011791,0.159638,0.353399
1,19220,0.256636,0.011489,0.002667,0.000106,0.165159,0.150879,0.330532
2,19221,0.001477,0.04208,0.037689,0.000417,0.002944,0.308827,0.508441
3,19222,0.126037,0.002178,0.000375,0.000812,0.013434,0.376329,0.425821
4,19223,0.001514,0.002423,0.000338,0.004942,0.006323,0.565537,0.389942


In [None]:
(0.8712560368435296 + 0.9612675058978631 + 0.9860986294660549 + 0.9929646335341336 + 0.8967476825823397 + 0.8119434051286692 + 0.709327117443453)/7

0.889943572985149

In [None]:
sub.to_csv('../submissions/m2_1.csv', index=False)

# hpt-lgb-stack

In [None]:
def objective_stack_lgb_pastry(trial):
    import warnings

    warnings.filterwarnings('ignore')
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('AdaBoost', AdaBoostClassifier())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,pastry, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_stack_lgb_pastry = optuna.create_study(direction='maximize')

[I 2024-03-02 12:25:29,057] A new study created in memory with name: no-name-db0db545-06ab-45c4-b4b8-24113154272f


In [None]:
study_stack_lgb_pastry.optimize(objective_stack_lgb_pastry, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 12:26:48,267] Trial 11 finished with value: 0.8646811432195131 and parameters: {'n_estimators': 291, 'max_depth': 3, 'num_leaves': 98, 'learning_rate': 0.015286565189254229, 'min_child_samples': 12, 'min_child_weight': 0.519171958999822, 'subsample': 0.8110991297271324, 'colsample_bylevel': 0.6744190087715001, 'reg_alpha': 0.05429740233161844, 'reg_lambda': 0.06671875020496656}. Best is trial 11 with value: 0.8646811432195131.
[I 2024-03-02 12:26:48,837] Trial 6 finished with value: 0.8584288361252819 and parameters: {'n_estimators': 195, 'max_depth': 30, 'num_leaves': 18, 'learning_rate': 0.2088771538372269, 'min_child_samples': 458, 'min_child_weight': 7.108101229803168, 'subsample': 0.5899478328782217, 'colsample_bylevel': 0.5349230016077154, 'reg_alpha': 0.08761035182366973, 'reg_lambda': 0.04602696827238188}. Best is trial 11 with value: 0.8646811432195131.
[I 2024-03-02 12:26:51,466] Trial 7 finished with value: 0.8626607770298783 and parameters: {'n_estimators': 33

In [None]:
study_stack_lgb_pastry.best_params

{'n_estimators': 389,
 'max_depth': 52,
 'num_leaves': 12,
 'learning_rate': 0.02405898591296412,
 'min_child_samples': 196,
 'min_child_weight': 2.059402030023506,
 'subsample': 0.5093287439289258,
 'colsample_bylevel': 0.3449906483764486,
 'reg_alpha': 0.09111218893023963,
 'reg_lambda': 0.05522357335577206}

In [None]:
def objective_stack_lgb_z_scratch(trial):
    import warnings

    warnings.filterwarnings('ignore')
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('AdaBoost', AdaBoostClassifier())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,z_scratch, scoring='roc_auc', cv= cvo))
    return score

In [None]:
study_stack_lgb_z_scratch = optuna.create_study(direction='maximize')

[I 2024-03-02 14:46:50,559] A new study created in memory with name: no-name-d41c7c97-80c6-4b4b-8f18-7664891635ad


In [None]:
study_stack_lgb_z_scratch.optimize(objective_stack_lgb_z_scratch, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-02 17:41:15,221] Trial 1005 finished with value: 0.9588091422987363 and parameters: {'n_estimators': 420, 'max_depth': 2, 'num_leaves': 57, 'learning_rate': 0.03133226318978335, 'min_child_samples': 225, 'min_child_weight': 3.0645373001227636, 'subsample': 0.7829961440197197, 'colsample_bylevel': 0.33548282507753224, 'reg_alpha': 0.04571056113434432, 'reg_lambda': 0.0578199159787304}. Best is trial 788 with value: 0.9600648408160495.
[I 2024-03-02 17:41:15,796] Trial 1006 finished with value: 0.9587787150993774 and parameters: {'n_estimators': 439, 'max_depth': 2, 'num_leaves': 57, 'learning_rate': 0.047256569100365756, 'min_child_samples': 232, 'min_child_weight': 2.9595229295530463, 'subsample': 0.7628342552133652, 'colsample_bylevel': 0.3357033740564241, 'reg_alpha': 0.04579592441298038, 'reg_lambda': 0.05744753679956811}. Best is trial 788 with value: 0.9600648408160495.
[I 2024-03-02 17:41:16,621] Trial 1011 finished with value: 0.958270129461267 and parameters: {'n_est

In [None]:
# {'n_estimators': 460, 'max_depth': 4, 'num_leaves': 33, 'learning_rate': 0.017760464056848547, 'min_child_samples': 267, 'min_child_weight': 3.589036321652739, 'subsample': 0.7453402257990246, 'colsample_bylevel': 0.3301659472925667, 'reg_alpha': 0.047521897321316106, 'reg_lambda': 0.05975339807146562}

# little-experiment

In [8]:
scaler = ColumnTransformer(
    transformers=[
        ('standard_scaler', StandardScaler(), X.columns)
    ],
    remainder='passthrough'
)

In [9]:
mod_adaboost = Pipeline(
    steps=[
        ('scaler', scaler),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('adaboost', AdaBoostClassifier(random_state=0))
    ]
)

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(X,pastry, test_size=0.2, stratify=pastry, random_state=0)

In [11]:
xtrain_new = xtrain.copy()

In [12]:
xtrain_new['pred'] = None
# xtrain_new['pred1'] = None

In [13]:
skf = StratifiedKFold()

In [14]:
for train_index, test_index in skf.split(xtrain,ytrain):
    xtr,xte = xtrain.iloc[train_index], xtrain.iloc[test_index]
    ytr,yte = ytrain.iloc[train_index], ytrain.iloc[test_index]
    
    adb = clone(mod_adaboost)
    
    adb.fit(xtr,ytr)
    prd = adb.predict_proba(xte)[:,0]
    
    xtrain_new.iloc[test_index,-1] = prd



In [15]:
xtrain_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15375 entries, 14461 to 16610
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   X_Minimum              15375 non-null  int64  
 1   X_Maximum              15375 non-null  int64  
 2   Y_Minimum              15375 non-null  int64  
 3   Y_Maximum              15375 non-null  int64  
 4   Pixels_Areas           15375 non-null  int64  
 5   X_Perimeter            15375 non-null  int64  
 6   Y_Perimeter            15375 non-null  int64  
 7   Sum_of_Luminosity      15375 non-null  int64  
 8   Minimum_of_Luminosity  15375 non-null  int64  
 9   Maximum_of_Luminosity  15375 non-null  int64  
 10  Length_of_Conveyer     15375 non-null  int64  
 11  TypeOfSteel_A300       15375 non-null  int64  
 12  TypeOfSteel_A400       15375 non-null  int64  
 13  Steel_Plate_Thickness  15375 non-null  int64  
 14  Edges_Index            15375 non-null  float64
 15  Emp

In [16]:
mod_adaboost.fit(xtrain,ytrain)



In [17]:
# adaboost_xtest_pred0 = mod_adaboost.predict_proba(xtest)[:,0]
adaboost_xtest_pred = mod_adaboost.predict_proba(xtest)[:,0]

In [18]:
# xtest_new = pd.concat([xtest,pd.Series(adaboost_xtest_pred0, name='pred0', index=xtest.index)], axis=1)
xtest_new = pd.concat([xtest,pd.Series(adaboost_xtest_pred, name='pred', index=xtest.index)], axis=1)

In [19]:
lgbc = lgb.LGBMClassifier(random_state=0, objective='binary', verbose = -1)

In [20]:
mod_lgbc = Pipeline(
    steps=[
        ('scaler', scaler),
        ('kmeans', KMeansTransformer(5)),
        ('lgb', lgbc)
    ]
)

In [21]:
mod_lgbc.fit(xtrain_new,ytrain)

In [22]:
roc_auc_score(ytest, mod_lgbc.predict_proba(xtest_new)[:,1])

0.8751541410726007

In [23]:
stack_mods = StackingClassifier(
    estimators=[('adaboost', AdaBoostClassifier(random_state=0))],
    final_estimator=lgb.LGBMClassifier(random_state=0, objective='binary', verbose = -1),
    stack_method='predict_proba',
    passthrough=True
)

In [24]:
stack_pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', stack_mods)
    ]
)

In [25]:
stack_pipe.fit(xtrain,ytrain)



In [26]:
roc_auc_score(ytest, stack_pipe.predict_proba(xtest)[:,1])

0.8716796595296427