In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

import optuna
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Training table, indexed by the `id` column.
train = pd.read_csv("data/train.csv", index_col="id")

In [22]:
# Test table, indexed by the `id` column.
test = pd.read_csv("data/test.csv", index_col="id")

In [23]:
# Sample submission file, indexed by the `id` column; its `defects` column is
# overwritten with model predictions further down.
submission = pd.read_csv("data/sample_submission.csv", index_col="id")

In [24]:
# Separate the `defects` target column from the remaining feature columns.
y = train["defects"]
X = train.drop(columns=["defects"])

In [25]:
# Relative frequency of each `defects` class, as a {label: fraction} dict.
# NOTE(review): no later cell visible here reads this variable, and because the
# values are frequencies, passing them as a `class_weight` would weight the more
# common class more heavily -- confirm intent before using it.
class_weight = y.value_counts(normalize=True).to_dict()

In [26]:
skf = StratifiedKFold(n_splits = 4, shuffle=True)

In [27]:
seed = 0

In [28]:
# Base learners for the voting ensemble, as (name, estimator) pairs.
# Every model receives `random_state=seed`.
estimators = [
    # XGBoost with the histogram tree method.
    ('xgb', XGBClassifier(
        objective='binary:logistic',
        tree_method='hist',
        n_estimators=500,
        learning_rate=0.01,
        max_depth=7,
        min_child_weight=10,
        gamma=2,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=seed,
    )),
    # Random forest with per-bootstrap balanced class weights.
    ('rf', RandomForestClassifier(
        n_estimators=1024,
        criterion='entropy',
        max_depth=16,
        min_samples_split=256,
        class_weight='balanced_subsample',
        random_state=seed,
    )),
    # LightGBM with L1/L2 regularisation.
    ('lgbm', LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.01,
        max_depth=7,
        num_leaves=20,
        reg_alpha=3,
        reg_lambda=3,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=seed,
    )),
    # scikit-learn histogram gradient boosting, library defaults.
    ('hist', HistGradientBoostingClassifier(random_state=seed)),
    # CatBoost on CPU, training log silenced.
    ('cat', CatBoostClassifier(
        loss_function='Logloss',
        task_type='CPU',
        iterations=500,
        learning_rate=0.01,
        depth=7,
        l2_leaf_reg=5,
        random_strength=0.5,
        bagging_temperature=0.7,
        border_count=30,
        verbose=False,
        random_state=seed,
    )),
    # Extra-trees and classic gradient boosting, library defaults.
    ('et', ExtraTreesClassifier(random_state=seed)),
    ('gb', GradientBoostingClassifier(random_state=seed)),
]

In [10]:
# Cross-validate every base model with the `skf` splitter and keep each
# model's mean ROC AUC, in estimator order, in `weights`.
weights = []
for name, model in estimators:
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=skf, n_jobs=-1)
    mean_auc, std_auc = scores.mean(), scores.std()
    weights.append(mean_auc)
    print(f'{name} - Roc AUC score: {mean_auc:.4f} ± {std_auc:.4f}')


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  retur

xgb - Roc AUC score: 0.7922 ± 0.0036
rf - Roc AUC score: 0.7911 ± 0.0018
lgbm - Roc AUC score: 0.7920 ± 0.0046
hist - Roc AUC score: 0.7910 ± 0.0025
cat - Roc AUC score: 0.7912 ± 0.0050
et - Roc AUC score: 0.7632 ± 0.0034
gb - Roc AUC score: 0.7913 ± 0.0019


In [None]:
# Soft-voting ensemble over the base learners; each model's predicted
# probabilities are weighted by the matching entry of `weights`.
voting = VotingClassifier(estimators=estimators, voting='soft', weights=weights)

In [None]:
# Fit the voting ensemble on the full training features and target.
voting.fit(X, y)

In [None]:
# Take column 1 of `predict_proba` for every test row and attach it to the
# submission frame by `id` rather than by row position, so that a sample file
# whose ids are ordered differently from test.csv cannot silently receive
# predictions belonging to other rows.
test_proba = pd.Series(voting.predict_proba(test)[:, 1], index=test.index)
submission['defects'] = test_proba.reindex(submission.index).to_numpy()
# Any id present in the sample file but absent from test.csv would surface as NaN.
assert submission['defects'].notna().all(), "submission ids missing from test.csv"

In [None]:
# Write the filled-in submission frame (with its `id` index) to submission.csv
# in the working directory.
submission.to_csv("submission.csv")