In [1]:
import pandas as pd

# Data import

FE = 0 # Feature Engineering (0 = No, 1 = Yes)

FR = 0 # Feature Reduction (0 = No, 1 = Yes)

OR = 0  # Original not included

CAT = 0 # Categorical data (0 = No, 1 = Yes)

def dataset(FE, FR, OR, CAT):
    if CAT == 0:
        if OR == 0:
            if FR == 0:
                if FE == 0:
                    X_train = pd.read_csv('data/X_train.csv')
                    X_test = pd.read_csv('data/X_test.csv') # Test data
                else:
                    X_train = pd.read_csv('data/X_train_FE.csv')
                    X_test = pd.read_csv('data/X_test_FE.csv') # Test data
            else:
                X_train = pd.read_csv('data/X_train_FR.csv')
                X_test = pd.read_csv('data/X_test_FR.csv') # Test data
        else:
            if FE == 0:
                X_train = pd.read_csv('data/X_total.csv')
                X_test = pd.read_csv('data/X_test.csv') # Test data
            else:
                X_train = pd.read_csv('data/X_total_FE.csv')
                X_test = pd.read_csv('data/X_test_FE.csv')
    else:
        X_train = pd.read_csv('data/X_train_CAT.csv')
        X_test = pd.read_csv('data/X_test_CAT.csv') # Test data
    
    return X_train, X_test


In [2]:
import lightgbm as lgb
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score

def ROC_AUC(model, X_train, y_train, n_splits=3, n_repeats=3):
    
    # Initialize the Repeated Stratified K Fold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Create an empty list to store the Out-of-Fold (OOF) predictions
    oof_preds = np.zeros(X_train.shape[0])

    # Loop through each split
    for train_index, valid_index in rskf.split(X_train, y_train):
    
        # Split the data
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    
        # Fit the model
        model.fit(X_train_fold, y_train_fold)
    
        # Make predictions and add them to the OOF predictions
        oof_preds[valid_index] = model.predict_proba(X_valid_fold)[:,1]

    # Calculate the overall AUC
    auc = roc_auc_score(y_train, oof_preds)

    return auc

# Training datasets performance

In [3]:
Best_LGBM = {'colsample_bytree': 0.1923023355270077,
         'learning_rate': 0.03574583615481279,
         'max_depth': 16, 
         'min_child_samples': 89, 
         'n_estimators': 260, 
         'num_leaves': 68, 
         'reg_alpha': 0.30028296727692755, 
         'reg_lambda': 0.6125642241926401, 
         'subsample': 0.7293703101825368
         }

In [4]:
Best_XGB =  {'max_depth': 13, 
         'learning_rate': 0.012308520402322306, 
         'colsample_bytree': 0.15564433647290904, 
         'subsample': 0.9392376085401448, 
         'n_estimators': 494, 
         'min_child_weight': 1, 
         'reg_alpha': 0.26760253520809857, 
         'reg_lambda': 0.24616802866656362} # best value: 0.9743947392021206.

In [5]:
Best_Cat_Opt_AllCat = {'iterations': 249, 
                       'depth': 6, 
                       'learning_rate': 0.1555748471212781, 
                       'random_strength': 58, 
                       'bagging_temperature': 87.47376677399185, 
                       'od_type': 'IncToDec', 
                       'od_wait': 27}

In [6]:
Best =  {'colsample_bytree': 0.19297355677628159, 
          'learning_rate': 0.020755882048032786, 
          'max_depth': 9, 
          'min_child_samples': 41, 
          'n_estimators': 320, 
          'num_leaves': 100, 
          'reg_alpha': 0.39149507035237485, 
          'reg_lambda': 0.43245778149146746, 
          'subsample': 0.5188437264047947} # best loss: -0.973283159101125

In [7]:
Train_sets = {'Normal dataset': [0, 0, 0, 0], 'Dataset With FE': [1, 0, 0, 0], 'Dataset With FE and FR': [1, 1, 0, 0], 'Dataset Ori + train no FE': [0, 0, 1, 0], 'Dataset Ori + train with FE': [1, 0, 1, 0]}

In [9]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import lightgbm as lgb

# Define the model with the best parameters
model_LGBM = lgb.LGBMClassifier(**Best)

Scores = {}

for name, model in Train_sets.items():

    X_train, X_test  = dataset(model[0], model[1], model[2], model[3])
    y_train = X_train['Machine_failure']
    X_train = X_train.drop(['Machine_failure'], axis=1)
    print(X_train.shape)

    # Initialize the Repeated Stratified K Fold
    rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=42)

    # Create an empty list to store the Out-of-Fold (OOF) predictions
    oof_preds = np.zeros(X_train.shape[0])

    # Loop through each split
    for train_index, valid_index in rskf.split(X_train, y_train):
        
        # Split the data
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
        
        # Fit the model
        model_LGBM.fit(X_train_fold, y_train_fold)
        
        # Make predictions and add them to the OOF predictions
        oof_preds[valid_index] = model_LGBM.predict_proba(X_valid_fold)[:,1]

    # Calculate the overall AUC
    auc = roc_auc_score(y_train, oof_preds)

    print("Model: ", name, "   AUC: ", auc)


(136429, 14)
Model:  Normal dataset    AUC:  0.9706861467455258
(136429, 22)
Model:  Dataset With FE    AUC:  0.9672851967906262
(136429, 19)
Model:  Dataset With FE and FR    AUC:  0.9653824357485317
(146429, 14)
Model:  Dataset Ori + train no FE    AUC:  0.9752747857937709
(146429, 22)
Model:  Dataset Ori + train with FE    AUC:  0.9726090433142951


# Models Trainning

In [10]:
X_train, X_test  = dataset(0, 0, 1, 0)
y_train = X_train['Machine_failure']
X_train = X_train.drop(['Machine_failure'], axis=1)

# LightGBM model training

### LightGBM hyper-parameter tunning

In [21]:
# Libraries
import lightgbm as lgb
# hp 
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

space ={
    'max_depth': hp.quniform('max_depth', 6, 18, 1),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    'n_estimators': hp.quniform('n_estimators', 100, 400, 10),
    'min_child_samples': hp.quniform('min_child_samples', 20, 100, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.15, 0.5),
    'reg_lambda': hp.uniform('reg_lambda', 0.3, 1.0)
}

def objective(space):
    model_LGBM = lgb.LGBMClassifier(max_depth = int(space['max_depth']),
                             learning_rate = space['learning_rate'],
                             num_leaves = int(space['num_leaves']),
                             n_estimators = int(space['n_estimators']),
                             colsample_bytree = space['colsample_bytree'],
                             subsample = space['subsample'],
                             is_unbalance = True,
                             min_child_samples = int(space['min_child_samples']),
                             reg_alpha = space['reg_alpha'],
                             reg_lambda = space['reg_lambda'])
    
    cv = StratifiedKFold(n_splits=10)
    y_pred_proba = cross_val_predict(model_LGBM, X_train, y_train, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train, y_pred_proba)
    return {'loss': -auc, 'status': STATUS_OK}

# Run the algorithm
trials = Trials()
best_LGBM = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best_LGBM)



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [13:11<00:00, 15.83s/trial, best loss: -0.973283159101125] 
Best:  {'colsample_bytree': 0.19297355677628159, 'learning_rate': 0.020755882048032786, 'max_depth': 9.0, 'min_child_samples': 41.0, 'n_estimators': 320.0, 'num_leaves': 100.0, 'reg_alpha': 0.39149507035237485, 'reg_lambda': 0.43245778149146746, 'subsample': 0.5188437264047947}


## Fitting

In [11]:
Best_LGBM =  {'colsample_bytree': 0.19297355677628159, 
          'learning_rate': 0.020755882048032786, 
          'max_depth': 9, 
          'min_child_samples': 41, 
          'n_estimators': 320, 
          'num_leaves': 100, 
          'reg_alpha': 0.39149507035237485, 
          'reg_lambda': 0.43245778149146746, 
          'subsample': 0.5188437264047947} # best loss: -0.973283159101125

In [12]:
model_LGBM = lgb.LGBMClassifier(**Best_LGBM)

In [18]:
AUC_LGBM = ROC_AUC(model_LGBM, X_train, y_train)
print("AUC LGBM: ", AUC_LGBM)

AUC LGBM:  0.9752747857937709


In [None]:
model_LGBM.fit(X_train, y_train)

# XGBoost model training

## Hyper-parameter tunning

In [31]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Define the objective function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 18),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.05, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.2, 1.0),
    }
    
    model_XGB = xgb.XGBClassifier(**params, n_jobs = -1)
    
    cv = StratifiedKFold(n_splits=5)
    y_pred_proba = cross_val_predict(model_XGB, X_train, y_train, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train, y_pred_proba)
    
    return auc

# Create a study object
study = optuna.create_study(direction='maximize')

# Start optimization
study.optimize(objective, n_trials=50)

# Get the best parameters
best_XGB = study.best_params
print("Best: ", best_XGB)


[32m[I 2023-06-17 05:21:02,743][0m A new study created in memory with name: no-name-7add8121-9694-4e28-8a09-f07f15c515d8[0m
[32m[I 2023-06-17 05:21:09,176][0m Trial 0 finished with value: 0.9616607567456765 and parameters: {'max_depth': 4, 'learning_rate': 0.116001680777605, 'colsample_bytree': 0.14692953139998594, 'subsample': 0.8030857175987267, 'n_estimators': 261, 'min_child_weight': 10, 'reg_alpha': 0.16187385783406927, 'reg_lambda': 0.260216248238337}. Best is trial 0 with value: 0.9616607567456765.[0m
[32m[I 2023-06-17 05:21:33,296][0m Trial 1 finished with value: 0.9682791249913572 and parameters: {'max_depth': 16, 'learning_rate': 0.026303751565074036, 'colsample_bytree': 0.275439983412078, 'subsample': 0.8493748844170721, 'n_estimators': 519, 'min_child_weight': 5, 'reg_alpha': 0.3696863546810296, 'reg_lambda': 0.4048009875593822}. Best is trial 1 with value: 0.9682791249913572.[0m
[32m[I 2023-06-17 05:21:43,373][0m Trial 2 finished with value: 0.9598617173352173 a

Best:  {'max_depth': 13, 'learning_rate': 0.012308520402322306, 'colsample_bytree': 0.15564433647290904, 'subsample': 0.9392376085401448, 'n_estimators': 494, 'min_child_weight': 1, 'reg_alpha': 0.26760253520809857, 'reg_lambda': 0.24616802866656362}


## Fitting

In [7]:
Best_XGB =  {'max_depth': 13, 
         'learning_rate': 0.012308520402322306, 
         'colsample_bytree': 0.15564433647290904, 
         'subsample': 0.9392376085401448, 
         'n_estimators': 494, 
         'min_child_weight': 1, 
         'reg_alpha': 0.26760253520809857, 
         'reg_lambda': 0.24616802866656362} # best value: 0.9743947392021206.

In [15]:
import xgboost as xgb
model_XGB = xgb.XGBClassifier(**Best_XGB)
model_XGB.fit(X_train, y_train)

In [17]:
AUC_XGB = ROC_AUC(model_XGB, X_train, y_train)
print("AUC XGB: ", AUC_XGB)

AUC XGB:  0.9784126460666145


# CatBoost model training

In [52]:
X_train_CAT, X_test_CAT = dataset(0,0,0,1)

## Hyper-parameter tunning

In [53]:
y_train_CAT = X_train_CAT['Machine_failure']
X_train_CAT = X_train_CAT.drop('Machine_failure', axis=1)

In [48]:
# Total unique values in each column of X_train
X_train_CAT.nunique()

Product_ID               10000
Type                         3
Air_temperature_K           96
Process_temperature_K       82
Rotational_speed_rpm       957
Torque_Nm                  615
Tool_wear_min              246
TWF                          2
HDF                          2
PWF                          2
OSF                          2
RNF                          2
dtype: int64

In [54]:
cat = ['Product_ID', 'Type', 'Air_temperature_K', 'Process_temperature_K','Rotational_speed_rpm', 'Torque_Nm', 'Tool_wear_min', 'TWF', 'HDF','PWF', 'OSF', 'RNF']

# Convert cat columns of X_train to str
for col in cat:
    X_train_CAT[col] = X_train_CAT[col].astype('str')

In [43]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold

# Define the objective function
def objective(trial):
    # Specify a search space using trial object
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True), 
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'loss_function': 'Logloss',  # Binary classification
        'eval_metric': 'AUC',  # AUC as the performance metric
    }

    model_CAT = CatBoostClassifier(**params, verbose=False)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    scores = []
    for train_index, valid_index in cv.split(X_train_CAT, y_train_CAT):
        train_pool = Pool(data=X_train_CAT.iloc[train_index], label=y_train_CAT.iloc[train_index], cat_features = cat)
        valid_pool = Pool(data=X_train_CAT.iloc[valid_index], label=y_train_CAT.iloc[valid_index], cat_features = cat)

        model_CAT.fit(train_pool, eval_set=valid_pool)
        scores.append(model_CAT.best_score_['validation']['AUC'])

    return np.mean(scores)

# Create a study object
study = optuna.create_study(direction='maximize')

# Start optimization
study.optimize(objective, n_trials=50)


[32m[I 2023-06-17 11:41:53,370][0m A new study created in memory with name: no-name-449c8501-3d31-496b-8116-78031ca34173[0m
[32m[I 2023-06-17 11:42:05,471][0m Trial 0 finished with value: 0.9533153317183979 and parameters: {'iterations': 126, 'depth': 4, 'learning_rate': 0.013998335101610257, 'random_strength': 52, 'bagging_temperature': 7.60416911051556, 'od_type': 'IncToDec', 'od_wait': 44}. Best is trial 0 with value: 0.9533153317183979.[0m
[32m[I 2023-06-17 11:43:20,402][0m Trial 1 finished with value: 0.9746634364468246 and parameters: {'iterations': 233, 'depth': 7, 'learning_rate': 0.011270440591083359, 'random_strength': 31, 'bagging_temperature': 18.53878123731863, 'od_type': 'IncToDec', 'od_wait': 50}. Best is trial 1 with value: 0.9746634364468246.[0m
[32m[I 2023-06-17 11:44:22,709][0m Trial 2 finished with value: 0.9782274073884368 and parameters: {'iterations': 216, 'depth': 5, 'learning_rate': 0.030656105623185603, 'random_strength': 4, 'bagging_temperature': 0

KeyboardInterrupt: 

[I 2023-06-17 11:51:23,335] Trial 11 finished with value: 0.9783593516221193 and parameters: {'iterations': 248, 'depth': 8, 'learning_rate': 0.07556114402524425, 'random_strength': 2, 'bagging_temperature': 1.6353270643758222, 'od_type': 'Iter', 'od_wait': 22}. Best is trial 11 with value: 0.9783593516221193.


## Fitting

In [44]:
Best_Cat_Opt_AllCat = {'iterations': 249, 
                       'depth': 6, 
                       'learning_rate': 0.1555748471212781, 
                       'random_strength': 58, 
                       'bagging_temperature': 87.47376677399185, 
                       'od_type': 'IncToDec', 
                       'od_wait': 27} # 0.9791965650818231

In [12]:
Best_CAT = {'iterations': 290, 
            'depth': 12, 
            'learning_rate': 0.016041043108679078, 
            'random_strength': 12, 
            'bagging_temperature': 0.014032128937182435, 
            'od_type': 'IncToDec', 
            'od_wait': 47}

In [57]:
model_CAT = CatBoostClassifier(**Best_Cat_Opt_AllCat, verbose=False)
model_CAT.fit(X_train_CAT, y_train_CAT, cat_features = cat)

<catboost.core.CatBoostClassifier at 0x1e016936610>

# Stacking

In [59]:
for name, model in Train_sets.items():

    X_train, X_test  = dataset(model[0], model[1], model[2], model[3])
    y_train = X_train['Machine_failure']
    X_train = X_train.drop(['Machine_failure'], axis=1)

    # Use the function
    AUC = ROC_AUC(model_XGB, X_train, y_train)
    print("Dataset: ", name, "AUC LGBM: ", AUC)


Dataset:  Normal dataset AUC LGBM:  0.9724129378237473
Dataset:  Dataset With FE AUC LGBM:  0.9702258169335194
Dataset:  Dataset With FE and FR AUC LGBM:  0.9674699971488955
Dataset:  Dataset Ori + train no FE AUC LGBM:  0.9784126460666145
Dataset:  Dataset Ori + train with FE AUC LGBM:  0.9755411959281259


In [10]:
# Use the function
AUC_LGBM = ROC_AUC(model_LGBM, X_train, y_train)
print("AUC LGBM: ", AUC_LGBM)

AUC_XGB = ROC_AUC(model_XGB, X_train, y_train)
print("AUC XGB: ", AUC_XGB)

AUC LGBM:  0.9751743189440938
AUC XGB:  0.9784126460666145


In [37]:
# Initialize the Repeated Stratified K Fold
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=42)

# Create an empty list to store the Out-of-Fold (OOF) predictions
oof_preds = np.zeros(X_train_CAT.shape[0])


# Loop through each split
for train_index, valid_index in rskf.split(X_train_CAT.astype(str), y_train):

    # Split the data
    X_train_fold, X_valid_fold = X_train_CAT.iloc[train_index], X_train_CAT.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Fit the model
    model_CAT.fit(X_train_fold.astype(str), y_train_fold, cat_features = cat)

    # Make predictions and add them to the OOF predictions
    oof_preds[valid_index] = model_CAT.predict_proba(X_valid_fold.astype(str))[:,1]

# Calculate the overall AUC
auc = roc_auc_score(y_train, oof_preds)


print("AUC CAT: ", auc)

AUC CAT:  0.977913617834177


AUC:  0.9659444919120036
AUC:  0.9702258169335194
AUC:  0.9605964382592067

In [40]:
# Obtener las predicciones
pred_lgbm = model_LGBM.predict_proba(X_train)[:,1]
pred_xgb = model_XGB.predict_proba(X_train)[:,1]
pred_cat = model_CAT.predict_proba(X_train_CAT.astype(str))[:,1]

X_train_meta = pd.DataFrame({'lgbm': pred_lgbm, 'xgb': pred_xgb, 'cat': pred_cat, 'target': y_train})

pred_lgbm_test = model_LGBM.predict_proba(X_test)[:,1]
pred_xgb_test = model_XGB.predict_proba(X_test)[:,1]
pred_cat_test = model_CAT.predict_proba(X_test_CAT.astype(str))[:,1]

X_test_meta = pd.DataFrame({'lgbm': pred_lgbm_test, 'xgb': pred_xgb_test, 'cat': pred_cat_test})

## Hyper-parameter tunning for meta model

In [None]:
y_train_m = X_train_meta['target']
X_train_m = X_train_meta.drop(['target'], axis=1)

In [53]:
# Libraries
import lightgbm as lgb
# hp 
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

space ={
    'max_depth': hp.quniform('max_depth', 6, 18, 1),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    'n_estimators': hp.quniform('n_estimators', 100, 400, 10),
    'min_child_samples': hp.quniform('min_child_samples', 20, 100, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.15, 0.5),
    'reg_lambda': hp.uniform('reg_lambda', 0.3, 1.0)
}



def objective(space):
    model_LGBM_meta = lgb.LGBMClassifier(max_depth = int(space['max_depth']),
                             learning_rate = space['learning_rate'],
                             num_leaves = int(space['num_leaves']),
                             n_estimators = int(space['n_estimators']),
                             colsample_bytree = space['colsample_bytree'],
                             subsample = space['subsample'],
                             is_unbalance = True,
                             min_child_samples = int(space['min_child_samples']),
                             reg_alpha = space['reg_alpha'],
                             reg_lambda = space['reg_lambda'])
    
    cv = StratifiedKFold(n_splits=10)
    y_pred_proba = cross_val_predict(model_LGBM_meta, X_train_m, y_train_m, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train_m, y_pred_proba)
    return {'loss': -auc, 'status': STATUS_OK}

# Run the algorithm
trials = Trials()
best_LGBM_meta = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best_LGBM_meta)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [06:46<00:00,  8.14s/trial, best loss: -0.9923355097281873]
Best:  {'colsample_bytree': 0.026181323096692063, 'learning_rate': 0.04207644660879477, 'max_depth': 6.0, 'min_child_samples': 26.0, 'n_estimators': 190.0, 'num_leaves': 21.0, 'reg_alpha': 0.4910393820375527, 'reg_lambda': 0.6246168616161446, 'subsample': 0.41742010255980644}


## Fitting

In [58]:
Best_meta_LGBM =  {'colsample_bytree': 0.22071462239650944, 
                'learning_rate': 0.0071184882183990906, 
                'max_depth': 12, 
                'min_child_samples': 91, 
                'n_estimators': 180, 
                'num_leaves': 20, 
                'reg_alpha': 0.34357926147986606, 
                'reg_lambda': 0.9332514748078644, 
                'subsample': 0.9562001151634876} # 0.988

best_LGBM_meta = {'colsample_bytree': 0.026181323096692063, 
                  'learning_rate': 0.04207644660879477, 
                  'max_depth': 6, 
                  'min_child_samples': 26, 
                  'n_estimators': 190, 
                  'num_leaves': 21, 
                  'reg_alpha': 0.4910393820375527, 
                  'reg_lambda': 0.6246168616161446, 
                  'subsample': 0.41742010255980644} # 0.9923355097281873

In [59]:
model_meta_LGBM = lgb.LGBMClassifier(**best_LGBM_meta)

AUC_META_LGBM_tunned = ROC_AUC(model_meta_LGBM, X_train_m, y_train_m)
print("AUC META LGBM tunned: ", AUC_META_LGBM_tunned)

AUC META LGBM tunned:  0.9893425917870007


In [None]:
model_meta_LGBM.fit(X_train_m, y_train_m)

In [61]:
y_pred = model_meta_LGBM.predict_proba(X_test_meta)[:,1]

# Results analysis

In [None]:
model = model_meta_LGBM

In [37]:
y_pred_analysis = model.predict_proba(X_train)[:,1]

In [38]:
X_train_analysis = X_train.copy()
X_train_analysis['Machine_failure'] = y_train
X_train_analysis['Machine_failure_pred'] = y_pred_analysis
X_train_analysis['Error'] = X_train_analysis['Machine_failure'] - X_train_analysis['Machine_failure_pred']

In [41]:
X_train_analysis.to_csv('data/X_train_analysis.csv', index=False)

# Submission

In [62]:
X_test_submission = pd.read_csv('input/test.csv') # Test data

In [63]:
# Submission to a file
# Rounded pred increases the score
submission = pd.DataFrame({'id': X_test_submission['id'], 'Machine failure': y_pred})
submission.to_csv('submission.csv', index=False)