In [1]:
import pandas as pd

# Data import

FE = 0 # Feature Engineering (0 = No, 1 = Yes)

FR = 0 # Feature Reduction (0 = No, 1 = Yes)

OR = 0  # Original not included

CAT = 0 # Categorical data (0 = No, 1 = Yes)

def dataset(FE, FR, OR, CAT):
    if CAT == 0:
        if OR == 0:
            if FR == 0:
                if FE == 0:
                    X_train = pd.read_csv('data/X_train.csv')
                    X_test = pd.read_csv('data/X_test.csv') # Test data
                else:
                    X_train = pd.read_csv('data/X_train_FE.csv')
                    X_test = pd.read_csv('data/X_test_FE.csv') # Test data
            else:
                X_train = pd.read_csv('data/X_train_FR.csv')
                X_test = pd.read_csv('data/X_test_FR.csv') # Test data
        else:
            if FE == 0:
                X_train = pd.read_csv('data/X_total.csv')
                X_test = pd.read_csv('data/X_test.csv') # Test data
            else:
                X_train = pd.read_csv('data/X_total_FE.csv')
                X_test = pd.read_csv('data/X_test_FE.csv')
    else:
        X_train = pd.read_csv('data/X_train_CAT.csv')
        X_test = pd.read_csv('data/X_test_CAT.csv') # Test data
    
    return X_train, X_test


In [2]:
import lightgbm as lgb
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score

def ROC_AUC(model, X_train, y_train, n_splits=5, n_repeats=3):
    
    # Initialize the Repeated Stratified K Fold
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Create an empty list to store the Out-of-Fold (OOF) predictions
    oof_preds = np.zeros(X_train.shape[0])

    # Loop through each split
    for train_index, valid_index in rskf.split(X_train, y_train):
    
        # Split the data
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    
        # Fit the model
        model.fit(X_train_fold, y_train_fold)
    
        # Make predictions and add them to the OOF predictions
        oof_preds[valid_index] = model.predict_proba(X_valid_fold)[:,1]

    # Calculate the overall AUC
    auc = roc_auc_score(y_train, oof_preds)

    return auc

# Training datasets performance

In [32]:
Best_LGBM = {'colsample_bytree': 0.1923023355270077,
         'learning_rate': 0.03574583615481279,
         'max_depth': 16, 
         'min_child_samples': 89, 
         'n_estimators': 260, 
         'num_leaves': 68, 
         'reg_alpha': 0.30028296727692755, 
         'reg_lambda': 0.6125642241926401, 
         'subsample': 0.7293703101825368
         }

In [33]:
Best_XGB =  {'max_depth': 13, 
         'learning_rate': 0.012308520402322306, 
         'colsample_bytree': 0.15564433647290904, 
         'subsample': 0.9392376085401448, 
         'n_estimators': 494, 
         'min_child_weight': 1, 
         'reg_alpha': 0.26760253520809857, 
         'reg_lambda': 0.24616802866656362} # best value: 0.9743947392021206.

In [34]:
Best_Cat_Opt_AllCat = {'iterations': 249, 
                       'depth': 6, 
                       'learning_rate': 0.1555748471212781, 
                       'random_strength': 58, 
                       'bagging_temperature': 87.47376677399185, 
                       'od_type': 'IncToDec', 
                       'od_wait': 27}

In [35]:
Best =  {'colsample_bytree': 0.19297355677628159, 
          'learning_rate': 0.020755882048032786, 
          'max_depth': 9, 
          'min_child_samples': 41, 
          'n_estimators': 320, 
          'num_leaves': 100, 
          'reg_alpha': 0.39149507035237485, 
          'reg_lambda': 0.43245778149146746, 
          'subsample': 0.5188437264047947} # best loss: -0.973283159101125

In [37]:
Train_sets = {'Normal dataset': [0, 0, 0, 0], 'Dataset With FE': [1, 0, 0, 0], 'Dataset With FE and FR': [1, 1, 0, 0], 'Dataset Ori + train no FE': [0, 0, 1, 0], 'Dataset Ori + train with FE': [1, 0, 1, 0]}

In [38]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import lightgbm as lgb

# Define the model with the best parameters
model_LGBM = lgb.LGBMClassifier(**Best)

Scores = {}

for name, model in Train_sets.items():

    X_train, X_test  = dataset(model[0], model[1], model[2], model[3])
    y_train = X_train['Machine_failure']
    X_train = X_train.drop(['Machine_failure'], axis=1)
    print(X_train.shape)

    # Initialize the Repeated Stratified K Fold
    rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=42)

    # Create an empty list to store the Out-of-Fold (OOF) predictions
    oof_preds = np.zeros(X_train.shape[0])

    # Loop through each split
    for train_index, valid_index in rskf.split(X_train, y_train):
        
        # Split the data
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
        
        # Fit the model
        model_LGBM.fit(X_train_fold, y_train_fold)
        
        # Make predictions and add them to the OOF predictions
        oof_preds[valid_index] = model_LGBM.predict_proba(X_valid_fold)[:,1]

    # Calculate the overall AUC
    auc = roc_auc_score(y_train, oof_preds)

    print("Model: ", name, "   AUC: ", auc)


(136429, 14)
Model:  Normal dataset    AUC:  0.9706861467455258
(136429, 24)
Model:  Dataset With FE    AUC:  0.9663308554698875
(136429, 21)
Model:  Dataset With FE and FR    AUC:  0.966638698897308
(146429, 14)
Model:  Dataset Ori + train no FE    AUC:  0.9752747857937709
(146429, 24)
Model:  Dataset Ori + train with FE    AUC:  0.9718414288710988


In [None]:
(136429, 14)
Model:  Normal dataset    AUC:  0.9706861467455258
(136429, 22)
Model:  Dataset With FE    AUC:  0.9672851967906262
(136429, 19)
Model:  Dataset With FE and FR    AUC:  0.9653824357485317
(146429, 14)
Model:  Dataset Ori + train no FE    AUC:  0.9752747857937709
(146429, 22)
Model:  Dataset Ori + train with FE    AUC:  0.9726090433142951

# Models Trainning

In [3]:
X_train, X_test  = dataset(0, 0, 1, 0)
y_train = X_train['Machine_failure']
X_train = X_train.drop(['Machine_failure'], axis=1)

In [4]:
X_train_CAT, X_test_CAT = dataset(0,0,0,1)

In [5]:
cat_columns = ['Product_ID', 'Type']

In [None]:
X_train_CAT[cat_columns] = X_train_CAT[cat_columns].astype('category')
X_test_CAT[cat_columns] = X_test_CAT[cat_columns].astype('category')

In [37]:
X_train.head()

Unnamed: 0,Product_ID,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,50096,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,1,0
1,20343,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,1
2,49454,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,1,0
3,53355,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,1,0
4,24050,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,1


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146429 entries, 0 to 146428
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   Product_ID             146429 non-null  category
 1   Type                   146429 non-null  category
 2   Air_temperature_K      146429 non-null  float64 
 3   Process_temperature_K  146429 non-null  float64 
 4   Rotational_speed_rpm   146429 non-null  int64   
 5   Torque_Nm              146429 non-null  float64 
 6   Tool_wear_min          146429 non-null  int64   
 7   TWF                    146429 non-null  int64   
 8   HDF                    146429 non-null  int64   
 9   PWF                    146429 non-null  int64   
 10  OSF                    146429 non-null  int64   
 11  RNF                    146429 non-null  int64   
dtypes: category(2), float64(3), int64(7)
memory usage: 11.9 MB


# LightGBM model training

### LightGBM hyper-parameter tunning

In [21]:
# Libraries
import lightgbm as lgb
# hp 
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

space ={
    'max_depth': hp.quniform('max_depth', 6, 18, 1),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    'n_estimators': hp.quniform('n_estimators', 100, 400, 10),
    'min_child_samples': hp.quniform('min_child_samples', 20, 100, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.15, 0.5),
    'reg_lambda': hp.uniform('reg_lambda', 0.3, 1.0)
}

def objective(space):
    model_LGBM = lgb.LGBMClassifier(max_depth = int(space['max_depth']),
                             learning_rate = space['learning_rate'],
                             num_leaves = int(space['num_leaves']),
                             n_estimators = int(space['n_estimators']),
                             colsample_bytree = space['colsample_bytree'],
                             subsample = space['subsample'],
                             is_unbalance = True,
                             min_child_samples = int(space['min_child_samples']),
                             reg_alpha = space['reg_alpha'],
                             reg_lambda = space['reg_lambda'])
    
    cv = StratifiedKFold(n_splits=10)
    y_pred_proba = cross_val_predict(model_LGBM, X_train, y_train, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train, y_pred_proba)
    return {'loss': -auc, 'status': STATUS_OK}

# Run the algorithm
trials = Trials()
best_LGBM = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best_LGBM)



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [13:11<00:00, 15.83s/trial, best loss: -0.973283159101125] 
Best:  {'colsample_bytree': 0.19297355677628159, 'learning_rate': 0.020755882048032786, 'max_depth': 9.0, 'min_child_samples': 41.0, 'n_estimators': 320.0, 'num_leaves': 100.0, 'reg_alpha': 0.39149507035237485, 'reg_lambda': 0.43245778149146746, 'subsample': 0.5188437264047947}


## Fitting

In [38]:
Best_LGBM =  {'colsample_bytree': 0.19297355677628159, 
          'learning_rate': 0.020755882048032786, 
          'max_depth': 9, 
          'min_child_samples': 41, 
          'n_estimators': 320, 
          'num_leaves': 100, 
          'reg_alpha': 0.39149507035237485, 
          'reg_lambda': 0.43245778149146746, 
          'subsample': 0.5188437264047947} # best loss: -0.973283159101125

In [39]:
model_LGBM = lgb.LGBMClassifier(**Best_LGBM)

In [63]:
AUC_LGBM = ROC_AUC(model_LGBM, X_train, y_train)
print("AUC LGBM: ", AUC_LGBM)

AUC LGBM:  0.9762130741832492


In [40]:
model_LGBM.fit(X_train, y_train)

# XGBoost model training

## Hyper-parameter tunning

In [110]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Define the objective function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 18),
        'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.15, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 1.0),
        'gamma': trial.suggest_float('gamma', 0.00001, 0.1),
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'tree_method': 'gpu_hist',
    }

    
    model_XGB = xgb.XGBClassifier(**params, n_jobs = -1)
    
    cv = StratifiedKFold(n_splits=5)
    y_pred_proba = cross_val_predict(model_XGB, X_train, y_train, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train, y_pred_proba)
    
    return auc

# Create a study object
study = optuna.create_study(direction='maximize')

# Start optimization
study.optimize(objective, n_trials=50)

# Get the best parameters
best_XGB = study.best_params
print("Best: ", best_XGB)


[32m[I 2023-06-20 18:43:02,580][0m A new study created in memory with name: no-name-5718bf0b-b7b3-43c1-91d9-2445417e6fd4[0m


[32m[I 2023-06-20 18:43:15,158][0m Trial 0 finished with value: 0.9713728349806623 and parameters: {'max_depth': 9, 'learning_rate': 0.029179412645366845, 'colsample_bytree': 0.14179228442089745, 'subsample': 0.8725851015059102, 'n_estimators': 544, 'min_child_weight': 8, 'reg_alpha': 0.35704646398741113, 'reg_lambda': 0.4697658245581723, 'gamma': 0.05976736552903123, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.9713728349806623.[0m
[32m[I 2023-06-20 18:43:27,776][0m Trial 1 finished with value: 0.9715950378016317 and parameters: {'max_depth': 16, 'learning_rate': 0.1260985472345255, 'colsample_bytree': 0.10081084207759922, 'subsample': 0.8056571289897766, 'n_estimators': 381, 'min_child_weight': 9, 'reg_alpha': 0.0023665560833429937, 'reg_lambda': 0.3483418922540138, 'gamma': 0.08531269992877895, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.9715950378016317.[0m
[32m[I 2023-06-20 18:43:34,140][0m Trial 2 finished with value: 0.9634858401982118 and pa

Best:  {'max_depth': 13, 'learning_rate': 0.016474009154216287, 'colsample_bytree': 0.27492493169509574, 'subsample': 0.9177187561606172, 'n_estimators': 530, 'min_child_weight': 6, 'reg_alpha': 0.06728015878725187, 'reg_lambda': 0.3850044956052784, 'gamma': 0.03479209631794973, 'grow_policy': 'depthwise'}


## Fitting

In [41]:
Best_xgb2 = {'max_depth': 13, 
            'learning_rate': 0.016474009154216287, 
            'colsample_bytree': 0.27492493169509574, 
            'subsample': 0.9177187561606172, 
            'n_estimators': 530, 
            'min_child_weight': 6, 
            'reg_alpha': 0.06728015878725187, 
            'reg_lambda': 0.3850044956052784, 
            'gamma': 0.03479209631794973, 
            'grow_policy': 'depthwise',
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'tree_method': 'gpu_hist',} #Best 0.9750384510465802


In [42]:
Best_XGB =  {'max_depth': 13, 
         'learning_rate': 0.012308520402322306, 
         'colsample_bytree': 0.15564433647290904, 
         'subsample': 0.9392376085401448, 
         'n_estimators': 494, 
         'min_child_weight': 1, 
         'reg_alpha': 0.26760253520809857, 
         'reg_lambda': 0.24616802866656362} # best value: 0.9743947392021206.

In [43]:
import xgboost as xgb
model_XGB = xgb.XGBClassifier(**Best_XGB)
model_XGB.fit(X_train, y_train)

In [60]:
AUC_XGB = ROC_AUC(model_XGB, X_train, y_train)
print("AUC XGB: ", AUC_XGB)

AUC XGB:  0.9794722807448966


In [94]:
y_pred_train = model_XGB.predict_proba(X_train)[:,1]

In [None]:
AUC XGB:  0.9794722807448966

In [None]:
AUC XGB:  0.9754890580872562

# CatBoost model training

## Hyper-parameter tunning

In [12]:
y_train_CAT = y_train
X_train_CAT = X_train

In [13]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold

# Define the objective function
def objective(trial):
    # Specify a search space using trial object
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True), 
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'loss_function': 'Logloss',  # Binary classification
        'eval_metric': 'AUC',  # AUC as the performance metric
    }

    model_CAT = CatBoostClassifier(**params, verbose=False)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    scores = []
    for train_index, valid_index in cv.split(X_train_CAT, y_train_CAT):
        train_pool = Pool(data=X_train_CAT.iloc[train_index], label=y_train_CAT.iloc[train_index], cat_features = cat_columns)
        valid_pool = Pool(data=X_train_CAT.iloc[valid_index], label=y_train_CAT.iloc[valid_index], cat_features = cat_columns)

        model_CAT.fit(train_pool, eval_set=valid_pool)
        scores.append(model_CAT.best_score_['validation']['AUC'])

    return np.mean(scores)

# Create a study object
study = optuna.create_study(direction='maximize')

# Start optimization
study.optimize(objective, n_trials=50)


  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-06-21 06:15:44,113][0m A new study created in memory with name: no-name-7a378e77-1fa3-4d47-b4f4-9b6c9a5df6e8[0m
[32m[I 2023-06-21 06:15:55,160][0m Trial 0 finished with value: 0.9734898592275569 and parameters: {'iterations': 118, 'depth': 8, 'learning_rate': 0.03337627092412684, 'random_strength': 38, 'bagging_temperature': 0.6630721421099399, 'od_type': 'IncToDec', 'od_wait': 41}. Best is trial 0 with value: 0.9734898592275569.[0m
[32m[I 2023-06-21 06:16:01,367][0m Trial 1 finished with value: 0.958876001875122 and parameters: {'iterations': 95, 'depth': 4, 'learning_rate': 0.018201277955057153, 'random_strength': 67, 'bagging_temperature': 1.6558340311284774, 'od_type': 'Iter', 'od_wait': 44}. Best is trial 0 with value: 0.9734898592275569.[0m
[32m[I 2023-06-21 06:16:45,692][0m Trial 2 finished with value: 0.9772358139676354 and parameters: {'iterations': 257, 'depth': 5, 'learning_rate': 0.016013653180670353, 

KeyboardInterrupt: 

[I 2023-06-17 11:51:23,335] Trial 11 finished with value: 0.9783593516221193 and parameters: {'iterations': 248, 'depth': 8, 'learning_rate': 0.07556114402524425, 'random_strength': 2, 'bagging_temperature': 1.6353270643758222, 'od_type': 'Iter', 'od_wait': 22}. Best is trial 11 with value: 0.9783593516221193.


## Fitting

In [14]:
Best_Cat_Opt_AllCat = {'iterations': 192, 
                        'depth': 6,
                        'learning_rate': 0.13298695008637992, 
                        'random_strength': 96, 
                        'bagging_temperature': 23.38968224422692, 
                        'od_type': 'Iter', 
                        'od_wait': 36
                        } #Best is trial 6 with value: 0.9802838561139681.


In [13]:
Best_CAT = {'iterations': 290, 
            'depth': 12, 
            'learning_rate': 0.016041043108679078, 
            'random_strength': 12, 
            'bagging_temperature': 0.014032128937182435, 
            'od_type': 'IncToDec', 
            'od_wait': 47}

In [98]:
X_train_CAT.drop(columns = ['Machine_failure'], inplace = True)

In [100]:
from catboost import CatBoostClassifier


model_CAT = CatBoostClassifier(**Best_Cat_Opt_AllCat, cat_features = cat_columns, verbose=False)
model_CAT.fit(X_train_CAT, y_train_CAT)

<catboost.core.CatBoostClassifier at 0x277c225a790>

In [85]:
X_test_CAT[cat_columns] = X_test_CAT[cat_columns].astype('category')
X_train_CAT[cat_columns] = X_train_CAT[cat_columns].astype('category')

In [101]:
from sklearn.model_selection import cross_val_score

cv2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

score = cross_val_score(model_CAT, X_train_CAT, y_train_CAT, cv=cv2, n_jobs=-1, verbose=1, scoring='roc_auc').mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.6s finished


In [102]:
print("AUC CAT: ", score)

AUC CAT:  0.9799481201707483


In [27]:
y_pred = model_CAT.predict_proba(X_test)[:,1]

# Gradient Boosting

## Hyper-parameter tunning

In [14]:
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_score


def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 16)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)

    strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    model = GradientBoostingClassifier(n_estimators=n_estimators, 
                                       max_depth=max_depth, 
                                       min_samples_split=min_samples_split, 
                                       min_samples_leaf=min_samples_leaf,
                                       learning_rate=learning_rate,
                                       random_state=42)
    
    score = cross_val_score(model, X_train, y_train, cv=strat_kfold, scoring="roc_auc")
    roc_auc = score.mean()
    return roc_auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Muestra los mejores parámetros
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[32m[I 2023-06-19 06:37:32,282][0m A new study created in memory with name: no-name-8b291dda-fda5-4161-b858-708e5ff96aee[0m
[33m[W 2023-06-19 06:46:12,572][0m Trial 0 failed with parameters: {'n_estimators': 128, 'max_depth': 13, 'min_samples_split': 10, 'min_samples_leaf': 3, 'learning_rate': 0.013447672137663734} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\Jose\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Jose\AppData\Local\Temp\ipykernel_8156\4039872000.py", line 23, in objective
    score = cross_val_score(model, X_train, y_train, cv=strat_kfold, scoring="roc_auc")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jose\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.

KeyboardInterrupt: 

## Fitting

# HistGradientBoosting

## Hyper-parameter tunning

In [17]:
import optuna
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

def objective(trial):
    params = {
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
    }

    model = HistGradientBoostingClassifier(**params)

    strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=strat_kfold, scoring='roc_auc')

    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Luego de que se complete la optimización, puedes obtener los mejores parámetros así:
best_params = study.best_params


[32m[I 2023-06-19 06:50:46,192][0m A new study created in memory with name: no-name-127648be-2b2d-418c-947e-988ab33583ed[0m
[32m[I 2023-06-19 06:51:08,768][0m Trial 0 finished with value: 0.9695277072482027 and parameters: {'max_iter': 753, 'max_depth': 16, 'min_samples_leaf': 1, 'l2_regularization': 0.5932723023081404, 'learning_rate': 0.03824436300967314}. Best is trial 0 with value: 0.9695277072482027.[0m
[32m[I 2023-06-19 06:51:23,807][0m Trial 1 finished with value: 0.9697791003028786 and parameters: {'max_iter': 956, 'max_depth': 44, 'min_samples_leaf': 5, 'l2_regularization': 0.6347878379750351, 'learning_rate': 0.07254069710263998}. Best is trial 1 with value: 0.9697791003028786.[0m
[32m[I 2023-06-19 06:51:40,914][0m Trial 2 finished with value: 0.9691692746192763 and parameters: {'max_iter': 625, 'max_depth': 13, 'min_samples_leaf': 20, 'l2_regularization': 0.013610801663519467, 'learning_rate': 0.04962425909010801}. Best is trial 1 with value: 0.9697791003028786.[

## Fitting

In [76]:
from sklearn.ensemble import HistGradientBoostingClassifier

best_Hist = {'max_iter': 465, 
             'max_depth': 23, 
             'min_samples_leaf': 19, 
             'l2_regularization': 0.6937970904634575, 
             'learning_rate': 0.05507960310469436} #0.9708572700192338.

model_Hist = HistGradientBoostingClassifier(**best_Hist)

In [77]:
model_Hist.fit(X_train, y_train)

In [21]:
AUC = ROC_AUC(model_Hist, X_train, y_train)
print("AUC LGBM: ", AUC)

AUC LGBM:  0.9697346782390577


# Adaboost

## Hyper-parameter tunning

In [83]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Aplicar la estandarización con StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Aplicar la normalización con MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train_scaled)
X_test_scaled = min_max_scaler.transform(X_test_scaled)


In [81]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [26]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier

def objective(trial):
    # Definir los parámetros a optimizar con Optuna
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])
    }

    # Crear el modelo de AdaBoost con los parámetros sugeridos por Optuna
    model = AdaBoostClassifier(**params)

    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=strat_kfold, scoring='roc_auc')

    return scores.mean()

# Crear un estudio Optuna y optimizar los parámetros
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Imprimir los mejores parámetros y el valor de la métrica ROC AUC
print('Mejor ROC AUC:', study.best_value)
print('Mejores parámetros:', study.best_params)


[32m[I 2023-06-19 19:50:29,021][0m A new study created in memory with name: no-name-881835c8-348c-4abf-85db-92c322f91542[0m
[32m[I 2023-06-19 19:50:57,930][0m Trial 0 finished with value: 0.9003605207136085 and parameters: {'n_estimators': 62, 'learning_rate': 0.012167325242628762, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.9003605207136085.[0m
[32m[I 2023-06-19 19:54:18,426][0m Trial 1 finished with value: 0.9622434903937718 and parameters: {'n_estimators': 470, 'learning_rate': 0.01685188753976667, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.9622434903937718.[0m
[32m[I 2023-06-19 19:55:48,512][0m Trial 2 finished with value: 0.9541504214067545 and parameters: {'n_estimators': 185, 'learning_rate': 0.012967299448727172, 'algorithm': 'SAMME.R'}. Best is trial 1 with value: 0.9622434903937718.[0m
[32m[I 2023-06-19 19:56:56,125][0m Trial 3 finished with value: 0.8960126651265835 and parameters: {'n_estimators': 137, 'learning_rate': 0.0051574940171673

Mejor ROC AUC: 0.9647429289493697
Mejores parámetros: {'n_estimators': 340, 'learning_rate': 0.09801544008842904, 'algorithm': 'SAMME.R'}


## Fitting

In [78]:
best_ada = {'n_estimators': 340, 
            'learning_rate': 0.09801544008842904, 
            'algorithm': 'SAMME.R'}

In [82]:
model_Ada = AdaBoostClassifier(**best_ada)
model_Ada.fit(X_train_scaled, y_train)

In [None]:
AUC = ROC_AUC(model_Ada, X_train_scaled, y_train)
print("AUC ADA: ", AUC)

# Stacking

In [108]:
X_train.head()

Unnamed: 0,Product_ID,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,50096,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,1,0
1,20343,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,1
2,49454,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,1,0
3,53355,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,1,0
4,24050,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,1


In [107]:
X_test.head()

Unnamed: 0,Product_ID,Air_temperature_K,Process_temperature_K,Rotational_speed_rpm,Torque_Nm,Tool_wear_min,TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,50896,302.3,311.5,1499,38.0,60,0,0,0,0,0,0,1,0
1,53866,301.7,311.0,1713,28.8,17,0,0,0,0,0,0,1,0
2,50498,301.3,310.4,1525,37.7,96,0,0,0,0,0,0,1,0
3,21232,300.1,309.6,1479,47.6,5,0,0,0,0,0,0,0,1
4,19751,303.4,312.3,1515,41.3,114,0,0,0,0,0,0,0,1


In [84]:
# Obtener las predicciones
pred_lgbm = model_LGBM.predict_proba(X_train)[:,1]
pred_xgb = model_XGB.predict_proba(X_train)[:,1]
pred_cat = model_CAT.predict_proba(X_train_CAT.astype(str))[:,1]
pred_hist = model_Hist.predict_proba(X_train)[:,1]
pred_Ada = model_Ada.predict_proba(X_train_scaled)[:,1]

X_train_meta_FE = pd.DataFrame({'lgbm': pred_lgbm, 'xgb': pred_xgb, 'cat': pred_cat, 'hist': pred_hist, 'ada': pred_Ada, 'target': y_train})

pred_lgbm_test = model_LGBM.predict_proba(X_test)[:,1]
pred_xgb_test = model_XGB.predict_proba(X_test)[:,1]
pred_cat_test = model_CAT.predict_proba(X_test.astype(str))[:,1]
pred_hist_test = model_Hist.predict_proba(X_test)[:,1]
pred_Ada_test = model_Ada.predict_proba(X_test_scaled)[:,1]

X_test_meta_FE_TrainOri = pd.DataFrame({'lgbm': pred_lgbm_test, 'xgb': pred_xgb_test,'cat': pred_cat_test, 'hist': pred_hist_test, 'ada': pred_Ada_test})




In [50]:
X_test_CAT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90954 entries, 0 to 90953
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   Product_ID             90954 non-null  category
 1   Type                   90954 non-null  category
 2   Air_temperature_K      90954 non-null  float64 
 3   Process_temperature_K  90954 non-null  float64 
 4   Rotational_speed_rpm   90954 non-null  int64   
 5   Torque_Nm              90954 non-null  float64 
 6   Tool_wear_min          90954 non-null  int64   
 7   TWF                    90954 non-null  int64   
 8   HDF                    90954 non-null  int64   
 9   PWF                    90954 non-null  int64   
 10  OSF                    90954 non-null  int64   
 11  RNF                    90954 non-null  int64   
dtypes: category(2), float64(3), int64(7)
memory usage: 7.5 MB


In [51]:
# Obtener las predicciones
pred_lgbm = model_LGBM.predict_proba(X_train)[:,1]
pred_xgb = model_XGB.predict_proba(X_train)[:,1]
pred_cat = model_CAT.predict_proba(X_train_CAT)[:,1]


X_train_meta_FE = pd.DataFrame({'lgbm': pred_lgbm, 'xgb': pred_xgb, 'cat': pred_cat, 'target': y_train})

pred_lgbm_test = model_LGBM.predict_proba(X_test)[:,1]
pred_xgb_test = model_XGB.predict_proba(X_test)[:,1]
pred_cat_test = model_CAT.predict_proba(X_test_CAT)[:,1]

X_test_meta_FE_TrainOri = pd.DataFrame({'lgbm': pred_lgbm_test, 'xgb': pred_xgb_test,'cat': pred_cat_test})

In [52]:
X_test_meta_FE_TrainOri.to_csv('results/X_test_meta_TrainOri.csv', index=False)
X_train_meta_FE.to_csv('results/X_train_meta_TrainOri.csv', index=False)

## Hyper-parameter tunning for meta model

In [53]:
X_test_meta = X_test_meta_FE_TrainOri

In [54]:
y_train_m = X_train_meta_FE['target']
X_train_m = X_train_meta_FE.drop(['target'], axis=1)

In [98]:
# Libraries
import lightgbm as lgb
# hp 
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold

space ={
    'max_depth': hp.quniform('max_depth', 6, 18, 1),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.4, 1.0),
    'n_estimators': hp.quniform('n_estimators', 100, 400, 10),
    'min_child_samples': hp.quniform('min_child_samples', 20, 100, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.15, 0.5),
    'reg_lambda': hp.uniform('reg_lambda', 0.3, 1.0)
}



def objective(space):
    model_LGBM_meta = lgb.LGBMClassifier(max_depth = int(space['max_depth']),
                             learning_rate = space['learning_rate'],
                             num_leaves = int(space['num_leaves']),
                             n_estimators = int(space['n_estimators']),
                             colsample_bytree = space['colsample_bytree'],
                             subsample = space['subsample'],
                             is_unbalance = True,
                             min_child_samples = int(space['min_child_samples']),
                             reg_alpha = space['reg_alpha'],
                             reg_lambda = space['reg_lambda'])
    
    cv = StratifiedKFold(n_splits=10)
    y_pred_proba = cross_val_predict(model_LGBM_meta, X_train_m, y_train_m, cv=cv, method='predict_proba')[:,1]
    auc = roc_auc_score(y_train_m, y_pred_proba)
    return {'loss': -auc, 'status': STATUS_OK}

# Run the algorithm
trials = Trials()
best_LGBM_meta = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Best: ", best_LGBM_meta)

100%|██████████| 50/50 [10:47<00:00, 12.95s/trial, best loss: -0.9992910628005761]
Best:  {'colsample_bytree': 0.09081575196369268, 'learning_rate': 0.09832401564492325, 'max_depth': 16.0, 'min_child_samples': 21.0, 'n_estimators': 390.0, 'num_leaves': 21.0, 'reg_alpha': 0.36709509155020276, 'reg_lambda': 0.5389143108424922, 'subsample': 0.7446113262171848}


## Fitting

In [55]:
Best =  {'colsample_bytree': 0.09081575196369268,
         'learning_rate': 0.09832401564492325,
         'max_depth': 16, 
         'min_child_samples': 21, 
         'n_estimators': 390, 
         'num_leaves': 21, 
         'reg_alpha': 0.36709509155020276, 
         'reg_lambda': 0.5389143108424922, 
         'subsample': 0.7446113262171848} # best loss: -0.9992910628005761


In [56]:
Best_meta_LGBM =  {'colsample_bytree': 0.22071462239650944, 
                'learning_rate': 0.0071184882183990906, 
                'max_depth': 12, 
                'min_child_samples': 91, 
                'n_estimators': 180, 
                'num_leaves': 20, 
                'reg_alpha': 0.34357926147986606, 
                'reg_lambda': 0.9332514748078644, 
                'subsample': 0.9562001151634876} # 0.988

best_LGBM_meta = {'colsample_bytree': 0.026181323096692063, 
                  'learning_rate': 0.04207644660879477, 
                  'max_depth': 6, 
                  'min_child_samples': 26, 
                  'n_estimators': 190, 
                  'num_leaves': 21, 
                  'reg_alpha': 0.4910393820375527, 
                  'reg_lambda': 0.6246168616161446, 
                  'subsample': 0.41742010255980644} # 0.9923355097281873

In [57]:
model_meta_LGBM = lgb.LGBMClassifier(**Best)

AUC_META_LGBM_tunned = ROC_AUC(model_meta_LGBM, X_train_m, y_train_m)
print("AUC META LGBM tunned: ", AUC_META_LGBM_tunned)

AUC META LGBM tunned:  0.9894180616922632


In [58]:
model_meta_LGBM.fit(X_train_m, y_train_m)

In [61]:
y_pred = model_meta_LGBM.predict_proba(X_test_meta)[:,1]

# Stacking Classifier

In [67]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
import numpy as np

# Crear el transformador personalizado
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = X_copy[col].astype('category')
        return X_copy

# Asignar los datos de entrada
X = X_train
y = y_train

# Crear un conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)


# Columnas que necesitan ser transformadas para model_CAT
categorical_features = ['Product_ID', 'Type']
categorical_transformer = CategoricalTransformer(categorical_features)

# Incluir el transformador en un Pipeline con model_CAT
model_CAT = CatBoostClassifier()
model_CAT = Pipeline(steps=[('categorical_transformer', categorical_transformer),
                            ('classifier', model_CAT)])

# Ahora, model_CAT incluirá el preprocesamiento necesario y puedes usarlo en tu modelo de stacking como antes
base_models = [
    ('LGBM', model_LGBM),
    ('XGB', model_XGB),
    ('CatBoost', model_CAT)  
]

# Definir el metamodelo
meta_model = LogisticRegression()

# Definir el modelo de stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Definir el procedimiento de validación cruzada estratificada
cv = StratifiedKFold(n_splits=5)

# Realizar la validación cruzada
n_scores = cross_val_score(stacking_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Reportar el rendimiento del modelo
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import VotingClassifier

# Supongamos que ya tienes tres modelos entrenados: model1, model2, model_CAT
# model1 y model2 están entrenados en X1 y model_CAT está entrenado en X2

# Crear un clasificador de votación
voting_model = VotingClassifier(estimators=[('model1', model_LGBM), ('model2', model_XGB), ('model_CAT', model_CAT)], voting='soft')

# Entrenar el clasificador de votación
# Nota: Esto asume que tienes correspondencias uno a uno entre las filas de X1, X2, y y
# es decir, la i-ésima fila de X1, la i-ésima fila de X2 y el i-ésimo elemento de y todos corresponden a la misma observación
voting_model.fit([X_train, X_train_CAT], y_train)

# Ahora puedes hacer predicciones con el modelo de votación
y_pred = voting_model.predict([X_test, X_test_CAT])


In [73]:
X_train.shape

(146429, 14)

In [74]:
X_train_CAT.shape

(146429, 13)

In [86]:
X_train_CAT.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146429 entries, 0 to 146428
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   Product_ID             146429 non-null  category
 1   Type                   146429 non-null  category
 2   Air_temperature_K      146429 non-null  float64 
 3   Process_temperature_K  146429 non-null  float64 
 4   Rotational_speed_rpm   146429 non-null  int64   
 5   Torque_Nm              146429 non-null  float64 
 6   Tool_wear_min          146429 non-null  int64   
 7   Machine_failure        146429 non-null  int64   
 8   TWF                    146429 non-null  int64   
 9   HDF                    146429 non-null  int64   
 10  PWF                    146429 non-null  int64   
 11  OSF                    146429 non-null  int64   
 12  RNF                    146429 non-null  int64   
dtypes: category(2), float64(3), int64(8)
memory usage: 13.0 MB


In [104]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146429 entries, 0 to 146428
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Product_ID             146429 non-null  int64  
 1   Air_temperature_K      146429 non-null  float64
 2   Process_temperature_K  146429 non-null  float64
 3   Rotational_speed_rpm   146429 non-null  int64  
 4   Torque_Nm              146429 non-null  float64
 5   Tool_wear_min          146429 non-null  int64  
 6   TWF                    146429 non-null  int64  
 7   HDF                    146429 non-null  int64  
 8   PWF                    146429 non-null  int64  
 9   OSF                    146429 non-null  int64  
 10  RNF                    146429 non-null  int64  
 11  Type_H                 146429 non-null  int64  
 12  Type_L                 146429 non-null  int64  
 13  Type_M                 146429 non-null  int64  
dtypes: float64(3), int64(11)
memory usag

In [107]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90954 entries, 0 to 90953
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Product_ID             90954 non-null  int64  
 1   Air_temperature_K      90954 non-null  float64
 2   Process_temperature_K  90954 non-null  float64
 3   Rotational_speed_rpm   90954 non-null  int64  
 4   Torque_Nm              90954 non-null  float64
 5   Tool_wear_min          90954 non-null  int64  
 6   TWF                    90954 non-null  int64  
 7   HDF                    90954 non-null  int64  
 8   PWF                    90954 non-null  int64  
 9   OSF                    90954 non-null  int64  
 10  RNF                    90954 non-null  int64  
 11  Type_H                 90954 non-null  int64  
 12  Type_L                 90954 non-null  int64  
 13  Type_M                 90954 non-null  int64  
dtypes: float64(3), int64(11)
memory usage: 9.7 MB


In [108]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import numpy as np
from sklearn.base import clone

# Supongamos que ya tienes tres modelos entrenados: model1, model2, model_CAT
# model1 y model2 están entrenados en X1 y model_CAT está entrenado en X2

# Crear un clasificador de votación
voting_model = VotingClassifier(estimators=[('model1', model_LGBM), ('model2', model_XGB), ('model_CAT', model_CAT)], voting='soft')

# Predecir con los modelos base y preparar los datos para el metamodelo
X_meta_train = pd.DataFrame({
    'model1': model_LGBM.predict_proba(X_train)[:,1],
    'model2': model_XGB.predict_proba(X_train)[:,1],
    'model_CAT': model_CAT.predict_proba(X_train_CAT)[:,1]
})

X_meta_test = pd.DataFrame({
    'model1': model_LGBM.predict_proba(X_test)[:,1],
    'model2': model_XGB.predict_proba(X_test)[:,1],
    'model_CAT': model_CAT.predict_proba(X_test_CAT)[:,1]
})

# Entrenar el metamodelo
meta_model = LogisticRegression()
meta_model.fit(X_meta_train, y_train)

# Ahora puedes hacer predicciones con el metamodelo
y_pred = meta_model.predict(X_meta_test)


In [110]:
from sklearn.metrics import roc_auc_score

# Primero, obtén las probabilidades de la clase positiva con el método `predict_proba`
y_pred_proba = meta_model.predict_proba(X_meta_train)[:, 1]

# Ahora calcula la métrica ROC AUC
roc_auc = roc_auc_score(y_train, y_pred_proba)

print('ROC AUC: %.3f' % roc_auc)


ROC AUC: 0.992


# Results analysis

In [66]:
model = model_meta_LGBM

In [94]:
y_pred_analysis = model.predict_proba(X_train)[:,1]

ValueError: Number of features of the model must match the input. Model n_features_ is 5 and input n_features is 14

In [38]:
X_train_analysis = X_train.copy()
X_train_analysis['Machine_failure'] = y_train
X_train_analysis['Machine_failure_pred'] = y_pred_analysis
X_train_analysis['Error'] = X_train_analysis['Machine_failure'] - X_train_analysis['Machine_failure_pred']

In [41]:
X_train_analysis.to_csv('data/X_train_analysis.csv', index=False)

# Submission

In [111]:
X_test_submission = pd.read_csv('input/test.csv') # Test data

In [112]:
# Submission to a file
# Rounded pred increases the score
submission = pd.DataFrame({'id': X_test_submission['id'], 'Machine failure': y_pred})
submission.to_csv('submission.csv', index=False)