# <center>IMPACT PROJECT - GESTAMP</center> 
## <center>Defect Detection using Machine Learning</center> 
### <center>Bayesian optimizer using Hyperopt</center>
<center>Group 14</center> 

<img 
    src="https://www.gestamp.com/getattachment/c8d61c0f-e752-4156-8002-97e21ab43a3f/Imag2-2" width="2400" height="1000" align="center"/>

This notebook can be run on any of the 4 datasets.

## <center>Table of Contents</center>
1. [Split Dataset](#1)
2. [Hyperparameters Tuning: Hyperopt](#2)
3. [Model Training](#3)
4. [Model Testing and Evaluation](#4)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import sklearn
from hyperopt import hp
import hyperopt

In [None]:
# Path of the CSV file to analyse; point it at another file to run the
# notebook on a different dataset.
DATA_PATH = '/kaggle/input/binary-strat1-2/binary_strat2_le_ss.csv'
data = pd.read_csv(DATA_PATH)


<a id='1'>**Split Dataset**</a>

In [None]:
# Name of the label column that the classifier has to predict.
TARGET_COLUMN = 'Defect'

# y holds the label; X holds every remaining column as a feature.
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

In [None]:
# Step 1: hold out 15% of the rows as the test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)

# Step 2: split the remaining 85% into train and validation.
# 0.176 * 0.85 ~= 0.15, so the overall proportions are roughly 70/15/15.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.176,
    random_state=42,
    stratify=y_trainval,
)


<a id='2'>**Hyperparameters Tuning: Hyperopt**</a>

In [None]:
# Hyperopt search space for the XGBoost classifier.
# hp.quniform(label, low, high, q) samples in steps of q; hp.uniform samples
# continuously between low and high.
hyperparams = dict(
    # Searched in steps of 1 between 1 and 5.
    gamma=hp.quniform('gamma', 1, 5, 1),
    # Continuous value between 0.6 and 1.
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1),
    # Searched in steps of 1 between 0 and 10.
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    # Searched in steps of 1 between 1 and 100 (rounded to int in make_model).
    n_estimators=hp.quniform('n_estimators', 1, 100, 1),
    # Searched in steps of 1 between 1 and 10 (rounded to int in make_model).
    max_depth=hp.quniform('max_depth', 1, 10, 1),
    # Continuous value between 0.01 and 0.1.
    learning_rate=hp.uniform('learning_rate', 0.01, 0.1),
)

In [None]:
def make_model(hyperparams):
    """Build an (unfitted) XGBoost classifier from a hyperparameter dict.

    Parameters
    ----------
    hyperparams : dict
        Keyword arguments for ``xgb.XGBClassifier``. Must contain
        ``'max_depth'`` and ``'n_estimators'``; both are rounded to the
        nearest integer before being passed on. The dict itself is NOT
        modified.

    Returns
    -------
    xgb.XGBClassifier
        Classifier configured with the given hyperparameters plus the fixed
        settings below (binary logistic objective, histogram tree method,
        AUC evaluation metric, all cores, fixed seed).

    Raises
    ------
    KeyError
        If ``'max_depth'`` or ``'n_estimators'`` is missing.
    """
    # Work on a shallow copy so the caller's dict (e.g. the result returned by
    # hyperopt.fmin) is not silently rewritten by the rounding below.
    params = dict(hyperparams)
    for int_param in ('max_depth', 'n_estimators'):
        params[int_param] = round(params[int_param])

    return xgb.XGBClassifier(
        **params,
        objective='binary:logistic',
        tree_method='hist',
        enable_categorical=True,
        eval_metric='auc',
        # early_stopping_rounds=25,
        n_jobs=-1,
        seed=0,
    )


def objective(hyperparams):
    """Hyperopt objective: fit a model and score it on the validation set.

    Parameters
    ----------
    hyperparams : dict
        One sample from the search space, passed straight to ``make_model``.

    Returns
    -------
    dict
        ``{'loss': -AUC on (X_val, y_val), 'status': hyperopt.STATUS_OK}``.
        The AUC is negated because hyperopt minimises the loss.
    """
    model = make_model(hyperparams)
    # verbose=False: with an eval_set, fit() otherwise prints one metric line
    # per boosting round, which floods the output across all search trials.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Column 1 of predict_proba is the probability of the positive class.
    y_hat = model.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_hat)

    return {'loss': -auc_val, 'status': hyperopt.STATUS_OK}


In [None]:
# Keeps the record of every evaluated trial so it can be inspected afterwards.
trials = hyperopt.Trials()

# Tree-structured Parzen Estimator search over the `hyperparams` space.
# rstate seeds the sampler so a fresh "Restart & Run All" reproduces the same
# sequence of trials. NOTE: a numpy Generator is what hyperopt >= 0.2.7
# expects; older releases need np.random.RandomState(42) instead.
best_hyperparams = hyperopt.fmin(
    fn=objective,
    space=hyperparams,
    algo=hyperopt.tpe.suggest,
    max_evals=200,
    trials=trials,
    rstate=np.random.default_rng(42),
)

In [None]:
# Display the best hyperparameter values returned by hyperopt.fmin.
best_hyperparams

<a id='3'>**Model Training**</a>

In [None]:
# Rebuild the classifier with the best hyperparameters found by the search.
model = make_model(best_hyperparams)

# Fit on the training split; the validation split is only monitored.
validation_sets = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_sets)

In [None]:
# Bar chart of per-feature importance; 'weight' (the library default) is
# spelled out so the chart uses the same measure as the table that follows.
xgb.plot_importance(model, importance_type='weight')

In [None]:
# Per-feature 'weight' scores from the fitted booster, as {feature: score}.
importance_dict = model.get_booster().get_score(importance_type='weight')

# One row per feature, ordered from the highest score to the lowest.
importance_df = (
    pd.DataFrame(list(importance_dict.items()), columns=['Feature', 'Importance'])
    .sort_values(by='Importance', ascending=False)
)

# Plain-text dump of the table.
print(importance_df)

<a id='4'>**Model Testing and Evaluation**</a>

In [None]:
# Predicted probability of the positive class for each test row.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# ROC AUC on the held-out test split.
auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f"AUC: {auc:.4f}")