In [None]:
import pandas as pd

# Read the training and test splits from the local ./dataset directory.
train_data = pd.read_csv("./dataset/train.csv")
test_data = pd.read_csv("./dataset/test.csv")

# Quick sanity check: show the first five training rows.
# (For a per-column missing-value count, use train_data.isnull().sum().)
print(train_data.head())

# Data preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# Label columns to predict. The strings are used verbatim as column labels, so
# their spelling (including "K_Scatch") must match the CSV header.
# NOTE(review): confirm "K_Scatch" against train.csv before "fixing" it.
target_columns = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]

# Features are every remaining column except the first one.
# NOTE(review): the [1:] slice assumes the first column is the row identifier
# ("id", as dropped from the test set in the original code) - confirm.
feature_columns = [col for col in train_data.columns.tolist()[1:] if col not in target_columns]

train_Y = train_data.loc[:, target_columns]
train_X = train_data.loc[:, feature_columns]

# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaler = scaler.transform(train_X)

# Select the test features by name, using the same list and order the scaler
# was fitted on. Merely dropping "id" would pass through any extra or reordered
# column in test.csv, which the fitted scaler would reject or mis-align.
test_X_scaler = scaler.transform(test_data.loc[:, feature_columns])

# Grid search

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


def _roc_auc_scorer(estimator, X, y):
    """Score a fitted estimator with ROC AUC computed from its predicted probabilities.

    Uses the (estimator, X, y) callable-scorer signature accepted by
    GridSearchCV, and makes the same ``roc_auc_score(predict_proba)`` call that
    the cross-validation cell of this notebook uses, so both cells rank models
    by the same metric.
    """
    return roc_auc_score(y_true=y, y_score=estimator.predict_proba(X))


# Hold out 30% of the training data; the grid search only ever sees X_train.
X_train, X_test, y_train, y_test = train_test_split(train_X_scaler, train_Y.to_numpy(), test_size=0.3, random_state=10)

# Define the model
xgb = XGBClassifier()

# Define the parameter grid
# 3 values for each of 5 parameters = 243 combinations; with cv=5 that is
# 1215 model fits (plus the final refit), so this cell is expensive to run.
param_grid = {
    'n_estimators': [1500, 2000, 2500],
    'learning_rate': [0.003, 0.006, 0.009],
    'max_depth': [3, 5, 7],
    'subsample': [0.4, 0.7, 1.0],
    'colsample_bytree': [0.35, 0.40, 0.45]
}

# Configure Grid Search
# Without an explicit `scoring`, GridSearchCV falls back to the estimator's
# default `score` method, which is not the ROC AUC this notebook reports
# elsewhere. Rank candidates by ROC AUC instead.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=_roc_auc_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Execute Grid Search
grid_search.fit(X_train, y_train)

# Best parameters and score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best score achieved: {grid_search.best_score_}")

# Evaluate the refitted best model on the hold-out split, which was previously
# created but never used.
print(f"Hold-out ROC AUC: {grid_search.score(X_test, y_test)}")


# Model building

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
import numpy as np

class Model:
    """5-fold cross-validation harness around ``XGBClassifier``.

    Attributes:
        x: Feature matrix; rows are selected by integer position.
        y: Label array aligned row-for-row with ``x``.
        k_fold: Shuffled 5-way ``KFold`` splitter with a fixed seed.
        models: Fitted classifiers from the most recent ``fit`` call, one per fold.
        val_scores: Validation ROC AUC of each fold from the most recent ``fit`` call.
    """

    def __init__(self, x, y):
        """Store the data and set up the fold splitter.

        Args:
            x: Features, as a NumPy array or a pandas DataFrame.
            y: Labels, as a NumPy array or a pandas DataFrame/Series.
        """
        # fit() selects rows with integer-position arrays (x[idx]); that does not
        # select rows on a DataFrame, so pandas inputs are converted once up front.
        self.x = self._to_positional(x)
        self.y = self._to_positional(y)
        self.k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
        self.models = []
        self.val_scores = []

    @staticmethod
    def _to_positional(data):
        """Return pandas objects as NumPy arrays; pass anything else through unchanged."""
        return data.to_numpy() if hasattr(data, "to_numpy") else data

    def fit(self, kwargs):
        """Train one classifier per fold and print its train/validation ROC AUC.

        The fitted models and their validation scores are kept on the instance
        (``self.models`` / ``self.val_scores``) so they remain usable after the
        call; both lists are reset at the start of every call.

        Args:
            kwargs: Dict of keyword arguments forwarded to ``XGBClassifier``.

        Returns:
            None.
        """
        self.models = []
        self.val_scores = []

        for fold, (train_idx, val_idx) in enumerate(self.k_fold.split(self.x)):
            fold_train_x, fold_train_y = self.x[train_idx], self.y[train_idx]
            fold_val_x, fold_val_y = self.x[val_idx], self.y[val_idx]

            # A fresh model per fold, so no state carries over between folds.
            model = XGBClassifier(**kwargs)
            model.fit(fold_train_x, fold_train_y)

            train_roc = roc_auc_score(y_true=fold_train_y, y_score=model.predict_proba(fold_train_x))
            val_roc = roc_auc_score(y_true=fold_val_y, y_score=model.predict_proba(fold_val_x))
            print("Fold:",fold, " Train ROC:", np.round(train_roc,5), " Val ROC:",np.round(val_roc,5))

            self.models.append(model)
            self.val_scores.append(val_roc)

# Hyper-parameters forwarded unchanged to the XGBClassifier built for each fold.
params = {
    "booster": "gbtree",
    "verbosity": 0,
    "max_depth": 5,
    "subsample": 0.7,
    "reg_alpha": 0.54,
    "random_state": 18,
    "n_estimators": 2000,
    "gamma": 0.44,
    "min_child_weight": 4,
    "reg_lambda": 1e-5,
    "learning_rate": 0.006,
    "colsample_bytree": 0.38,
}

# Cross-validate on the full scaled training set, with labels as a NumPy array.
classifier = Model(train_X_scaler, train_Y.to_numpy())
classifier.fit(params)
