# Modeling
XGBoost and LightGBM


## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
from sklearn.metrics import classification_report, confusion_matrix
pd.set_option("display.max_columns", None)

In [None]:
# Read the processed HR dataset from its path relative to this notebook,
# then preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/processed/HRDataset_p_v4.csv')
df.head()

In [None]:
# Columns routed through the categorical branch of the preprocessor.
categorical_features = [
    # identity / demographic columns
    'GenderID', 'FromDiversityJobFairID', 'State',
    'CitizenDesc', 'HispanicLatino', 'RaceDesc',
    # organisational columns
    'Department', 'ManagerName', 'RecruitmentSource',
    # hire date components, treated as categories rather than magnitudes
    'HireYear', 'HireMonth',
]

# Columns routed through the numeric branch of the preprocessor.
numeric_features = [
    'Salary', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
    'DaysLateLast30', 'Absences', 'Age', 'NumberOfColleagues',
]

# Name of the target column the classifiers predict.
label = 'PerfScoreID'

### Split the Data

In [None]:
# Target vector: the label column shifted down by one
# (presumably so class ids start at 0 -- confirm against the raw label range).
y = df[label] - 1
# Feature matrix: every column except the label.
X = df.drop(columns=[label])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions alike in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


### Encode and Scale

In [None]:
# Numeric branch: mean-impute, then standardise.
# (The author notes the numeric columns currently have no missing values,
# so the imputer acts only as a safeguard.)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [None]:
# Shared cross-validation splitter: five shuffled folds with a fixed seed so
# every search below evaluates candidates on the same partitions.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## Model Training

In [None]:
# One pipeline per model family: the shared preprocessor followed by the
# estimator, keyed by a display name.
classifiers = {
    model_label: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])
    for model_label, estimator in (
        ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ('LightGBM', LGBMClassifier()),
    )
}

In [None]:
# Rebuild the pipeline registry with default-configured estimators; this
# assignment replaces any earlier `classifiers` mapping.
classifiers = dict(
    XGBoost=Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ]),
    LightGBM=Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier()),
    ]),
)

# Exhaustive search space for GridSearchCV. Keys carry the `classifier__`
# prefix so they address the estimator step inside each pipeline.
_common_grid = {
    'classifier__n_estimators': [10, 20, 30, 40, 50, 70, 100],  # number of trees
    'classifier__max_depth': [1, 3, 5, 7, 9],                   # tree depth
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1],       # shrinkage
}

param_grids = {
    # XGBoost searches only the shared axes (fresh list copies per model).
    'XGBoost': {key: list(values) for key, values in _common_grid.items()},
    # LightGBM additionally searches the minimum number of samples per leaf.
    'LightGBM': {
        **{key: list(values) for key, values in _common_grid.items()},
        'classifier__min_data_in_leaf': [1, 2, 3],
    },
}

# Candidate values for RandomizedSearchCV, drawn once from a seeded generator
# so the search space is reproducible (seed 42, like the other random steps
# in this file).
#
# Fix: the learning-rate candidates were previously built with
# `np.random.random((0, 1))`, which creates an array of shape (0, 1) and
# therefore an EMPTY candidate list. They are now 10 uniform draws from
# [0.01, 0.2), the same range the Optuna search in this file explores.
_search_rng = np.random.default_rng(42)

param_distributions = {
    'XGBoost': {
        # 10 integers drawn from [10, 100)
        'classifier__n_estimators': _search_rng.integers(10, 100, size=10).tolist(),
        # 10 integers drawn from [1, 7), i.e. depths 1..6
        'classifier__max_depth': _search_rng.integers(1, 7, size=10).tolist(),
        # 10 floats drawn from [0.01, 0.2)
        'classifier__learning_rate': _search_rng.uniform(0.01, 0.2, size=10).tolist(),
    },
    'LightGBM': {
        # 10 integers drawn from [10, 100)
        'classifier__n_estimators': _search_rng.integers(10, 100, size=10).tolist(),
        # 10 integers drawn from [1, 7), i.e. depths 1..6
        'classifier__max_depth': _search_rng.integers(1, 7, size=10).tolist(),
        # 10 floats drawn from [0.01, 0.2)
        'classifier__learning_rate': _search_rng.uniform(0.01, 0.2, size=10).tolist(),
        # minimum number of samples per leaf
        'classifier__min_data_in_leaf': [1, 2, 3],
    },
}


### No Tuning

In [None]:
# Baseline: fit every untuned pipeline on the training split and report its
# accuracy and per-class metrics on the held-out test split.
for name, model in classifiers.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    hit_rate = np.mean(y_pred == y_test)  # fraction of exact label matches
    print(f"{name} Accuracy: {hit_rate}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))

### Grid Search

In [None]:
# Grid-search optimisation: for each pipeline, evaluate every combination in
# its parameter grid with cross-validated accuracy, then score the best
# configuration on the held-out test split.
for name, model in classifiers.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='accuracy',
        cv=kfold,
        verbose=1,
        n_jobs=-1,            # use all available cores
        error_score='raise',  # surface fit failures instead of scoring them as NaN
    )
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ is the pipeline refit on
    # the full training split using the winning parameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Plot the confusion matrix as a heatmap.
    cm = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
    ax.set_title(name + " Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.show()

    print(f"{name} Accuracy: {np.mean(y_pred == y_test)}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    # Report the winning parameter dict (not the whole estimator repr), so the
    # printed value matches its "Best hyperparameters" label.
    print('Best hyperparameters are: '+str(grid_search.best_params_))

### Random Search

In [None]:
# Randomised-search optimisation: for each pipeline, sample `n_iter`
# parameter settings from its candidate lists, pick the best by
# cross-validated accuracy, then score it on the held-out test split.
for name, model in classifiers.items():
    random_search = RandomizedSearchCV(
        estimator=model,  # the pipeline being tuned (not the whole `classifiers` dict)
        param_distributions=param_distributions[name],
        n_iter=10,            # number of parameter settings to sample
        scoring='accuracy',
        cv=kfold,
        verbose=1,
        n_jobs=-1,            # use all available cores
        random_state=42,      # reproducible sampling of the candidates
        error_score='raise',  # surface fit failures instead of scoring them as NaN
    )
    # Fit and read results from THIS search object; using a search object left
    # over from another cell would silently report that cell's results.
    random_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ is the pipeline refit on
    # the full training split using the winning parameters.
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Plot the confusion matrix as a heatmap.
    cm = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
    ax.set_title(name + " Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.show()

    print(f"{name} Accuracy: {np.mean(y_pred == y_test)}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print('Best hyperparameters are: '+str(random_search.best_params_))

### Optuna

In [None]:
# Define the objective function for Optuna
def objective(trial, model_name):
    """Return the mean cross-validated accuracy for one Optuna trial.

    Hyperparameters are suggested through ``trial``, a fresh classifier is
    built from them, and it is scored on every fold produced by the
    module-level ``kfold`` splitter over ``X_train`` / ``y_train``. The
    module-level ``preprocessor`` is re-fitted on each fold's training part
    so no validation rows leak into imputation, scaling or encoding.

    Args:
        trial: Optuna trial used to sample hyperparameters. The suggested
            names (``n_estimators``, ``max_depth``, ``learning_rate`` and,
            for LightGBM, ``min_data_in_leaf``) are valid constructor keyword
            arguments, so ``study.best_params`` can be unpacked into the
            classifier directly.
        model_name: ``'XGBoost'`` selects ``XGBClassifier``; any other value
            selects ``LGBMClassifier``.

    Returns:
        Mean accuracy over the validation folds (higher is better).
    """
    # Hyperparameters shared by both model families. `suggest_float(log=True)`
    # replaces the deprecated `suggest_loguniform` with identical sampling.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
    }

    if model_name == 'XGBoost':
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss')
    else:  # LightGBM
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 15)
        model = LGBMClassifier(**params)

    # Perform k-fold cross-validation
    scores = []
    for train_index, valid_index in kfold.split(X_train):
        X_fold_train, X_fold_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_fold_train, y_fold_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        # Fit the preprocessing on the training part only, then apply it to
        # the validation part.
        X_fold_train_processed = preprocessor.fit_transform(X_fold_train)
        X_fold_valid_processed = preprocessor.transform(X_fold_valid)

        model.fit(X_fold_train_processed, y_fold_train)
        y_valid_pred = model.predict(X_fold_valid_processed)
        scores.append(accuracy_score(y_fold_valid, y_valid_pred))

    return np.mean(scores)

# Tune each model family with Optuna, then refit the winner on the whole
# training split and evaluate it on the held-out test split.
for model_name in ('XGBoost', 'LightGBM'):
    print(f"Starting Optuna optimization for {model_name}...")
    study = optuna.create_study(direction='maximize')
    # The lambda is invoked inside this iteration, so it sees the current model_name.
    study.optimize(lambda trial: objective(trial, model_name), n_trials=30)

    # Report the winning configuration and its cross-validated score.
    tuned_params = study.best_params
    print(f"Best hyperparameters for {model_name}: {tuned_params}")
    print(f"Best cross-validated score for {model_name}: {study.best_value}")

    # Rebuild the estimator from the tuned parameters.
    best_model = (
        XGBClassifier(**tuned_params, use_label_encoder=False, eval_metric='logloss')
        if model_name == 'XGBoost'
        else LGBMClassifier(**tuned_params)
    )

    # Fit preprocessing and model on the full training data.
    X_train_processed = preprocessor.fit_transform(X_train)
    best_model.fit(X_train_processed, y_train)

    # Apply the already-fitted preprocessing to the test set and predict.
    X_test_processed = preprocessor.transform(X_test)
    y_pred = best_model.predict(X_test_processed)

    # Confusion matrix rendered as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
    ax.set(
        title=f"{model_name} Confusion Matrix",
        xlabel="Predicted Label",
        ylabel="True Label",
    )
    plt.show()

    # Test-set metrics.
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Classification Report:\n{class_report}")