In [None]:
### ATTEMPT AT PREDICTIVE MODELING ###

#0. Import packages

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, get_scorer, make_scorer,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


# Set anything you might want to use later in the code, such as parameter grids, model specifications, etc.

### Set parameters ahead of time for scoring models ###
# Metric name -> scorer, in the dict form accepted by cross_validate / GridSearchCV `scoring`.
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision (Macro)': make_scorer(precision_score, average='macro'),
    'Recall (Macro)': make_scorer(recall_score, average='macro'),
    'F1 Score (Macro)': make_scorer(f1_score, average='macro'),
    # AUC has to be computed from continuous scores (predict_proba / decision_function).
    # make_scorer(roc_auc_score) would feed it the hard labels from predict(), so the
    # built-in 'roc_auc' scorer is used instead.
    'AUC': get_scorer('roc_auc')} # Can add others if needed

# Define the parameter grid for xgboost
# 2 * 3 * 2 * 2 * 2 = 48 candidate settings; each one is fitted once per CV fold.
param_grid_XGBoost = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9]}

# Initialize XGBoost model
# The package is imported as `xgboost` (no `xgb` alias exists in this notebook).
model_XGBoost = xgboost.XGBClassifier()

# Define parameter grid for logistic regression

In [None]:
#1. Data Collection and Preprocessing:

#Load the dataset

#Source: the IBM HR Dataset, https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
#It is free to download, publicly available, and seems to be the dataset of choice for this sort of endeavor
turnover_data = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-HR-Employee-Attrition.csv")
#Preview the first rows, then list the column names (newline-separated, same output as two prints)
print(turnover_data.head(), turnover_data.columns, sep="\n")

In [None]:
#Best Practice: Always split data at the beginning of the code to prevent any leakage

def split_data_train_val_test(data, DV_Name):
    """Split a DataFrame into train / validation / test features and targets (60% / 20% / 20%).

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, including the target column.
    DV_Name : str
        Name of the dependent-variable (target) column in ``data``.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` -- six objects, in that order.
    """
    X = data.drop(columns=[DV_Name])
    y = data[DV_Name]

    # Quick sanity check of what is being split, including the target's class balance
    print(data.head())
    print(data.columns)
    print(data[DV_Name].value_counts())

    # Hold out 20% of the data as the test set; the other 80% is a temporary set
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

    # Split the temporary set into training and validation sets
    # The training set is 75% of the previous 80% of the data (or 60% of the overall data)
    # The validation set is the remaining 20%, making it equivalent to the test set in size
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=9)

    # y_val and y_test are two separate return values (the original joined them into one undefined name)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
#This dataset is incredibly clean, but a real dataset will likely need one or more steps here for cleaning
#and for imputing missing values, if you choose to impute

In [None]:
#visualize data
def visualize_your_data(your_data):
    """Show one histogram (with a KDE overlay) for every column of ``your_data``.

    Parameters
    ----------
    your_data : pandas.DataFrame
        Frame whose columns are plotted one at a time, each in its own figure.
    """
    for column in your_data.columns:
        # One bin per distinct value; at least one so an all-missing column cannot ask for 0 bins
        n_bins = max(1, your_data[column].nunique())
        # Plot the frame that was passed in, not the notebook-level `turnover_data`
        sns.histplot(data=your_data, x=column, kde=True, bins=n_bins)
        plt.show()

In [None]:
### 2. Feature Selection: ##

#Correlation Analysis: Identify features that are highly correlated with turnover using correlation matrices or other statistical techniques.
def check_correlation_of_features(df):
    """Draw a lower-triangle heatmap of the pairwise correlations between the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. Only numeric and boolean columns enter the correlation matrix;
        text columns are left out, so encode them first if they should appear in the plot.
    """
    # Compute the correlation matrix on numeric/bool columns only.
    # A bare df.corr() raises on text columns in pandas >= 2.0.
    corr = df.select_dtypes(include=["number", "bool"]).corr()

    # Generate a mask for the upper triangle (the matrix is symmetric, so it is redundant)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Set color palette
    cmap = sns.diverging_palette(230, 20, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio, on the axes created above
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5,
                cbar_kws={"shrink": .5}, ax=ax)

    # Add a title
    plt.title('Correlation of Features', fontsize=16)

    # Label the color bar so it serves as a legend for the correlation values
    cbar = ax.collections[0].colorbar
    cbar.set_label('Correlation Coefficient')

    plt.show()

In [None]:
#4. Model Training and Evaluation: Logistic Regression

In [None]:
#4. Model Training and Evaluation: reusable grid-search/fit helper (XGBoost and other models)
def fit_model_and_hyperparameter_tune(ParameterGrid, ModelSpecification, Data_Splits_List, Use_Validation_Set = "Yes", nCV = 5):

    ''' Grid-search hyperparameters on the training set, refit the best configuration, and score it on the test set.

    The function is meant to be reusable with different models rather than tied to one estimator.

    Inputs:
    ParameterGrid: Specify a param_grid at the beginning of the code (or multiple if evaluating different models) to pass in for hyperparameter tuning
    ModelSpecification: An unfitted estimator INSTANCE (e.g. XGBClassifier(), LogisticRegression(...)), carrying any parameters you don't want to tune.
        It is cloned, so the object passed in is not fitted by the final training step.
    Data_Splits_List: [X_train, X_val, X_test, y_train, y_val, y_test]. A list seemed easier than 6 more parameters.
        When Use_Validation_Set is not "Yes", a four-item [X_train, X_test, y_train, y_test] list is accepted as well.
    Use_Validation_Set: "Yes" (default) to use train/val/test splits; any other value means train/test only.
    nCV: number of folds to use in the CV part of the grid search. Defaults to 5.

    Returns:
    The best model, fitted on the training set. (The function previously returned nothing, so existing callers are unaffected.)

    Raises:
    ValueError: Data_Splits_List does not have the number of splits required by Use_Validation_Set.
    NotImplementedError: no training procedure is defined for the model's type.
    '''

    use_validation = Use_Validation_Set == "Yes"

    # Unpack the splits up front. Every later step works on these local names,
    # never on notebook-level variables that happen to exist in the kernel.
    n_splits = len(Data_Splits_List)
    if n_splits == 6:
        X_train, X_val, X_test, y_train, y_val, y_test = Data_Splits_List
    elif n_splits == 4 and not use_validation:
        X_train, X_test, y_train, y_test = Data_Splits_List
        X_val = y_val = None
    elif use_validation:
        raise ValueError("Your data splits do not support the use of a train/validation/test split.\n\nMake sure you do this at the beginning of the code")
    else:
        raise ValueError(f"Data_Splits_List must hold 4 (train/test) or 6 (train/val/test) splits, got {n_splits}")

    # Initialize GridSearch using parameters passed into function
    grid_search = GridSearchCV(estimator=ModelSpecification, param_grid=ParameterGrid,
                               cv=nCV, n_jobs=-1, verbose=2, scoring='accuracy') #use all available cores: n_jobs = -1

    # Fit to the training data
    grid_search.fit(X_train, y_train)

    # Get the best parameters, show them to the world
    best_params = grid_search.best_params_
    print(f'Best Parameters: {best_params}')

    # ModelSpecification is an instance, so it cannot be called like a class:
    # take a fresh unfitted copy and apply the winning parameters to it.
    best_model = clone(ModelSpecification).set_params(**best_params)
    model_name = type(best_model).__name__

    # Models whose training is a plain fit(X, y) with no validation hook
    Standard_Model_Fit_Procedures = ('LogisticRegression','RandomForestClassifier','SVC', 'KNeighborsClassifier',
                                     'GaussianNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier',
                                     'Perceptron', 'SGDClassifier', 'RidgeClassifier', 'MLPClassifier',  # Multi-layer Perceptron
                                     'QuadraticDiscriminantAnalysis','AdaBoostClassifier')

    if model_name in Standard_Model_Fit_Procedures:
        best_model.fit(X_train, y_train)

    #other types of models come with the validation procedure built in, and additional parameter options
    elif model_name == 'XGBClassifier':
        if use_validation:
            # Early stopping is configured on the estimator; recent XGBoost releases no longer
            # accept early_stopping_rounds as a fit() argument. It needs an eval_set to monitor.
            best_model.set_params(early_stopping_rounds=10)
            best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        else:
            # No validation data -> nothing to monitor, so train without early stopping
            best_model.fit(X_train, y_train)
    elif model_name == 'Sequential':  # Assuming a Keras Sequential model
        # NOTE(review): EarlyStopping is not imported anywhere in the visible notebook; it has to come
        # from the Keras callbacks module. A bare Sequential model presumably cannot pass through
        # GridSearchCV/clone above without a scikit-learn wrapper -- confirm before relying on this branch.
        if use_validation:
            early_stopping = EarlyStopping(monitor='val_loss', patience=10)
            best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[early_stopping])
        else:
            # Without validation data there is no val_loss to watch, so monitor the training loss
            early_stopping = EarlyStopping(monitor='loss', patience=10)
            best_model.fit(X_train, y_train, epochs=100, callbacks=[early_stopping])
    else:
        # Stop here instead of printing and then failing later on an unfitted model
        raise NotImplementedError(f"Training procedure for {model_name} not defined")

    # Then evaluate on the test set
    test_predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f'Test Accuracy: {test_accuracy}')

    return best_model

In [None]:
#4. Model Training and Evaluation: RandomForests

In [None]:
def run_pipeline():
    """Run the notebook end to end: encode, split, plot correlations, then tune and score the XGBoost model.

    Reads the notebook-level ``turnover_data``, ``param_grid_XGBoost`` and ``model_XGBoost``.
    """
    target_name = 'Attrition'

    # The estimators need numeric inputs. The raw frame presumably holds text columns
    # (the target among them) -- TODO confirm against the CSV. One-hot encode text predictors
    # and turn the target into integer codes (categories are sorted, so a Yes/No target
    # presumably becomes No=0 / Yes=1). Indicator columns learn no statistics from the data,
    # so encoding before the split does not leak information.
    features = pd.get_dummies(turnover_data.drop(columns=[target_name]), dtype=float)
    target = turnover_data[target_name].astype('category').cat.codes
    model_data = features.assign(**{target_name: target})

    # The split returns six objects; y_val and y_test are separate
    X_train, X_val, X_test, y_train, y_val, y_test = split_data_train_val_test(model_data, DV_Name = target_name)
    #visualize_your_data(turnover_data)
    check_correlation_of_features(turnover_data)
    # Pass the grid and model defined in the configuration cell
    fit_model_and_hyperparameter_tune(param_grid_XGBoost, model_XGBoost, Data_Splits_List = [X_train, X_val, X_test, y_train, y_val, y_test], Use_Validation_Set = "Yes", nCV = 5)
run_pipeline()