# Utility Clustering

A novel implementation of methodologies described in the following paper:
<br>
###### The Utility of Clustering in Prediction Tasks: Shubhendu Trivedi, Zachary A. Pardos and Neil T. Heffernan
###### https://ttic.uchicago.edu/~shubhendu/Papers/clustering_bagging.pdf
<br>
Briefly, the following steps will be performed:
<br>
<br>
1) Create sample with c designated clusters
<br>
2) Perform kmeans clustering for k ∈ {1, 2, ..., K}
<br>
3) For each cluster set, fit k models (one model for each cluster), ultimately creating a "cluster model"
<br>
4) Obtain predictions for a test set for the K cluster models
<br>
5) Ensemble predictions for final prediction

###### 1) Load and preprocess HR data

In [1]:
from dfply import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas_profiling import profile_report
from matplotlib import pyplot as plt
from functools import reduce

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import data
hr = pd.read_csv("/Users/mfairb/Documents/ML Projects/Project - HR Analytics/hr_analytics.csv")

# Keep numeric only for simplicity, reorder, drop 'employee_id'
X = hr._get_numeric_data() >> drop(X.is_promoted, X.previous_year_rating)
y = hr['is_promoted']

# Scale predictors
scaler = StandardScaler()
col_names = list(X.columns)
col_names.remove('employee_id')
X[col_names] = scaler.fit_transform(X[col_names])

# Create train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# Check for null values
X_train.apply( lambda x: sum( x.isnull() ) )

# View X_train
#X_train

employee_id           0
no_of_trainings       0
age                   0
length_of_service     0
KPIs_met >80%         0
awards_won?           0
avg_training_score    0
dtype: int64

###### 2) Perform kmeans clustering for k ∈ {1, 2, ..., K}

In [46]:
# Import packages
from numpy import unique, where
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score

# Create K models
k_labels = pd.DataFrame()
K = 8
for i in range(1, K):
    s_i = str(i)
    exec(f'k{s_i} = KMeans(n_clusters = {s_i}, init = "random")') # Init. model
    exec(f'k{s_i}.fit(X_train >> drop(X.employee_id, contains("k")))') # Fit model
    exec(f'k{s_i}_yhat = k{s_i}.predict(X_train >> drop(X.employee_id, contains("k")))') # Get cluster labels
    exec(f'X_train["k{s_i}"] = k{s_i}_yhat') # Add to k_labels df
    exec(f'k{s_i}_clusters = unique(k{s_i}_yhat)') # Get unique clusters
    
# cbind X_train and k_labels
X_train = pd.concat([X_train.reset_index(drop=True), k_labels.reset_index(drop=True)], axis = 1)
y_train = y_train.reset_index(drop=True)

We now have all kmeans models and labels have been added to X_train

###### 3) For each cluster set, fit k models (one model for each cluster), ultimately creating a "cluster model"

Create training sets and logistic regression models

In [53]:
from sklearn.linear_model import LogisticRegression

final_probs = pd.DataFrame()

# Loop over K cluster sets
for k in range(1, K):
    
    str_k = str(k)
    
    # Loop though individual clusters in the cluster set
    for l in range(0, k):

        # Convert indices to str for exec statements
        str_l = str(l)
        indices = str_k + str_l
        X_indices = 'X' + indices
        y_indices = 'y' + indices

        # Create Xkl & ykl
        exec(f'{X_indices}= X_train[X_train.k{str_k} == {str_l}]')
        exec(f'{y_indices}= y_train[X_train.k{str_k} == {str_l}]')
        
        # Drop cluster label columns and employee_id
        exec(f'{X_indices}={X_indices} >> drop(contains("k"), X.employee_id)')
        
        # Fit logistic regression (logkl)
        exec(f'log{indices} = LogisticRegression().fit({X_indices}, {y_indices})')

We now have all X_kl training sets and log_kl regression models

###### 4) Get cluster lables and obtain predictions for a test set for the K cluster models

In [67]:
# Loop over K cluster sest
for k in range(1, K):
    
    str_k = str(k)
    
    # Create X_testk (to get all predictions)
    exec(f'X_test{str_k} = pd.DataFrame()') # Create X_testk

         
    # Get cluster labels for X_test and add them to X_test (call it hold)
    exec(f'labels = pd.DataFrame(k{str_k}.predict(X_test >> drop(X.employee_id)), columns = ["label"])')
    hold = pd.concat([X_test.reset_index(drop = True), labels], axis = 1)
         
    # Loop over clusters in cluster set
    for l in range(0, k):
        
       # Convert indices to str for exec statements
        str_l = str(l)
        indices = str_k + str_l
        X_test_indices = 'X_test_' + indices
        y_test_indices = 'y_test' + indices

        # Create X_test
        exec(f'{X_test_indices} = hold[hold.label == {str_l}]')
        exec(f'{X_test_indices} = {X_test_indices}.reset_index(drop = True)')        
        
        # Get predictions from respective models and attach to employee_id
        exec(f'{X_test_indices}["predicted_prob"] = pd.Series(log{indices}.predict_proba({X_test_indices} >> drop("employee_id", "label"))[:,1])')
        
        # Append to X_testi
        exec(f'X_test{str_k} = X_test{str_k}.append({X_test_indices}) >> select("employee_id", "predicted_prob")')

We now have predicted probabilities from all 7 cluster set models.

###### 5) Ensemble predictions

In [70]:
from functools import reduce
from sklearn.metrics import roc_auc_score

# Create list of dfs to merge
tests = [X_test1, X_test2, X_test3, X_test4, X_test5, X_test6, X_test7]

# Merge using reduce(lambda...)) & change column names
all_tests = reduce(lambda  left,right: pd.merge(left,right,on=['employee_id'],
                                            how='left'), tests)
all_tests.columns = ['employee_id', 'X_test1', 'X_test2', 'X_test3', 'X_test4', 'X_test5', 'X_test6', 'X_test7']

# Calculate rowmeans
all_tests['final_prediction'] = all_tests.drop('employee_id', axis=1).apply(lambda x: x.mean(), axis = 1)

In [71]:
# Print AUC for each 
for i in range(1, 9):
    if i != 8:
        print('X_test' + str(i) + ' AUC = ' + str(roc_auc_score(y_test, all_tests.iloc[:,i])))
    else:
        print('Ensemble AUC = ' + str(roc_auc_score(y_test, all_tests.iloc[:,i])))

X_test1 AUC = 0.7962544209178521
X_test2 AUC = 0.794527748413664
X_test3 AUC = 0.8001164223698546
X_test4 AUC = 0.8024241653783255
X_test5 AUC = 0.8013704875969277
X_test6 AUC = 0.8018165916231269
X_test7 AUC = 0.8000985357382658
Ensemble AUC = 0.8028421735083305


With no tuning of individual or cluster models, our ensemble performs best with an AUC of 0.8028