## Part 1: Preprocessing

In [45]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [46]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [47]:
# Create y_df with the Attrition and Department columns
y_df=attrition_df[['Attrition','Department']]
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [48]:
# Create a list of at least 10 column names to use as X data
columns = ['Age', 'Education', 'JobSatisfaction', 'OverTime', 'DistanceFromHome', 'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked']


# Create X_df using your selected columns
X_df = attrition_df[columns]
display(X_df.head())

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,Age,Education,JobSatisfaction,OverTime,DistanceFromHome,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
0,41,2,4,Yes,1,0,1,6,0,8
1,49,1,2,No,8,1,3,10,1,1
2,37,2,3,Yes,2,0,3,0,0,6
3,33,4,3,Yes,3,0,3,8,3,1
4,27,1,2,No,2,1,3,2,2,9


Unnamed: 0,0
Age,int64
Education,int64
JobSatisfaction,int64
OverTime,object
DistanceFromHome,int64
StockOptionLevel,int64
WorkLifeBalance,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [49]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)


In [50]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_train['OverTime'].value_counts()

Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
No,780
Yes,322


In [51]:
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit(np.array(X_train['OverTime']).reshape(-1,1))
X_train['OverTime_ohe'] = ohe.transform(np.array(X_train['OverTime']).reshape(-1,1))
X_test['OverTime_ohe'] = ohe.transform(np.array(X_test['OverTime']).reshape(-1,1))

In [52]:
X_train

Unnamed: 0,Age,Education,JobSatisfaction,OverTime,DistanceFromHome,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_ohe
1343,29,3,1,No,7,0,3,3,1,3,0.0
1121,36,4,3,No,1,0,3,1,0,6,0.0
1048,34,3,1,No,3,0,3,13,3,3,0.0
1393,27,3,4,No,9,0,3,7,0,1,0.0
527,32,3,4,No,10,0,2,10,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1130,35,3,3,No,28,2,2,10,6,1,0.0
1294,41,3,2,No,5,0,1,3,1,3,0.0
860,22,4,4,Yes,3,1,3,0,0,0,1.0
1459,29,2,2,Yes,13,1,3,4,0,4,1.0


In [53]:
X_train = X_train.drop(columns=['OverTime'])
X_test = X_test.drop(columns=['OverTime'])
display(X_train.head())
display(X_test.head())

Unnamed: 0,Age,Education,JobSatisfaction,DistanceFromHome,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_ohe
1343,29,3,1,7,0,3,3,1,3,0.0
1121,36,4,3,1,0,3,1,0,6,0.0
1048,34,3,1,3,0,3,13,3,3,0.0
1393,27,3,4,9,0,3,7,0,1,0.0
527,32,3,4,10,0,2,10,0,1,0.0


Unnamed: 0,Age,Education,JobSatisfaction,DistanceFromHome,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_ohe
1041,28,3,1,5,0,3,5,1,0,0.0
184,53,2,1,13,2,3,4,1,1,0.0
1222,24,1,3,22,1,3,1,0,1,0.0
67,45,3,1,7,1,3,1,0,2,0.0
220,36,2,2,5,0,4,13,3,8,0.0


In [54]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [55]:
X_train_scaled, X_test_scaled

(array([[-0.85215857,  0.09401789, -1.56933068, ..., -0.36054841,
          0.13887164, -0.6425111 ],
        [-0.09308799,  1.05334858,  0.25109291, ..., -0.68226854,
          1.33136421, -0.6425111 ],
        [-0.3099653 ,  0.09401789, -1.56933068, ...,  0.28289183,
          0.13887164, -0.6425111 ],
        ...,
        [-1.61122915,  1.05334858,  1.1613047 , ..., -0.68226854,
         -1.05362093,  1.55639335],
        [-0.85215857, -0.8653128 , -0.65911888, ..., -0.68226854,
          0.53636916,  1.55639335],
        [ 1.42505317,  0.09401789,  0.25109291, ..., -0.68226854,
          0.53636916,  1.55639335]]),
 array([[-0.96059722,  0.09401789, -1.56933068, ..., -0.36054841,
         -1.05362093, -0.6425111 ],
        [ 1.75036913, -0.8653128 , -1.56933068, ..., -0.36054841,
         -0.65612341, -0.6425111 ],
        [-1.39435184, -1.8246435 ,  0.25109291, ..., -0.68226854,
         -0.65612341, -0.6425111 ],
        ...,
        [ 0.34066662, -0.8653128 ,  1.1613047 , ...,  

In [56]:
# Create a OneHotEncoder for the Department column
ohe_dept_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
y_train_dept_encoded = ohe_dept_encoder.fit_transform(np.array(y_train['Department']).reshape(-1,1))
y_test_dept_encoded = ohe_dept_encoder.transform(np.array(y_test['Department']).reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data

y_train_dept_encoded


array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [57]:
# Create a OneHotEncoder for the Attrition column
ohe_attrn_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
y_train_attrn_encoded = ohe_attrn_encoder.fit_transform(np.array(y_train['Attrition']).reshape(-1,1))
y_test_attrn_encoded = ohe_attrn_encoder.transform(np.array(y_test['Attrition']).reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data
y_train_attrn_encoded


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

## Create, Compile, and Train the Model

In [58]:
# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]
print(input_dim)

# Create the input layer
input_layer = layers.Input(shape=(input_dim,))

# Create at least two shared layers
shared_layer1 = layers.Dense(units=64, activation='relu',name='shared_layer1')(input_layer)
shared_layer2 = layers.Dense(units=128, activation='relu',name='shared_layer2')(shared_layer1)

10


In [59]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
departement_hidden_layer = layers.Dense(units=32, activation='relu')(shared_layer2)

# Create the output layer
department_output_layer = layers.Dense(y_train_dept_encoded.shape[1],
                                       activation='softmax',
                                       name='department_output')(departement_hidden_layer)


In [60]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden_layer = layers.Dense(units=32, activation='relu')(shared_layer2)

# Create the output layer
attrition_output_layer = layers.Dense(y_train_attrn_encoded.shape[1],
                                       activation='sigmoid',
                                       name='attrition_output')(attrition_hidden_layer)


In [61]:
# Create the model
model = Model(inputs=input_layer,
              outputs=[department_output_layer, attrition_output_layer])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

In [62]:
# Train the model
result = model.fit(X_train_scaled,
                   {'department_output': y_train_dept_encoded, 'attrition_output': y_train_attrn_encoded},
                   epochs=100,
                   batch_size=32,
                   validation_data=(X_test_scaled, {'department_output': y_test_dept_encoded,
                                                    'attrition_output': y_test_attrn_encoded}), verbose=1)


Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - attrition_output_accuracy: 0.8003 - department_output_accuracy: 0.6516 - loss: 1.4595 - val_attrition_output_accuracy: 0.8696 - val_department_output_accuracy: 0.6522 - val_loss: 1.1956
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8334 - department_output_accuracy: 0.6639 - loss: 1.2186 - val_attrition_output_accuracy: 0.8696 - val_department_output_accuracy: 0.6522 - val_loss: 1.1763
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8280 - department_output_accuracy: 0.6362 - loss: 1.1805 - val_attrition_output_accuracy: 0.8696 - val_department_output_accuracy: 0.6522 - val_loss: 1.1732
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_accuracy: 0.8461 - department_output_accuracy: 0.6441 - loss: 1.1039 - val_a

In [63]:
# Evaluate the model with the testing data
evaluation = model.evaluate(x=X_test_scaled,
 y={'department_output': y_test_dept_encoded,
    'attrition_output': y_test_attrn_encoded}, verbose=1)

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8134 - department_output_accuracy: 0.5216 - loss: 4.3071 


In [64]:
# Display the evaluation results
evaluation.append(np.mean(result.history['loss']))
evaluation.append(np.mean(result.history['department_output_accuracy']))
evaluation.append(np.mean(result.history['attrition_output_accuracy']))
display(evaluation)

[4.073868274688721,
 0.823369562625885,
 0.54076087474823,
 0.4529895916208625,
 0.8733575332164765,
 0.9487295842170715]

In [65]:
# Extract specific metrics from the `result` object
loss = result.history['loss'][-1]
department_accuracy = result.history['department_output_accuracy'][-1]
attrition_accuracy = result.history['attrition_output_accuracy'][-1]
display(loss)
display(department_accuracy)
display(attrition_accuracy)

0.045184411108493805

0.9972776770591736

0.9990925788879395

In [66]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {evaluation[2]}")
print(f"Attrition Accuracy: {evaluation[1]}")

Department Accuracy: 0.54076087474823
Attrition Accuracy: 0.823369562625885


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy is not the best metric to use on this data as it does not account for class imbalances. The number of employees leaving the company or attrition is greater than employees that stay, accuracy can be inaccurate. We have to use alternative metrics such as balanced accuracy, precision, recall, or F1 score that can provide more insight into the model performance on each class, particularly, the minority class. These metrics take into consideration the true positive, false positive and false negative rates; resulting in more comprehensive view of the model's effectiveness.

2. Attrition_output is a binary classification problem, sigmoid activation function is used. It outputs a value between 0 and 1, which can be interpreted as probability of the positive class. For Department_output which is a multi-class classification problem, softmax activation function is preferred. Softmax outputs a probability distribution across multiple classes, ensuring that the sum of the probabilities equals 1. This makes it suitable for problems where each instance belongs to one of several classes. In this model, these activation functions enable better decision-making based off the predicted probabilities.

3. For improvements in this HR model we can use the following:
a. Data Collection: Gather more comprehensive data, including employee feedback from surveys, exit interviews, and historical data on promotions and transfers. This can help identify patterns related to attrition and departmental fit.

b. Hyperparameter Tuning: Use techniques like grid search or random search to optimize the hyperparameters of the model, which can lead to better performance.

c. Class Imbalance Techniques: If the dataset is imbalanced (e.g., significantly more employees staying than leaving), consider using techniques like oversampling the minority class, undersampling the majority class, or applying algorithms that are robust to class imbalance.

d. Regularization: Apply regularization techniques (like L1 or L2 regularization) to reduce overfitting, especially if the model has a large number of features.

e.Feedback Loop: Establish a feedback mechanism to continuously update the model with new data and insights as employee behaviors and organizational dynamics change over time.

f. Model Selection: Experiment with different algorithms, such as decision trees, random forests, gradient boosting machines, or neural networks, to find the best fit for the data.

g. Cross-Validation: Implement k-fold cross-validation to ensure that the model's performance is consistent across different subsets of the data and to avoid overfitting.

h. Ensemble Methods: Combine predictions from multiple models (e.g., using bagging or boosting techniques) to improve overall predictive performance.

By implementing these strategies, the HR model can become more accurate and effective in predicting employee attrition and identifying the best-fit department for each employee.