## Part 1: Preprocessing

In [186]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Header modified to account for changes required for the later versions of keras and tensorflow
# from tensorflow.keras.models import Model
# from tensorflow.keras import layers

import os
import tensorflow as tf
from keras import Model
from keras import layers
os.environ["KERAS_BACKEND"] = "tensorflow"


from sklearn.preprocessing import OneHotEncoder
#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [207]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [188]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]].copy()
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [208]:
y_df.value_counts().unique

<bound method Series.unique of Attrition  Department            
No         Research & Development    828
           Sales                     354
Yes        Research & Development    133
           Sales                      92
No         Human Resources            51
Yes        Human Resources            12
Name: count, dtype: int64>

In [190]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df,y_df)

# Show results of the split
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1102, 11)
X_test shape: (368, 11)
y_train shape: (1102, 2)
y_test shape: (368, 2)


In [191]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

# Create an instance of the OHE
ohe = OneHotEncoder(sparse_output=False)

# Fit ohe and transform X_train
X_train_encoded = ohe.fit_transform(X_train[['JobRole']])
# Transform x_test
X_test_encoded = ohe.transform(X_test[['JobRole']])

# Get the column names from the ohe
encoded_cols = ohe.get_feature_names_out(['JobRole'])

# Convert the arrays to dataframes 
X_train_ohe_df = pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index)
X_test_ohe_df = pd.DataFrame(X_test_encoded, columns=encoded_cols, index=X_test.index)

# Drop the original JobRole column
X_train = X_train.drop('JobRole', axis=1)
X_test = X_test.drop('JobRole', axis=1)

# Check the results
X_train.head()

Unnamed: 0,DistanceFromHome,Education,JobSatisfaction,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,JobLevel,Age,PercentSalaryHike,HourlyRate
774,2,1,1,3,4,9,4,55,15,40
341,15,2,4,3,3,11,3,38,14,92
1124,6,3,3,3,3,7,3,39,11,38
737,7,2,3,6,3,2,2,41,14,42
1314,2,4,3,3,4,8,2,45,22,67


In [192]:
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale the training and testing data
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Add the ohe features
X_train_final_df = pd.concat([X_train_scaled_df, X_train_ohe_df], axis=1)
X_test_final_df = pd.concat([X_test_scaled_df, X_test_ohe_df], axis=1)

X_train_final_df.head()

Unnamed: 0,DistanceFromHome,Education,JobSatisfaction,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,JobLevel,Age,PercentSalaryHike,HourlyRate,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
774,-0.879112,-1.872975,-1.577609,0.14592,1.782562,0.317424,1.744016,1.95766,-0.07452,-1.271838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
341,0.733456,-0.892909,1.147352,0.14592,0.351839,0.642065,0.840015,0.104087,-0.347349,1.329861,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1124,-0.382937,0.087156,0.239032,0.14592,0.351839,-0.007218,0.840015,0.21312,-1.165834,-1.371903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
737,-0.258893,-0.892909,0.239032,2.476413,0.351839,-0.818821,-0.063986,0.431188,-0.347349,-1.171772,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1314,-0.879112,1.067222,0.239032,0.14592,1.782562,0.155103,-0.063986,0.867323,1.835279,0.079044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [193]:
# Create a OneHotEncoder for the Department column
dep_ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
dep_ohe.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_dep = dep_ohe.transform(y_train[['Department']])
y_test_dep = dep_ohe.transform(y_test[['Department']])

y_train_dep

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [194]:
# Create a OneHotEncoder for the Attrition column
att_ohe = OneHotEncoder(sparse_output=False, handle_unknown = 'ignore')

# Fit the encoder to the training data
att_ohe.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_att = att_ohe.transform(y_train[['Attrition']])
y_test_att = att_ohe.transform(y_test[['Attrition']])

y_train_att

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

## Create, Compile, and Train the Model

In [195]:
# Find the number of columns in the X training data
input_shape = X_train_final_df.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(input_shape,), name='input_features')

# Create at least two shared layers
shared_layer_1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer_2 = layers.Dense(128, activation='relu')(shared_layer_1)
input_shape

19

In [196]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
dep_branch = layers.Dense(32, activation='relu')(shared_layer_2)

# Create the output layer
dep_shape = y_test_dep.shape[1]
dep_output = layers.Dense(dep_shape, activation='sigmoid', name='department_output')(dep_branch)

dep_shape

3

In [197]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
att_branch = layers.Dense(32, activation='relu')(shared_layer_2)

# Create the output layer
att_shape = y_test_att.shape[1]
att_output = layers.Dense(att_shape, activation='sigmoid', name='attrition_output')(att_branch)

att_shape

2

In [198]:
# Create the model
# model = Model(inputs=input_layer, outputs=[dep_output, att_output])
model = Model(inputs=input_layer, outputs={'department_output': dep_output, 'attrition_output': att_output})

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

In [199]:
X_train_final_df


Unnamed: 0,DistanceFromHome,Education,JobSatisfaction,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,JobLevel,Age,PercentSalaryHike,HourlyRate,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative
774,-0.879112,-1.872975,-1.577609,0.145920,1.782562,0.317424,1.744016,1.957660,-0.074520,-1.271838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
341,0.733456,-0.892909,1.147352,0.145920,0.351839,0.642065,0.840015,0.104087,-0.347349,1.329861,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1124,-0.382937,0.087156,0.239032,0.145920,0.351839,-0.007218,0.840015,0.213120,-1.165834,-1.371903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
737,-0.258893,-0.892909,0.239032,2.476413,0.351839,-0.818821,-0.063986,0.431188,-0.347349,-1.171772,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1314,-0.879112,1.067222,0.239032,0.145920,1.782562,0.155103,-0.063986,0.867323,1.835279,0.079044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,1.229631,1.067222,-1.577609,-0.630911,0.351839,-0.169538,-0.967986,-0.986251,-0.893006,0.629404,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
721,1.849850,0.087156,0.239032,-0.630911,0.351839,0.804386,1.744016,1.412492,0.743965,1.479959,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
365,-0.258893,1.067222,0.239032,-0.630911,1.782562,-0.981141,-0.063986,0.976357,-0.347349,-1.772165,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
911,1.849850,-1.872975,1.147352,0.922751,0.351839,-0.981141,-0.967986,-1.313352,-0.347349,0.379240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [200]:
# # Convert y_train_dep and y_train_att to tuples
# y_train_dep_tuple = tuple(map(tuple, y_train_dep))
# y_train_att_tuple = tuple(map(tuple, y_train_att))

# y_test_dep_tuple = tuple(map(tuple, y_test_dep))
# y_test_att_tuple = tuple(map(tuple, y_test_att))

# print(type(y_train_dep_tuple), type(y_train_att_tuple))

In [201]:
# Train the model

history = model.fit(X_train_final_df, 
                    {'department_output': y_train_dep, 'attrition_output': y_train_att},
                    epochs=100, 
                    batch_size=32, 
                    validation_split=0.25)

Epoch 1/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - attrition_output_accuracy: 0.7474 - attrition_output_loss: 0.6085 - department_output_accuracy: 0.5305 - department_output_loss: 0.9563 - loss: 1.5651 - val_attrition_output_accuracy: 0.8370 - val_attrition_output_loss: 0.4564 - val_department_output_accuracy: 0.7065 - val_department_output_loss: 0.6664 - val_loss: 1.1216
Epoch 2/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8476 - attrition_output_loss: 0.4140 - department_output_accuracy: 0.7528 - department_output_loss: 0.5870 - loss: 1.0010 - val_attrition_output_accuracy: 0.8370 - val_attrition_output_loss: 0.4417 - val_department_output_accuracy: 0.8696 - val_department_output_loss: 0.4802 - val_loss: 0.9249
Epoch 3/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8655 - attrition_output_loss: 0.3834 - department_output_accu

In [202]:

# Evaluate the model with the testing data
eval = model.evaluate(X_test_final_df, 
                            {'department_output': y_test_dep, 'attrition_output': y_test_att}, 
                            verbose=2)
eval

12/12 - 0s - 4ms/step - attrition_output_accuracy: 0.7636 - attrition_output_loss: 1.9916 - department_output_accuracy: 0.9511 - department_output_loss: 0.3330 - loss: 2.1703


[2.1703450679779053,
 1.9915879964828491,
 0.33299461007118225,
 0.7635869383811951,
 0.9510869383811951]

In [203]:

# Print the accuracy for both department and attrition
attrition_accuracy = eval[3] 
department_accuracy = eval[4]  

print(f"Attrition Accuracy: {attrition_accuracy * 100:.2f}%")
print(f"Department Accuracy: {department_accuracy * 100:.2f}%")


Attrition Accuracy: 76.36%
Department Accuracy: 95.11%


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. * Accuracy may not be the best metric for evaluating this model because the data is imbalanced. There are significantly more employees in the R&D department than in HR. the model might predict attrition accurately for R&D employees but perform poorly when predicting HR attrition.

2. 
    * I chose the softmax activation function for the department output because it is a multicase classification problem. 
    * I chose sigmoid for the attrition classificaiton because it is a yes or no evaluation. 

3. 
    * Hyperparameter tuning could improve the model. Adjusting epochs and learning rate could lead to better performance. 
    * I could work on the features. Changing features or transforming features could improve the model. 