## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [4]:
# Create a list of at least 10 column names to use as X data
cols_X = [
    'Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'EducationField',
    'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
    'JobRole', 'JobSatisfaction', 'MaritalStatus', 'NumCompaniesWorked',
    'OverTime', 'PercentSalaryHike', 'PerformanceRating',
    'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

# Prompted options if only 10 columns to be used, commented out to retain list
# as reference in case column selection needs to be trimmed down for model performance
# cols_X = [
#     'Age', 'DistanceFromHome', 'Education', 'JobSatisfaction',
#     'NumCompaniesWorked', 'OverTime', 'StockOptionLevel',  'WorkLifeBalance',
#     'YearsAtCompany',  'YearsSinceLastPromotion'
# ]

# Create X_df using your selected columns
X_df = attrition_df[cols_X]

# Show the data types for X_df
X_df.dtypes

Age                          int64
BusinessTravel              object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_atrn_train, y_atrn_test, y_dept_train, y_dept_test =\
    train_test_split(X_df, y_df['Attrition'], y_df['Department'])

In [6]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

# Creating a list of columns to encode from X_df
cols_to_encode = [
    'BusinessTravel', 'EducationField', 'JobRole', 'MaritalStatus', 'OverTime'
]

# Confirming unique values for `cols_to_encode`
for col in cols_to_encode:
    display(X_train[col].value_counts())

BusinessTravel
Travel_Rarely        791
Travel_Frequently    199
Non-Travel           112
Name: count, dtype: int64

EducationField
Life Sciences       452
Medical             344
Marketing           117
Technical Degree     99
Other                69
Human Resources      21
Name: count, dtype: int64

JobRole
Sales Executive              242
Research Scientist           216
Laboratory Technician        205
Manufacturing Director       106
Healthcare Representative     93
Manager                       75
Sales Representative          63
Research Director             58
Human Resources               44
Name: count, dtype: int64

MaritalStatus
Married     521
Single      340
Divorced    241
Name: count, dtype: int64

OverTime
No     793
Yes    309
Name: count, dtype: int64

In [7]:
# Seperating columns by which encoder to use

# Creating a list of columns for OneHotEncoder
cols_for_ohe = [
    'BusinessTravel', 'EducationField', 'JobRole', 'MaritalStatus'
]

# Declaring column for LabelEncoder
cols_for_le = 'OverTime'

In [8]:
# Creating an instance of OneHotEncoder for `cols_for_ohe` in X datasets
encoder_X_ohe = OneHotEncoder(sparse_output=False)

# Fitting `encoder_X_ohe` to training and testing data
X_train_ohe = encoder_X_ohe.fit_transform(X_train[cols_for_ohe])
X_test_ohe = encoder_X_ohe.transform(X_test[cols_for_ohe])

# Converting results to DF for later concatenation
X_train_ohe = pd.DataFrame(X_train_ohe, columns=encoder_X_ohe.get_feature_names_out())
X_test_ohe = pd.DataFrame(X_test_ohe, columns=encoder_X_ohe.get_feature_names_out())

# Confirming conversion
X_train_ohe.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,JobRole_Healthcare Representative,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [9]:
# Creating an instance of LabelEncoder for `cols_for_le` in X datasets
encoder_X_le = LabelEncoder()

# Fitting `encoder_X_le` to training and testing data
X_train_le = encoder_X_le.fit_transform(X_train[cols_for_le])
X_test_le = encoder_X_le.transform(X_test[cols_for_le])

# Converting the resutls to DF for later concatenation
X_train_le = pd.DataFrame(X_train_le, columns=[cols_for_le])
X_test_le = pd.DataFrame(X_test_le, columns=[cols_for_le])

# Confirming conversion
X_train_le.head()

Unnamed: 0,OverTime
0,0
1,0
2,0
3,1
4,0


In [10]:
# Creating a list of columns for StandardScalar
cols_to_scale = [
    'Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
    'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 
    'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

In [11]:
# Create a StandardScaler for 'cols_to_scale' in X datasets
scalar_X = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scalar_X.fit_transform(X_train[cols_to_scale])

# Scale the testing data
X_test_scaled = scalar_X.transform(X_test[cols_to_scale])

# Converting the results to DF for later concatenation
X_train_scaled = pd.DataFrame(X_train_scaled, columns=cols_to_scale)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=cols_to_scale)

# Confirming conversion
X_train_scaled.head()

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.983655,-0.899723,0.084359,-1.586885,-0.724295,0.401278,1.766868,0.234419,1.30469,-0.607392,-0.430054,0.274903,-0.940489,1.877169,-0.611087,-2.494328,-0.6511,-0.615969,-0.692863,-0.873941
1,-1.203789,-0.417303,0.084359,0.237577,-0.970492,-1.011528,-0.952529,-0.672,-0.685576,1.037866,-0.430054,0.274903,-0.940489,-0.808214,0.180238,-1.095473,-0.323917,-0.063202,0.562435,-0.319649
2,0.764911,-0.537908,0.084359,0.237577,1.196047,-1.011528,-0.952529,0.234419,0.11053,-0.333182,-0.430054,1.201335,-0.940489,-0.552463,1.76289,0.303381,-0.323917,-0.615969,-0.692863,-0.319649
3,-1.969394,1.391775,0.084359,1.149809,-1.413648,-1.011528,-0.952529,-0.672,-0.685576,-0.607392,-0.430054,-0.651529,-0.940489,-1.319715,0.180238,1.702236,-0.978283,-1.168736,-0.692863,-1.151087
4,-0.110067,0.185723,1.062922,-0.674654,1.688442,-1.011528,-0.046064,1.140838,0.508584,-0.607392,-0.430054,-1.577961,0.241288,-0.424588,-0.611087,-1.095473,-0.160326,-0.339586,-0.692863,-1.151087


In [12]:
# Concatenating the encoded and scaled data for X datasets
X_train = pd.concat([X_train_ohe, X_train_le, X_train_scaled], axis=1)
X_test = pd.concat([X_test_ohe, X_test_le, X_test_scaled], axis=1)

# Confirming total records to verify concatenated properly
print(f'Total records in original data: {attrition_df.shape[0]}')
print(f'Total records in X datasets:    {X_train.shape[0] + X_test.shape[0]}')

Total records in original data: 1470
Total records in X datasets:    1470


In [13]:
# Converting y datasets to DF for encoding
y_atrn_train_df = pd.DataFrame(y_atrn_train, columns=['Attrition'])
y_atrn_test_df = pd.DataFrame(y_atrn_test, columns=['Attrition'])
y_dept_train_df = pd.DataFrame(y_dept_train, columns=['Department'])
y_dept_test_df = pd.DataFrame(y_dept_test, columns=['Department'])

# Confiming conversion
y_atrn_train_df.head()

Unnamed: 0,Attrition
429,No
125,No
1042,No
688,Yes
1453,No


In [14]:
# Create a OneHotEncoder for the Department column
encoder_y_dept = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
y_dept_ohe_train = encoder_y_dept.fit_transform(y_dept_train_df)

# Create two new variables by applying the encoder
# to the testing data
y_dept_ohe_test = encoder_y_dept.transform(y_dept_test_df)

# Confirming conversion
y_dept_ohe_train

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [15]:
# Create a OneHotEncoder for the Attrition column
encoder_y_atrn = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
y_atrn_ohe_train = encoder_y_atrn.fit_transform(y_atrn_train_df)

# Create two new variables by applying the encoder
# to the testing data
y_atrn_ohe_test = encoder_y_atrn.transform(y_atrn_test_df)

# Confirming conversion
y_atrn_ohe_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

## Create, Compile, and Train the Model

In [16]:
# Find the number of columns in the X training data
input_shape = (X_train.shape[1],)

# Create the input layer
input_layer = layers.Input(shape=input_shape, name='input_layer')

# Create at least two shared layers
shared_dense_1 = layers.Dense(128, activation='relu', name='shared_1')(input_layer)
shared_dense_2 = layers.Dense(64, activation='relu', name='shared_2')(shared_dense_1)
shared_dense_3 = layers.Dense(32, activation='relu', name='shared_3')(shared_dense_2)

In [17]:
# Create a branch for Department
# with a hidden layer and an output layer

# Finding the shape of `y_dept_ohe_train` for the output layer
output_shape_dept = y_dept_ohe_train.shape[1]

# Create the hidden layer
dept_dense = layers.Dense(16, activation='relu', name='dept_dense')(shared_dense_3)

# Create the output layer
dept_output = layers.Dense(output_shape_dept,
                           activation='softmax',
                           name='output_department')(dept_dense)

In [18]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Finding the shape of `y_atrn_ohe_train` for the output layer
output_shape_atrn = y_atrn_ohe_train.shape[1]

# Create the hidden layer
atrn_dense = layers.Dense(16, activation='relu', name='atrn_dense')(shared_dense_3)

# Create the output layer
atrn_output = layers.Dense(output_shape_atrn,
                           activation='sigmoid',
                           name='output_attrition')(atrn_dense)

In [19]:
# Create the model
model = Model(inputs=input_layer,
              outputs=[dept_output,
                       atrn_output],
              name='model')

# Compile the model
model.compile(optimizer='adam',
              loss={
                    'output_department': 'categorical_crossentropy',
                    'output_attrition': 'binary_crossentropy'
                   },
              metrics={
                        'output_department': [
                                              'accuracy',
                                              'f1_score',
                                              'precision'
                                              ],
                        'output_attrition': [
                                             'accuracy',
                                             'recall',
                                             'precision'
                                            ]
                      })

# Summarize the model
model.summary()

In [20]:
# Train the model
model.fit(
    X_train,
    {
        'output_department': y_dept_ohe_train,
        'output_attrition': y_atrn_ohe_train
    },
    epochs=100,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.6384 - output_attrition_accuracy: 0.8280 - output_attrition_precision: 0.7339 - output_attrition_recall: 0.8515 - output_department_accuracy: 0.4305 - output_department_f1_score: 0.2654 - output_department_precision: 0.4947 - val_loss: 1.1242 - val_output_attrition_accuracy: 0.8597 - val_output_attrition_precision: 0.8597 - val_output_attrition_recall: 0.8597 - val_output_department_accuracy: 0.6742 - val_output_department_f1_score: 0.2685 - val_output_department_precision: 0.6742
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1266 - output_attrition_accuracy: 0.8259 - output_attrition_precision: 0.8259 - output_attrition_recall: 0.8259 - output_department_accuracy: 0.6924 - output_department_f1_score: 0.3444 - output_department_precision: 0.7040 - val_loss: 0.8548 - val_output_attrition_accuracy: 0.8597 - val_output_attrition_precision: 0.8597

<keras.src.callbacks.history.History at 0x23893b8bac0>

In [21]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test,
                              {
                                'output_department': y_dept_ohe_test,
                                'output_attrition': y_atrn_ohe_test
                              }
                             )

# Displaying the results
test_results

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.6411 - output_attrition_accuracy: 0.8593 - output_attrition_precision: 0.8534 - output_attrition_recall: 0.8593 - output_department_accuracy: 0.9565 - output_department_f1_score: 0.8092 - output_department_precision: 0.9565 


[1.6699202060699463,
 0.85326087474823,
 0.85326087474823,
 0.85326087474823,
 0.9510869383811951,
 <tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.73684204, 0.96523505, 0.9385964 ], dtype=float32)>,
 0.9510869383811951]

In [22]:
# Print the accuracy for both department and attrition
print(f'Department accuracy: {test_results[4]:.4f}')
print(f'Attrition accuracy:  {test_results[1]:.4f}')

Department accuracy: 0.9511
Attrition accuracy:  0.8533


Additional metrics *(selected for refence and to inform answer for question 1)*

In [23]:
# Printing other metrics for comparison
f1_scores = test_results[5]
for i, score in enumerate(f1_scores):
    print(f'Department F1-score, Class {i+1}:  {score:.4f}')
print(f'Department precision:          {test_results[6]:.4f}')
print(f'Attrition precision:           {test_results[2]:.4f}')
print(f'Attrition recall:              {test_results[3]:.4f}')


Department F1-score, Class 1:  0.7368
Department F1-score, Class 2:  0.9652
Department F1-score, Class 3:  0.9386
Department precision:          0.9511
Attrition precision:           0.8533
Attrition recall:              0.8533


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Best metric;
    * Accuracy may be a commonly used metric, but the multi-class nature of the `Department` categories and the imbalanced data for both `Attrition` *and* `Department` justify other metrics, as well
    1. `Department`: F1 score and precision;
    * F1 Score; Would provide a more balanced reflection on how less populated categories are placed
    * Precision; Would provide more direct insight on true positive predictions
    2. `Attrition`: precision and recall;
    * Precision; Same as above, more direct insight on true positive predictions
    * Recall; Identifies the proportopm of true positive predictions among the positive predictions, giving a more reliable read on the model's performance
2. Actication functions chosen;
    1. `Department`: `softmax`
    * Because this feature had three (3) categories, `softmax` fit better with the multi-class classification needed
    2. `Attrition`: `sigmoid`
    * With only two (2) categories, `sigmoid` made the most sense for this binary classification
3. Potential improvements;
    1. Class imbalances:
    * With select categories in both `Attrition` and `Department` having far less represntation in the data than other categories, adjusting the class weights or resampling (ie; oversampling) the minority classes may improve performances
    2. Model structure;
    * The Dense layers selected above proved a viable starting point, but adjusting the number of hidden layers, units in the Dense layers, and other such measures could improve the performance for specifically the `Attriction` binary classification
    3. Feature reduction;
    * While the assignment specified "at least 10 columns" for the X datasets, I had made the decision to include ***alll*** non-target features from the original dataset
    * Reducing the selection of features may reduce the noise in the data and provide more balanced results