### Creating the Model with TensorFlow

In [105]:
import pandas as pd
import tensorflow as tf

In [106]:
# Read the preprocessed absenteeism table from the working directory and
# show summary statistics for its numeric columns.
df_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')
df_preprocessed.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,6.761429
std,0.433322,0.09225,0.286386,0.226743,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,12.670082
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,3.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,8.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,120.0


#### Target data for classification
* Use the median of the absence hours as the threshold that splits samples into excessive absence (above the median) and regular absence.
* With the median as the cut-off, the resulting dataset is roughly balanced between the two classes (about 46% of the samples end up in the excessive class).
* Keep in mind, however, that such a decision should be made according to the requirements and the questions you are trying to answer.

In [107]:
# Median of the raw absence hours; used as the cut-off between the two classes.
absence_hours = df_preprocessed['Absenteeism Time in Hours']
med = absence_hours.median()
med

3.0

In [108]:
# Binary target: 1 when the absence hours are strictly above the median, else 0.
targets = (df_preprocessed['Absenteeism Time in Hours'] > med).astype('int64')

# The target is also written back onto df_preprocessed as a new column.
df_preprocessed['Excessively Absenteeism'] = targets

# Drop the raw hours so only the binary target remains alongside the inputs.
df_with_targets = df_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [109]:
# Summary statistics after swapping the raw hours for the binary target;
# the mean of the target column is the share of rows labelled 1.
df_with_targets.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessively Absenteeism
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,0.455714
std,0.433322,0.09225,0.286386,0.226743,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,0.498391
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,0.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,1.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,1.0


#### select the inputs

In [110]:
# The target was appended as the last column, so every column before it is a
# model input.
n_feature_cols = df_with_targets.shape[1] - 1
unscaled_inputs = df_with_targets.iloc[:, :n_feature_cols]

In [111]:
# Indicator columns (0/1 in the describe() output above); they are left unscaled.
dummy_cols = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

# Every remaining input column is standardised. The list is derived from the
# frame rather than hard-coded, so it follows whatever columns are present.
is_dummy = unscaled_inputs.columns.isin(dummy_cols)
tobe_scaled_cols = unscaled_inputs.columns[~is_dummy].values

In [112]:
from sklearn.preprocessing import StandardScaler

# Standardise the non-dummy columns to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so held-out rows contribute to the fitted mean/std. Consider
# fitting on the training rows only and reusing the fitted scaler on the rest.
absenteeism_scaler = StandardScaler()
scaled_inputs = absenteeism_scaler.fit_transform(unscaled_inputs[tobe_scaled_cols])

# Carry over the original row index: the axis=1 concat below aligns on index
# labels, so a fresh 0..n-1 index would silently produce NaN rows whenever
# unscaled_inputs does not have the default index.
scaled_part_df = pd.DataFrame(
    data=scaled_inputs,
    columns=tobe_scaled_cols,
    index=unscaled_inputs.index,
)

# Re-attach the untouched dummy columns in front of the scaled ones.
scaled_inputs_df = pd.concat([unscaled_inputs[dummy_cols], scaled_part_df], axis=1)
scaled_inputs_df.describe()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.945714,-8.120488e-17,-1.040438e-16,-2.131628e-16,5.582836e-17,1.319579e-16,-8.526513e-16,1.446462e-16,0.0,9.135549e-17,-1.2688260000000002e-17
std,0.433322,0.09225,0.286386,0.226743,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715,1.000715
min,0.0,0.0,0.0,0.0,-1.530333,-1.359682,-1.574681,-1.682647,-1.477309,-1.647399,-1.819793,-0.44798,-0.91903,-0.5896898
25%,0.0,0.0,0.0,1.0,-0.9593133,-0.6837035,-0.6541427,-0.9390957,-0.8498113,-0.7582731,-0.643782,-0.44798,-0.91903,-0.5896898
50%,0.0,0.0,0.0,1.0,-0.1027836,-0.007725463,0.04003371,-0.2631399,0.09143539,-0.1888514,-0.4085798,-0.44798,-0.01928035,-0.5896898
75%,0.25,0.0,0.0,1.0,1.039256,0.6682526,0.5682114,1.359154,0.5620587,0.5604758,1.002633,-0.44798,0.8804693,0.2684866
max,1.0,1.0,1.0,1.0,1.610276,2.696187,2.499833,1.494345,3.385799,2.67751,2.649049,2.232242,2.679969,6.275721


In [113]:
# (rows, columns) of the final input frame: dummy columns plus scaled columns.
scaled_inputs_df.shape

(700, 14)

#### Divide the data into train, validation and test
* Instead of shuffling and splitting manually, this time we use scikit-learn's ready-made `train_test_split` function.

In [114]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 10% of the rows; random_state pins the shuffle so the
# same rows land in each part on every run.
split_parts = train_test_split(
    scaled_inputs_df,
    targets,
    train_size=0.9,
    random_state=42,
)
x_train, test_input, y_train, test_target = split_parts

#### Split the held-out data again to extract the validation dataset (the cell below is currently commented out, so no separate validation set is created)

In [115]:
# test_input, validation_input, test_target, validation_target = train_test_split(test_input, test_target, train_size=0.5, random_state=42)

In [116]:
# Sanity-check the split: inputs and targets must have matching row counts.
# (A previous `tf.reshape(x_train, [-1])` call here discarded its result and
# had no effect on x_train, so it was removed.)
assert x_train.shape[0] == y_train.shape[0], "train inputs/targets row mismatch"
assert test_input.shape[0] == test_target.shape[0], "test inputs/targets row mismatch"

print(x_train.shape)
print(y_train.shape)
# print(validation_input.shape)
# print(validation_target.shape)
print(test_input.shape)
print(test_target.shape)

(630, 14)
(630,)
(70, 14)
(70,)


#### set the Hyperparameters

In [117]:
# --- Network shape ---
HIDDEN_LAYER_SIZE = 14  # units in the hidden Dense layer
OUTPUT_SIZE = 1         # one sigmoid output unit for the binary target

# --- Training schedule ---
BATCH_SIZE = 16
EPOCHS = 100  # upper bound only; an early-stopping callback is meant to end training sooner

# NOTE(review): not referenced by any cell visible in this section.
DROPOUT_RATE = 0.5

In [118]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable across "Restart Kernel -> Run All" (some ops may
# still be nondeterministic depending on the hardware).
tf.keras.utils.set_random_seed(42)

# NOTE(review): this initializer is created but never passed to a layer in the
# cells visible here, so the Dense layers below use their default initializers.
initi=tf.keras.initializers.variance_scaling()

# One hidden ReLU layer followed by a single sigmoid unit that outputs the
# probability of the positive class.
model = tf.keras.Sequential([
                            tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, activation='relu'),
                            # tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, activation='relu'),
                            tf.keras.layers.Dense(OUTPUT_SIZE, activation='sigmoid')
                            ])

#### Optimization and loss function

In [119]:
# Adam with an explicit learning rate. Binary cross-entropy pairs with the
# single sigmoid output, so the sparse-categorical losses tried earlier are
# not used.
custom_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)

model.compile(
    optimizer=custom_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [120]:
# Stop once the validation loss has not improved for 5 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# EarlyStopping can only monitor 'val_loss' if fit() receives validation data;
# without it the metric never exists and the callback never triggers, so
# training always runs the full EPOCHS. Hold out the last 10% of the (already
# shuffled) training rows for validation. The inputs are passed as NumPy
# arrays because validation_split slices them positionally.
history = model.fit(
    x_train.to_numpy(),
    y_train.to_numpy(),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=0.1,
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f199024e760>