In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import tensorflow as tf
from tensorflow import keras
import pickle
from sklearn.preprocessing import StandardScaler
from keras.models import load_model as tfk__load_model


## cleaner function

In [36]:

# fitted_model = tfk__load_model('absenteeism_model.h5')    
def load_and_clean_data(data_csv, scaler_file):

    # import the data
    df = pd.read_csv(data_csv)
    # store the data within the object for later use
    df_with_predicitons = df.copy()
    # drop the ID column
    df = df.drop(['ID'], axis=1)
    # to preserve the same structure of our previous code, we will add the target column here but with NaN
    df['Absenteeism Time in Hours'] = 'NaN'

    # create a seperate dataframe for dummy variables
    reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)

    # split reason_columns into 4 types
    reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
    reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
    reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
    reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)

    # to avoid multicollinearity, drop the 'Reason for Absence' column from df
    df.drop(['Reason for Absence'], axis=1, inplace=True)

    # concatenate df and the 4 types of reason for absence
    df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)

    # assign names to the 4 reason type columns
    # df.columns = ['Transportation Expense', 'Distance to Work', 'Age',
    #       'Daily Work Load Average', 'Body Mass Index', 'Education',
    #       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value', 'Day of the Week', 
    #       'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',]
    df.rename(columns={0: 'Reason_1', 1: 'Reason_2', 2: 'Reason_3', 3: 'Reason_4'}, inplace=True)
    # print(df.head())
    # convert the 'Date' column into datetime
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, format="%d/%m/%Y")

    # extract month and day of the week value from column Date
    df['Month Value'] = df['Date'].dt.month
    df['Day of the Week'] = df['Date'].dt.day_of_week
    df.drop('Date', axis=1, inplace=True)

        # reorder the columns
    df = df[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
            'Day of the Week', 'Transportation Expense', 'Distance to Work',
            'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
            'Children', 'Pets', 'Absenteeism Time in Hours']]
    
    # Education to dummies
    df['Education'] = df['Education'].map({1:0, 2:1, 3:1, 4:1})

    # replace the NaN values
    df = df.fillna(value=0)

    # drop the original absenteeism time
    df = df.drop(['Absenteeism Time in Hours'],axis=1)
    
    # drop the variables we decide we don't need
    # I inlcuded 'Education' because in my version, 'Education' is also a useless parameter
    # df = df.drop(['Day of the Week','Daily Work Load Average','Distance to Work', 'Education'],axis=1)
    
    # we have included this line of code if you want to call the 'preprocessed data'
    preprocessed_data = df.copy()

    dummy_cols = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

    # exclude the dummies from scaler function
    tobe_scaled_cols = df.loc[:, ~df.columns.isin(dummy_cols)].columns.values
    
    with open(scaler_file, 'rb') as sc:
        scaler = pickle.load(sc)
    scaled_inputs = scaler.transform(df[tobe_scaled_cols])

    # concatenate the dummies
    scaled_inputs_df = pd.DataFrame(data=scaled_inputs, columns=tobe_scaled_cols)
    data = pd.concat([df[dummy_cols], scaled_inputs_df], axis=1) 
    return data



In [37]:
new_data = load_and_clean_data('Absenteeism_new_data.csv', 'absenteeism_scaler')

In [38]:
tfmodel = tfk__load_model('absenteeism_model.h5')
# fitted_model.evaluate('../')

In [39]:
new_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,False,0.0,False,True,-0.102784,1.344231,-0.654143,-0.26314,-1.006686,-0.853789,-1.819793,2.232242,-0.91903,-0.58969
1,True,0.0,False,False,-0.102784,-1.359682,2.092381,1.494345,-1.320435,-0.853789,0.061825,-0.44798,-0.01928,2.843016
2,False,0.0,False,True,-0.102784,-0.007725,-1.016322,-1.209478,-0.379188,-0.853789,-0.40858,-0.44798,0.880469,-0.58969
3,False,0.0,False,True,-0.102784,1.344231,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,2.232242,0.880469,-0.58969
4,True,0.0,False,False,-0.102784,1.344231,-1.016322,-1.209478,-0.379188,-0.853789,-0.40858,-0.44798,0.880469,-0.58969


In [40]:
x = np.asarray(new_data).astype('float32')

In [41]:
tensorData = tf.convert_to_tensor(x)

In [42]:
tfmodel.predict(x)



array([[0.13952322],
       [0.8704048 ],
       [0.29811144],
       [0.19340254],
       [0.5732989 ],
       [0.72202265],
       [0.45661134],
       [0.1786873 ],
       [0.1052721 ],
       [0.41322753],
       [0.46206832],
       [0.4571003 ],
       [0.22681977],
       [0.13656506],
       [0.13180661],
       [0.25728056],
       [0.4571003 ],
       [0.44665125],
       [0.38033733],
       [0.5693227 ],
       [0.18433529],
       [0.05385768],
       [0.34198353],
       [0.4168747 ],
       [0.06258371],
       [0.4695955 ],
       [0.2306987 ],
       [0.59777343],
       [0.18538465],
       [0.5589177 ],
       [0.30202526],
       [0.1419565 ],
       [0.52805406],
       [0.30202118],
       [0.967336  ],
       [0.8896069 ],
       [0.7054479 ],
       [0.02548965],
       [0.32993215],
       [0.07756863]], dtype=float32)