In [56]:
import sys
sys.path.append("../src")
import pandas as pd
import numpy as np
from tensorflow.keras import Sequential, layers, callbacks
from functions import get_df_uniques

In [57]:
df = pd.read_csv("../data/01_raw/attrition_train.csv")

In [58]:
df["Attrition"] = df["Attrition"].apply(lambda x: 0 if x == "No" else 1)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1100 non-null   int64 
 1   Attrition                 1100 non-null   int64 
 2   BusinessTravel            1100 non-null   object
 3   DailyRate                 1100 non-null   int64 
 4   Department                1100 non-null   object
 5   DistanceFromHome          1100 non-null   int64 
 6   Education                 1100 non-null   int64 
 7   EducationField            1100 non-null   object
 8   EmployeeCount             1100 non-null   int64 
 9   EmployeeNumber            1100 non-null   int64 
 10  EnvironmentSatisfaction   1100 non-null   int64 
 11  Gender                    1100 non-null   object
 12  HourlyRate                1100 non-null   int64 
 13  JobInvolvement            1100 non-null   int64 
 14  JobLevel                

In [60]:
df["Age"].unique()

array([38, 52, 28, 43, 27, 37, 29, 36, 55, 40, 57, 31, 33, 21, 54, 41, 25,
       47, 32, 58, 20, 50, 39, 30, 42, 35, 26, 49, 56, 34, 51, 24, 46, 48,
       45, 59, 22, 23, 19, 44, 18, 53, 60])

In [61]:
get_df_uniques(df)

Unnamed: 0,Features,Unique Number,Values
0,Age,43,"[38, 52, 28, 43, 27, 37, 29, 36, 55, 40, 57, 3..."
1,Attrition,2,"[0, 1]"
2,BusinessTravel,3,"[Travel_Frequently, Non-Travel, Travel_Rarely]"
3,DailyRate,762,"[240, 322, 1476, 920, 443, 309, 459, 566, 1229..."
4,Department,3,"[Research & Development, Sales, Human Resources]"
5,DistanceFromHome,29,"[2, 28, 1, 3, 10, 24, 18, 4, 20, 6, 26, 12, 23..."
6,Education,5,"[4, 2, 3, 1, 5]"
7,EducationField,6,"[Life Sciences, Medical, Human Resources, Othe..."
8,EmployeeCount,1,[1]
9,EmployeeNumber,1100,"[803, 1401, 1315, 1255, 850, 1105, 1868, 407, ..."


In [62]:
y = df["Attrition"]
X = df.drop('Attrition', axis=1)

In [63]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2

cat_cols = X.select_dtypes(include='O').columns
num_cols = X.select_dtypes(exclude='O').columns

X=np.asarray(X).astype('float32')

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=1)

# Preprocessing for numerical data
num_imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Preprocessing for categorical data
cat_imp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', num_imp, num_cols),
        ('category', cat_imp, cat_cols)
    ])

# Preprocessing for feature selection
feat_sel = SelectKBest(chi2, k=20)

# model
model = Sequential([
    layers.Dense(32, activation="relu", input_shape=[20]),
    layers.Dense(16, activation="relu"),
    layers.Dense(8, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"]
)

early_stopping = callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)
#Pipeline
Sequential_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feat_sel', feat_sel),
    ('model', model )
])

Seq = Sequential_pipeline.fit(
    X_train, y_train,
    model__validation_data=(X_test, y_test),
    model__batch_size=512,
    model__epochs=1000,
    model__callbacks=[early_stopping],
    model__verbose=0, # hide the output because we have so many epochs
)
Sequential_pred = Seq.predict(X_test)
show_results(y_test, Sequential_pred)

ValueError: could not convert string to float: 'Travel_Frequently'

In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1100 non-null   float32
 1   BusinessTravel            1100 non-null   object 
 2   DailyRate                 1100 non-null   float32
 3   Department                1100 non-null   object 
 4   DistanceFromHome          1100 non-null   float32
 5   Education                 1100 non-null   float32
 6   EducationField            1100 non-null   object 
 7   EmployeeCount             1100 non-null   float32
 8   EmployeeNumber            1100 non-null   float32
 9   EnvironmentSatisfaction   1100 non-null   float32
 10  Gender                    1100 non-null   object 
 11  HourlyRate                1100 non-null   float32
 12  JobInvolvement            1100 non-null   float32
 13  JobLevel                  1100 non-null   float32
 14  JobRole 

In [10]:
history = model.fit(
    X_train, y_train,
    
)

history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
history_df.loc[5:, ['loss', 'val_loss']].plot()
history_df.loc[5:, ['binary_accuracy', 'val_binary_accuracy']].plot()

print(("Best Validation Loss: {:0.4f}" +\
      "\nBest Validation Accuracy: {:0.4f}")\
      .format(history_df['val_loss'].min(), 
              history_df['val_binary_accuracy'].max()))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).