# Example of a Full Data Cleaning and Model Fitting Pipeline

In [55]:
# Import libraries
import pickle
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [20]:
# Get data: read the employee records from a CSV file (path is relative to
# the notebook's working directory).
df = pd.read_csv('Data/employee_data.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,,0.932868,4,,low,0.829896,Left,5.0
1,232,support,,,3,,low,0.834544,Employed,2.0
2,184,sales,,0.78883,3,,medium,0.834988,Employed,3.0
3,206,sales,,0.575688,4,,low,0.424764,Employed,2.0
4,249,sales,,0.845217,3,,low,0.779043,Employed,3.0


In [72]:
# Remove exact duplicate rows, then exclude rows whose department is 'temp'
# (temporary workers).
df = (
    df.drop_duplicates()
      .loc[lambda frame: frame.department != 'temp']
)

# Show the distinct department labels that remain.
df.department.unique()

array(['engineering', 'support', 'sales', 'IT', 'product', 'marketing',
       'procurement', 'finance', nan, 'management',
       'information_technology', 'admin'], dtype=object)

In [73]:
# Split data into training and test sets: 20% held out, stratified on the
# target so both splits keep the same class proportions.
np.random.seed(42)
y = df['status']
X = df.drop(columns='status')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(11254, 9) (2814, 9) (11254,) (2814,)


In [74]:
# Inspect the dtype of every training feature.
# (A numeric-only view would be X_train.select_dtypes(include=[np.number]).)
X_train.dtypes

avg_monthly_hrs        int64
department            object
filed_complaint      float64
last_evaluation      float64
n_projects             int64
recently_promoted    float64
salary                object
satisfaction         float64
tenure               float64
dtype: object

In [75]:
# Create a custom transformer to select columns by their data type

class SelectColByType(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype : object
        Dtype selector; it is wrapped in a one-element list and handed to
        ``DataFrame.select_dtypes(include=...)``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the subset of ``X`` whose columns match ``self.dtype``.

        ``X`` must be a pandas DataFrame (checked with an assertion).
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        selected = X.select_dtypes(include=wanted_dtypes)
        return selected


In [76]:
# Create a custom transformer to clean data

class CleanData(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies the notebook's cleaning rules.

    The rules, applied in ``transform``:

    * missing ``filed_complaint`` / ``recently_promoted`` values become 0;
    * the department label ``'information_technology'`` is merged into ``'IT'``;
    * missing ``department`` values become the label ``'Missing'``;
    * ``last_evaluation_missing`` (0/1) flags rows whose ``last_evaluation``
      was null, after which the nulls in ``last_evaluation`` are filled with 0.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``filed_complaint``,
            ``recently_promoted``, ``department`` and ``last_evaluation``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the rules above applied and the extra column
            ``last_evaluation_missing`` appended as the last column.
        """
        assert isinstance(X, pd.DataFrame)

        # Work on a copy: a transformer should not modify its input, and
        # chained ``inplace=True`` calls on a column accessor are not
        # guaranteed to write back to the parent frame (they do not under
        # pandas Copy-on-Write).
        X = X.copy()

        # Missing filed_complaint values should be 0
        X['filed_complaint'] = X.filed_complaint.fillna(0)

        # Missing recently_promoted values should be 0
        X['recently_promoted'] = X.recently_promoted.fillna(0)

        # 'information_technology' should be 'IT'; remaining missing
        # department values get the explicit label 'Missing'
        X['department'] = (X.department
                           .replace('information_technology', 'IT')
                           .fillna('Missing'))

        # Record which rows lacked an evaluation BEFORE the nulls are filled,
        # otherwise the indicator would always be 0
        X['last_evaluation_missing'] = X.last_evaluation.isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        X['last_evaluation'] = X.last_evaluation.fillna(0)

        # Return cleaned dataframe
        return X

In [83]:
# Positional indices of the columns used later for feature engineering.
last_eval_ix = list(X.columns).index("last_evaluation")
satisfaction_ix = list(X.columns).index("satisfaction")

# 'last_evaluation_missing' is not in X yet: it is only created when the
# CleanData class is run, so its position is hard-coded here.
last_eval_miss_ix = 9
print(satisfaction_ix)
X.columns

7


Index(['avg_monthly_hrs', 'department', 'filed_complaint', 'last_evaluation',
       'n_projects', 'recently_promoted', 'salary', 'satisfaction', 'tenure'],
      dtype='object')

In [66]:
# Create a custom transformer to engineer features

class AddFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends three 0/1 indicator features.

    * ``underperformer``: ``last_evaluation < 0.6`` and the evaluation was
      not missing (``last_evaluation_missing == 0``);
    * ``unhappy``: ``satisfaction < 0.2``;
    * ``overachiever``: ``last_evaluation > 0.8`` and ``satisfaction > 0.7``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the indicator columns appended.

        ``X`` is read through the attributes ``last_evaluation``,
        ``last_evaluation_missing`` and ``satisfaction``, so it must already
        contain those columns. The input frame itself is not modified.
        """
        # Work on a copy so that repeated or out-of-order runs do not leak
        # engineered columns into the caller's frame.
        X = X.copy()

        # Create new indicator features
        X['underperformer'] = ((X.last_evaluation < 0.6) &
                               (X.last_evaluation_missing == 0)).astype(int)

        X['unhappy'] = (X.satisfaction < 0.2).astype(int)

        X['overachiever'] = ((X.last_evaluation > 0.8) &
                             (X.satisfaction > 0.7)).astype(int)

        # TODO: one-hot encode 'department' and 'salary' in a separate
        # pipeline step (e.g. OneHotEncoder) rather than pd.get_dummies here.

        # Return augmented DataFrame
        return X

In [64]:
# Capture the training frame's column labels and display them as a plain list.
# NOTE(review): the recorded output lists 13 columns, while the split cell
# printed 9 -- it looks like X_train had already been modified in place by an
# earlier run when this output was captured; confirm on a fresh kernel.
X_cols = X_train.columns
list(X_cols)

['avg_monthly_hrs',
 'department',
 'filed_complaint',
 'last_evaluation',
 'n_projects',
 'recently_promoted',
 'salary',
 'satisfaction',
 'tenure',
 'last_evaluation_missing',
 'underperformer',
 'unhappy',
 'overachiever']

In [88]:
# Route every column of the training frame through the cleaning step.
X_cols = list(X_train.columns)

# TODO: add ('one_hot', OneHotEncoder(dtype='int'), ['department', 'salary'])
# as a second transformer; it was left disabled because it did not work
# with NaNs in the data.
pipeline_process = ColumnTransformer(
    transformers=[
        ('clean_data', CleanData(), X_cols),
    ],
    remainder='passthrough',
)

pipeline_addfeats = Pipeline(steps=[('add_features', AddFeatures())])

# TODO: the ('addfeats', pipeline_addfeats) step is not enabled yet, so the
# full pipeline currently runs only the processing stage.
pipeline_full = Pipeline(steps=[('process', pipeline_process)])

new_X = pipeline_full.fit_transform(X_train)

# Shape of the transformed array, then its first row.
print(new_X.shape)
new_X[0, :]

(11254, 10)


array([229, 'support', 1.0, 0.571134911664, 5, 0.0, 'low', 0.571764696819,
       4.0, 0], dtype=object)