# Example of a Full Data Cleaning and Model Fitting Pipeline

In [8]:
# Import libraries
import pickle
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

In [5]:
# Load the raw employee records from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='Data/employee_data.csv')
df.head(n=5)

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,,0.932868,4,,low,0.829896,Left,5.0
1,232,support,,,3,,low,0.834544,Employed,2.0
2,184,sales,,0.78883,3,,medium,0.834988,Employed,3.0
3,206,sales,,0.575688,4,,low,0.424764,Employed,2.0
4,249,sales,,0.845217,3,,low,0.779043,Employed,3.0


In [6]:
# Function to clean data

def clean_data(df):
    """Return a cleaned copy of the raw employee DataFrame.

    Cleaning steps, applied in this order:

    1. Drop exact duplicate rows.
    2. Drop rows whose ``department`` equals ``'temp'`` (rows with a missing
       department are kept).
    3. Fill missing ``filed_complaint`` and ``recently_promoted`` with 0.
    4. Rename department ``'information_technology'`` to ``'IT'`` and label
       missing departments as ``'Missing'``.
    5. Add ``last_evaluation_missing`` (1 where ``last_evaluation`` is
       missing, else 0), then fill missing ``last_evaluation`` with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``department``,
        ``filed_complaint``, ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame; the input is not modified. The original index
        labels of the surviving rows are preserved.
    """
    # Drop duplicates, then temporary workers. The explicit .copy() makes the
    # result an independent frame so the column assignments below neither
    # trigger SettingWithCopyWarning nor depend on view/copy semantics.
    cleaned = df.drop_duplicates()
    cleaned = cleaned[cleaned['department'] != 'temp'].copy()

    # Missing flag values mean "did not happen", i.e. 0
    for flag_column in ('filed_complaint', 'recently_promoted'):
        cleaned[flag_column] = cleaned[flag_column].fillna(0)

    # Normalise department labels. Assign the result back rather than calling
    # replace/fillna with inplace=True on the column: under pandas
    # Copy-on-Write, in-place calls on a selected column never reach the frame.
    cleaned['department'] = (
        cleaned['department']
        .replace('information_technology', 'IT')
        .fillna('Missing')
    )

    # Record which evaluations were missing BEFORE overwriting them with 0,
    # otherwise the indicator would be all zeros.
    cleaned['last_evaluation_missing'] = (
        cleaned['last_evaluation'].isnull().astype(int)
    )
    cleaned['last_evaluation'] = cleaned['last_evaluation'].fillna(0)

    # Return cleaned dataframe
    return cleaned

In [7]:
# Function to engineer features

def engineer_features(self, df=None):
    """Return a copy of ``df`` with indicator and dummy features added.

    Added indicator columns (int 0/1):

    * ``underperformer`` - ``last_evaluation < 0.6`` and the evaluation was
      not missing (``last_evaluation_missing == 0``).
    * ``unhappy`` - ``satisfaction < 0.2``.
    * ``overachiever`` - ``last_evaluation > 0.8`` and ``satisfaction > 0.7``.

    ``department`` and ``salary`` are then replaced by one dummy column per
    category via ``pandas.get_dummies``.

    Parameters
    ----------
    self : object or pandas.DataFrame
        Kept for backward compatibility with existing two-argument calls
        ``engineer_features(anything, df)``; ignored in that form. When the
        function is called with a single argument, e.g. by
        ``FunctionTransformer(engineer_features)``, that argument is taken
        as the DataFrame.
    df : pandas.DataFrame, optional
        Cleaned data containing ``last_evaluation``,
        ``last_evaluation_missing``, ``satisfaction``, ``department`` and
        ``salary``.

    Returns
    -------
    pandas.DataFrame
        A new, augmented DataFrame. The input frame is left untouched.

    Raises
    ------
    TypeError
        If no DataFrame is supplied in either position.
    """
    # Support the single-argument call form: the lone positional argument is
    # the data, not a receiver object.
    if df is None:
        df = self
    if df is None:
        raise TypeError('engineer_features() requires a DataFrame')

    evaluation = df['last_evaluation']
    satisfaction = df['satisfaction']

    # Create indicator features. assign() builds a new frame, so the caller's
    # DataFrame does not silently gain columns.
    augmented = df.assign(
        underperformer=((evaluation < 0.6) &
                        (df['last_evaluation_missing'] == 0)).astype(int),
        unhappy=(satisfaction < 0.2).astype(int),
        overachiever=((evaluation > 0.8) &
                      (satisfaction > 0.7)).astype(int),
    )

    # Create new dataframe with dummy features
    # TODO: use OneHotEncoder instead
    augmented = pd.get_dummies(augmented, columns=['department', 'salary'])

    # Return augmented DataFrame
    return augmented

In [12]:
# Separate the target from the predictors, then hold out 20% of rows for testing

y = df['status']
X = df.drop(columns='status')

# stratify=y splits in a stratified fashion on `status`;
# random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(11399, 9) (2850, 9) (11399,) (2850,)
