In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from collections import Counter

from helper import clean_churn_df, model_baseline, model_baseline_no_cv, score_model_no_cv, score_model
from helper import split_with_dupe_rows_in_train, rf_no_cv_iterx

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

from sqlalchemy import create_engine

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from sqlid import sql_id

# sql_id() supplies the SQL connection identifier, which is kept in a separate module rather than inline here.
engine = create_engine(sql_id())
churn_df = pd.read_sql_query('select * from hr', con=engine)
# Remove the extra index column that comes back from the database.
churn_df = churn_df.drop(columns=['index'])

In [None]:
# Share of rows with left == 0 (author's note: imbalanced dataset with .76 in 1 class).
len(churn_df.loc[churn_df['left'] == 0]) / len(churn_df)

## Split data

In [None]:
# Hold out 20% of the rows with a fixed seed. The feature frame still contains
# `left` at this point so the EDA plots can colour by it; it is dropped from
# X_train before the baseline model is fitted.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    churn_df,
    churn_df['left'],
    test_size=0.2,
    random_state=41,
)

## Exploratory data analysis (EDA)

In [None]:
# Pairwise scatter/distribution grid of the training split, coloured by the `left` label.
sns.pairplot(data=X_train, hue='left');

In [None]:
# List the column names available in the training split.
X_train.columns

In [None]:
# Distribution of satisfaction_level in the training split; axes are labelled so
# the figure stands alone, and the trailing `;` hides the matplotlib repr.
X_train['satisfaction_level'].hist().set(xlabel='satisfaction_level', ylabel='rows');

In [None]:
# Distribution of last_evaluation in the training split; axes are labelled so
# the figure stands alone, and the trailing `;` hides the matplotlib repr.
X_train['last_evaluation'].hist().set(xlabel='last_evaluation', ylabel='rows');

In [None]:
# Peek at the first five rows of the full frame -- a few of the variables are categorical.
churn_df.head(n=5)

In [None]:
# Distribution of number_project in the training split; axes are labelled so
# the figure stands alone, and the trailing `;` hides the matplotlib repr.
X_train['number_project'].hist().set(xlabel='number_project', ylabel='rows');

In [None]:
# Distribution of average_montly_hours in the training split (the column name
# is spelled this way in the data); axes are labelled so the figure stands alone.
X_train['average_montly_hours'].hist().set(xlabel='average_montly_hours', ylabel='rows');

In [None]:
# How many years have they been there? Could create a feature of left by years.
# Axes are labelled so the figure stands alone; the trailing `;` hides the repr.
X_train['time_spend_company'].hist().set(xlabel='time_spend_company', ylabel='rows');

In [None]:
# Most people haven't suffered an accident.
# Axes are labelled so the figure stands alone; the trailing `;` hides the repr.
X_train['Work_accident'].hist().set(xlabel='Work_accident', ylabel='rows');

In [None]:
# Very few have been promoted.
# Axes are labelled so the figure stands alone; the trailing `;` hides the repr.
X_train['promotion_last_5years'].hist().set(xlabel='promotion_last_5years', ylabel='rows');

In [None]:
# Strip stray whitespace from the column names before referring to them by name.
X_train.columns = X_train.columns.str.strip()
# Pass the column as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare 'Departments' would be taken for the data argument.
a = sns.countplot(x='Departments', data=X_train, hue='left') #sales had most attrition
# Tilt the department labels so they do not overlap on the x axis.
a.tick_params(axis='x', labelrotation=30)

In [None]:
# Distinct department labels present in the training split.
X_train['Departments'].unique()

In [None]:
# Leavers vs. stayers per salary band (author's reading: higher salary were more likely to leave).
# The column is passed as `x=` because recent seaborn releases accept only
# `data` positionally; the trailing `;` hides the Axes repr.
sns.countplot(x='salary', data=X_train, hue='left');

In [None]:
# One-hot encode the two categorical columns, append the indicator columns
# (departments first, then salary bands), and discard the raw categoricals
# together with the `left` label so only model features remain.
department_dummies = pd.get_dummies(X_train['Departments'])
salary_dummies = pd.get_dummies(X_train['salary'])
X_train = pd.concat([X_train, department_dummies, salary_dummies], axis=1)
X_train = X_train.drop(columns=['left', 'Departments', 'salary'])

## Baseline model

In [None]:
# Run the project's baseline helper on the one-hot-encoded training features and labels.
# NOTE(review): which models/metrics it reports, and whether it cross-validates,
# is defined in helper.py -- confirm there.
model_baseline(X_train, y_train)

Those are some good results! Almost too good. I checked and found that there are a lot of duplicate rows in my data (20% overall and 16% in my train set).

## Investigate dupes

In [None]:
# Fraction of rows that exactly repeat an earlier row, for the full dataset and
# for the current (already encoded, label-free) training frame.
total_dupes = churn_df.duplicated().mean()
train_dupes = X_train.duplicated().mean()
print(total_dupes, train_dupes)

After some research I found a few studies suggesting that this is called being a "boomerang" employee (someone who leaves and later returns to the same employer) and that it is pretty common: two studies of 1,000+ people each suggest that 15-30% is a common rate for boomerang employees to return.

https://workplacetrends.com/the-corporate-culture-and-boomerang-employee-study/  
https://www.prnewswire.com/news-releases/study-nearly-one-third-of-workers-going-back-to-previous-employers-300245827.html  


## Resplit data

In [None]:
# Show the feature and label shapes of the current training split before re-splitting.
print(X_train.shape, y_train.shape)

In [None]:
# Re-split the full frame into train / validation / holdout with the project helper.
# This rebinds X_train, y_train, X_holdout and y_holdout from the earlier split.
# NOTE(review): judging by its name, the helper keeps duplicated rows inside the
# train split -- confirm the exact behaviour (and any encoding it applies) in helper.py.
X_train, X_val, X_holdout, y_train, y_val, y_holdout = split_with_dupe_rows_in_train(churn_df)

In [None]:
# Random forest with default hyper-parameters and a fixed seed, trained on the re-split data.
rf = RandomForestClassifier(random_state=41)
rf.fit(X=X_train, y=y_train)

In [None]:
# Recall on the validation split. scikit-learn metrics take (y_true, y_pred);
# with the arguments the other way round this call reports precision, not recall.
recall_score(y_val, rf.predict(X_val))

A 4% improvement in recall when I make sure the dupes are all on one side of the split! Let's go on and see what happens when we also try to address the class imbalance.