In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from imblearn.over_sampling import RandomOverSampler

from collections import Counter

from helper import clean_churn_df
from helper import model_baseline

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

plt.style.use('ggplot')
%matplotlib inline


%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
#import data
churn_df = pd.read_csv('HR_comma_sep.csv')
y = churn_df.left
clean_churn = clean_churn_df(churn_df)

In [62]:
#first baseline
X_train, X_holdout, y_train, y_holdout = train_test_split(clean_churn, y, \
                                                            test_size=0.2, random_state=41)
model_baseline(X_train,y_train)

(0.7883983461339721,
 0.9369950979592184,
 0.6623067351371648,
 0.7618968154913454,
 0.9484125643222046,
 0.9863323943430083)

In [63]:
#fix class imbalances

#Random OverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(clean_churn,y)
X_resampled_df = pd.DataFrame(X_resampled, columns=clean_churn.columns)
X_train, X_holdout, y_train, y_holdout = train_test_split(X_resampled_df, y_resampled, \
                                                            test_size=0.2, random_state=41)
model_baseline(X_train,y_train)

(0.7751588183448426,
 0.9368303586120321,
 0.6545065845963108,
 0.6817980444565711,
 0.941643061566529,
 0.9946948808944688)

In [64]:
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(X_resampled_df,y_resampled)
X_train, X_holdout, y_train, y_holdout = train_test_split(X_smoted, y_smoted, \
                                                            test_size=0.2, random_state=41)
model_baseline(X_train,y_train)

(0.7751588183448426,
 0.9368303586120321,
 0.6545065845963108,
 0.6817980444565711,
 0.941643061566529,
 0.9945854866629688)

In [65]:
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X_resampled_df,y_resampled)
X_train, X_holdout, y_train, y_holdout = train_test_split(X_adasyn, y_adasyn, \
                                                            test_size=0.2, random_state=41)

model_baseline(X_train,y_train)

(0.7751588183448426,
 0.9368303586120321,
 0.6545065845963108,
 0.6817980444565711,
 0.941643061566529,
 0.9948589348445382)