In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from imblearn.over_sampling import RandomOverSampler

from collections import Counter

from helper import clean_churn_df, model_baseline, model_baseline_no_cv, score_model_no_cv, score_model
from helper import split_with_dupe_rows_in_train

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

plt.style.use('ggplot')
%matplotlib inline


%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Read the HR CSV, keep its `left` column as `y`, and pass the frame through
# the project's clean_churn_df helper.
churn_df = pd.read_csv('HR_comma_sep.csv')
y = churn_df['left']
clean_churn = clean_churn_df(churn_df)

In [13]:
# First baseline: split the raw frame into train / validation / holdout parts
# with the project helper, then score the baseline models on the validation
# part. The helper's return value is the cell's displayed output.
(
    X_train, X_val, X_holdout,
    y_train, y_val, y_holdout,
) = split_with_dupe_rows_in_train(churn_df)
model_baseline_no_cv(X_train, y_train, X_val, y_val)

(0.5753424657534246,
 0.7488888888888889,
 0.28934506353861195,
 0.3333333333333333,
 0.8329177057356608,
 0.9801699716713881)

In [17]:
# Fix class imbalance: random over-sampling applied to the training split only;
# the validation split is passed to the baseline untouched.
#
# `fit_resample` is the supported imbalanced-learn entry point; the older
# `fit_sample` alias was deprecated in 0.4 and is absent from current releases.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Re-attach the training column names in case the sampler returned a bare array.
X_resampled_df = pd.DataFrame(X_resampled, columns=X_train.columns)
model_baseline_no_cv(X_resampled_df, y_resampled, X_val, y_val)

(0.4372163388804841,
 0.6731141199226306,
 0.2498011137629276,
 0.28313253012048195,
 0.7911832946635731,
 0.9803370786516854)

In [20]:
# SMOTE over-sampling of the training split, scored against the untouched
# validation split.
# `fit_resample` replaces the `fit_sample` alias, which was deprecated in
# imbalanced-learn 0.4 and is absent from current releases.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(X_train, y_train)
model_baseline_no_cv(X_smoted, y_smoted, X_val, y_val)

(0.4339339339339339,
 0.6577946768060836,
 0.25060048038430743,
 0.28643724696356276,
 0.7772727272727272,
 0.980225988700565)

In [65]:
# ADASYN over-sampling of the training split, followed by a fresh 80/20
# train/holdout split of the resampled data and a cross-validated baseline.
# `fit_resample` replaces the `fit_sample` alias, which was deprecated in
# imbalanced-learn 0.4 and is absent from current releases.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X_train, y_train)

# NOTE(review): this rebinds X_train / y_train / X_holdout / y_holdout. The
# cell is therefore not idempotent (a second run resamples already-resampled
# data), and the holdout produced here is drawn from ADASYN output rather than
# from the original split. Confirm this is intended before comparing these
# scores with the earlier cells.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_adasyn, y_adasyn, test_size=0.2, random_state=41
)

model_baseline(X_train, y_train)

(0.7751588183448426,
 0.9368303586120321,
 0.6545065845963108,
 0.6817980444565711,
 0.941643061566529,
 0.9948589348445382)