In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this installs a blanket "ignore" filter for every warning
# category for the rest of the session, so deprecation notices from the
# libraries imported below are hidden as well. Consider narrowing it to the
# specific categories/modules that are known to be noisy.
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from collections import Counter

from helper import clean_churn_df, model_baseline, model_baseline_no_cv, score_model_no_cv, score_model
from helper import split_with_dupe_rows_in_train, rf_no_cv_iterx

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

from sqlalchemy import create_engine

# Plot styling: use matplotlib's 'ggplot' style sheet for every figure.
plt.style.use('ggplot')
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline


# Emit inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Reload imported modules automatically so edits to the local helper module
# are picked up without restarting the kernel.
# Re-running this cell in a live kernel prints an "already loaded" notice
# (see the captured output); it is harmless.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Import data: read the `hr` table from Postgres and split it for modelling.
import os

# Take the connection URL from the CHURN_DB_URL environment variable so that
# real credentials never have to be written into the notebook. The fallback is
# the original local development database.
# NOTE(review): the fallback still embeds a username/password; drop it once
# every environment sets CHURN_DB_URL.
db_url = os.environ.get(
    'CHURN_DB_URL',
    'postgresql://test2:password@localhost:5432/churn2',
)
engine = create_engine(db_url)

churn_raw = pd.read_sql_query('select * from hr', con=engine)

# Drop the `index` column returned by the query. This is a non-mutating drop,
# so `churn_raw` keeps the untouched query result for inspection.
churn_df = churn_raw.drop('index', axis=1)

# Split into train / validation / holdout via the project helper.
# NOTE(review): the helper name suggests duplicated rows are kept in the
# training split only — confirm against helper.py.
X_train, X_val, X_holdout, y_train, y_val, y_holdout = split_with_dupe_rows_in_train(churn_df)

In [12]:
# First baseline: run the project's baseline helper on the original
# (not resampled) training split together with the validation split.
# The returned tuple of scores is displayed as the cell output; what each
# position means is defined by `model_baseline_no_cv` in helper.py.
model_baseline_no_cv(
    X_train, y_train,
    X_val, y_val,
)

(0.5753424657534246,
 0.7488888888888889,
 0.28934506353861195,
 0.3333333333333333,
 0.8329177057356608,
 0.9828571428571429)

In [5]:
#fix class imbalances

#Random OverSampler
# Resample the training split only; the validation split is passed to the
# helper unchanged.
ros = RandomOverSampler(random_state=0)
# `fit_sample` is the legacy imbalanced-learn spelling; `fit_resample` is the
# current API name for the same operation.
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# Re-attach the original column names. Whether the sampler returns a bare
# array or a DataFrame depends on the installed version; this handles both.
X_resampled_df = pd.DataFrame(X_resampled, columns=X_train.columns)
model_baseline_no_cv(X_resampled_df, y_resampled, X_val, y_val)

(0.4372163388804841,
 0.6731141199226306,
 0.2498011137629276,
 0.28313253012048195,
 0.7911832946635731,
 0.9692737430167597)

In [7]:
# SMOTE: oversample the training split, then re-run the baseline against the
# unchanged validation split.
# `fit_sample` is the legacy imbalanced-learn spelling; `fit_resample` is the
# current API name for the same operation.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(X_train, y_train)
# Pass a DataFrame carrying the training column names, matching how the
# RandomOverSampler run feeds the helper. `X_smoted` itself is left as
# returned by the sampler for any later use.
X_smoted_df = pd.DataFrame(X_smoted, columns=X_train.columns)
model_baseline_no_cv(X_smoted_df, y_smoted, X_val, y_val)

(0.43471810089020774,
 0.6830708661417323,
 0.2425431711145997,
 0.2818181818181818,
 0.792147806004619,
 0.9743589743589743)

In [9]:
# ADASYN: oversample the training split, then re-run the baseline against the
# unchanged validation split.
# `fit_sample` is the legacy imbalanced-learn spelling; `fit_resample` is the
# current API name for the same operation.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X_train, y_train)
# Pass a DataFrame carrying the training column names, matching how the
# RandomOverSampler run feeds the helper. `X_adasyn` itself is left as
# returned by the sampler for any later use.
X_adasyn_df = pd.DataFrame(X_adasyn, columns=X_train.columns)
model_baseline_no_cv(X_adasyn_df, y_adasyn, X_val, y_val)

(0.42,
 0.5844370860927153,
 0.2466403162055336,
 0.255050505050505,
 0.6881091617933723,
 0.9555555555555556)

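Class imbalance doesn't seem to be a huge deal: after resampling (random oversampling, SMOTE, ADASYN), even the best models do worse than the unresampled baseline on every score reported above.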