<a href="https://colab.research.google.com/github/Nolanole/DS-Unit-2-Sprint-3-Classification-Validation/blob/master/Josh_Mancuso_Wednesday_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ASSIGNMENT

#### Bank Marketing
- Try the `class_weight` parameter.
- Explore and visualize your data. 
- Wrangle [bad data](https://github.com/Quartz/bad-data-guide), outliers, and missing values.
- Try engineering more features. You can transform, bin, and combine features. 
- Try selecting fewer features.


#### Imbalanced Classes demo with synthetic data
- Play around with the demo. Change parameter values.
- Be able to calculate precision, recall, F1, and accuracy "by hand", given a confusion matrix and access to Wikipedia.

# STRETCH
- Read the blog post, [Visualizing Machine Learning Thresholds to Make Better Business Decisions](https://blog.insightdatascience.com/visualizing-machine-learning-thresholds-to-make-better-business-decisions-4ab07f823415). You can replicate the code as-is,  ["the hard way"](https://docs.google.com/document/d/1ubOw9B3Hfip27hF2ZFnW3a3z9xAgrUDRReOEo-FHCVs/edit). Or you can apply it to the Bank Marketing dataset.
- Try the [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) library.
- Try other [scikit-learn classifiers](https://scikit-learn.org/stable/supervised_learning.html), beyond Logistic Regression.

In [0]:
# Imports
%matplotlib inline
import warnings
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import graphviz
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import classification_report, confusion_matrix
# Silence DataConversionWarning messages for the rest of the notebook.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Remove pandas' display truncation limits for both columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


In [0]:
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip

In [0]:
#!unzip bank-additional.zip

In [0]:
# Load the semicolon-delimited "full" Bank Marketing CSV into a DataFrame,
# then discard the `duration` column so it is never available as a feature.
bank = pd.read_csv('bank-additional/bank-additional-full.csv', sep=';')
bank = bank.drop('duration', axis='columns')

In [5]:
# Baseline: always predicting "no" is right about 88.7% of the time.
bank['y'].value_counts(normalize=True)

no     0.887346
yes    0.112654
Name: y, dtype: float64

In [0]:
# Bucket `pdays` into six right-closed intervals and overwrite the numeric
# column with the resulting categorical one.
bins = pd.IntervalIndex.from_breaks(
    [-1, 3.1, 5.1, 7.1, 14.1, 30.1, 1000],
    closed='right',
)

bank['pdays'] = pd.cut(bank['pdays'], bins)

In [0]:
def convert_categoricals(df, cat_cols):
  """Return a copy of `df` with every column in `cat_cols` integer-encoded.

  Each listed column is converted to a pandas Categorical and replaced by
  that categorical's codes. The input frame itself is left untouched.
  """
  encoded = df.copy()
  for name in cat_cols:
    encoded[name] = pd.Categorical(encoded[name]).codes
  return encoded
  

In [0]:
# Columns to integer-encode: the categorical features, the binned `pdays`,
# and the target `y`.
cat_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome', 'pdays', 'y',
]

bank = convert_categoricals(df=bank, cat_cols=cat_cols)

In [0]:
# Standardize the numeric features and stitch them back together with the
# already integer-coded categorical columns.
# NOTE: the scaler is fit on every row of `bank`, i.e. before the train/test
# split made further down, so held-out rows influence the scaling statistics.
scaler = StandardScaler()
numeric_cols = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
                'cons.conf.idx', 'euribor3m', 'nr.employed']
# Reuse bank's index so the column-wise concat below lines up row-for-row even
# if `bank` ever stops having a default RangeIndex (e.g. after filtering rows).
scaled = pd.DataFrame(scaler.fit_transform(bank[numeric_cols]),
                      columns=numeric_cols, index=bank.index)
cat_bank = bank[cat_cols]

scaled_bank = pd.concat([scaled, cat_bank], axis=1)

In [0]:
# Run the categorical columns of the combined frame through the encoder.
# NOTE(review): these columns were already integer-coded before the concat,
# so this second pass looks redundant -- confirm before removing.
scaled_bank = convert_categoricals(df=scaled_bank, cat_cols=cat_cols)

In [11]:
# Check dtypes and non-null counts of the assembled modelling frame.
scaled_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
age               41188 non-null float64
campaign          41188 non-null float64
previous          41188 non-null float64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
job               41188 non-null int8
marital           41188 non-null int8
education         41188 non-null int8
default           41188 non-null int8
housing           41188 non-null int8
loan              41188 non-null int8
contact           41188 non-null int8
month             41188 non-null int8
day_of_week       41188 non-null int8
poutcome          41188 non-null int8
pdays             41188 non-null int8
y                 41188 non-null int8
dtypes: float64(8), int8(12)
memory usage: 3.0 MB


In [12]:
# Separate the target column from the features.
y = scaled_bank['y']
X = scaled_bank.drop('y', axis='columns')

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit a plain logistic regression and predict on the held-out rows.
log_reg = LogisticRegression(max_iter=1000, solver='lbfgs')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# ROC AUC computed from the second predict_proba column.
roc_auc_score(y_test, y_pred_proba)

0.7955460752865701

In [13]:
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
accuracy_score(y_test, y_pred)

0.9002184996358339

In [0]:
# Make pipeline
# Logistic-regression pipeline: one-hot encode -> standardize -> classify.
# NOTE(review): the frame fed to this pipeline is already integer-coded and
# scaled, and the pipeline's recorded accuracy is identical to the bare
# LogisticRegression's (0.9002184996358339), so the encoder is presumably
# passing every column through untouched -- confirm which columns it encodes.
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    StandardScaler(), 
    LogisticRegression(solver='lbfgs', max_iter=1000)
)

#y_pred_proba = cross_val_predict(pipeline, X_train, y_train, cv=3, n_jobs=-1, 
#                                 method='predict_proba')[:,1]

def test_pipeline(pipeline, X, y, test_size=0.2, random_state=42):
  """Fit `pipeline` on a stratified train split and print hold-out metrics.

  Parameters
  ----------
  pipeline : estimator exposing fit / predict / predict_proba
  X : feature matrix
  y : target labels; also used to stratify the split
  test_size : fraction of rows held out for evaluation (default 0.2)
  random_state : seed for the split (default 42)

  Prints the accuracy and the ROC AUC (from the second predict_proba column)
  measured on the held-out rows. Returns None.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state, stratify=y)

  pipeline.fit(X_train, y_train)

  # Metric functions take (y_true, y_pred) in that order.
  y_pred = pipeline.predict(X_test)
  print('Accuracy score: ' + str(accuracy_score(y_test, y_pred)))
  y_pred_proba = pipeline.predict_proba(X_test)[:,1]
  print('Roc_auc_score: ' + str(roc_auc_score(y_test, y_pred_proba)))

In [15]:
# Score the logistic-regression pipeline on the current X / y.
test_pipeline(pipeline, X, y)

Accuracy score: 0.9002184996358339
Roc_auc_score: 0.7955491709514599


In [19]:
# Same preprocessing steps as the logistic-regression pipeline, with a
# decision tree as the final estimator. random_state pins the tree's internal
# randomness so that re-running the notebook prints the same scores.
tree_pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    DecisionTreeClassifier(random_state=42)
)

# Build the full feature matrix explicitly instead of relying on the global X:
# a later cell rebinds X to a feature subset, so running cells out of order
# would silently score the tree on the wrong columns.
test_pipeline(tree_pipeline, scaled_bank.drop(columns='y'), scaled_bank['y'])

Accuracy score: 0.8914785142024764
Roc_auc_score: 0.6520068753243077


In [16]:
# Out-of-fold probabilities (second predict_proba column) from 3-fold
# cross-validation on the training split.
y_pred_proba = cross_val_predict(
    pipeline, X_train, y_train,
    cv=3, n_jobs=-1, method='predict_proba',
)[:, 1]

# Call a row positive only when its probability reaches the threshold.
threshold = 0.90
y_pred = (y_pred_proba >= threshold)

print(classification_report(y_train, y_pred))

# Confusion matrix as a labelled table: rows are actual, columns are predicted.
pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     29238
           1       1.00      0.00      0.00      3712

   micro avg       0.89      0.89      0.89     32950
   macro avg       0.94      0.50      0.47     32950
weighted avg       0.90      0.89      0.83     32950



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,29238,0
Actual Positive,3711,1


In [18]:
# Try fewer features:
features = [
    'day_of_week', 'month', 'housing', 'loan',
    'education', 'poutcome', 'pdays', 'contact',
]

# NOTE: this rebinds the global X (and y) to the reduced feature set.
y = scaled_bank['y']
X = scaled_bank.loc[:, features]

test_pipeline(pipeline, X, y)


Accuracy score: 0.8953629521728574
Roc_auc_score: 0.6938461130241993


In [20]:
# Score the decision-tree pipeline on the current X / y.
test_pipeline(tree_pipeline, X, y)

Accuracy score: 0.8915999028890508
Roc_auc_score: 0.6521327656964951
