## Fit Machine Learning Models
- Train and Test Models using a Variety of Appropriate [Classification] Algorithms
- Tune (Optimize) Each Model's Hyperparameters
- Evaluate the Optimally-Fitted Models Against a Hold-Out (Out-of-Sample) Dataset
- Perform K-Fold Cross-Validation Using Each Model
- Determine Each Model's Accuracy and Identify the Best Model.

### Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd

import sklearn
print("SciKit-Learn Version:", sklearn.__version__)

# Import by capability rather than by comparing version strings: a string
# comparison is lexicographic (e.g. '0.9.0' >= '0.18.0' is True), so it can
# pick the wrong branch.
try:  # scikit-learn >= 0.18 provides everything in model_selection
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
except ImportError:  # legacy releases (< 0.18)
    from sklearn.cross_validation import train_test_split, cross_val_score
    # In legacy releases the search classes lived in grid_search,
    # not in cross_validation.
    from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from sklearn.metrics import roc_curve, auc, roc_auc_score

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

### Define Globals

In [None]:
# Fraction of the observations set aside as the hold-out (evaluation) set;
# passed as `test_size` when the data is first split.
hold_out_percent = 0.2
# Kind of classification problem being modelled. The helper functions in this
# notebook receive this value; their error message names 'Binary' and
# 'Multiple' as the supported choices.
CLASSIFICATION_TYPE = 'Binary'

###################################################################################################################
# Calculate the Null Accuracy Score: Accommodates Binary or Multiple Classification Type
###################################################################################################################
def null_accuracy_score(labels, classification_type):  #NOTE: 'labels' should be a pd.Series
    """Return the accuracy achieved by always predicting the most frequent class.

    Parameters
    ----------
    labels : pd.Series (or any 1-D sequence that pandas can wrap)
        The true class labels. Must be non-empty.
    classification_type : str
        Either 'Binary' or 'Multiple'.

    Returns
    -------
    float
        Share of the observations that belong to the most frequent class.

    Raises
    ------
    ValueError
        If `classification_type` is neither 'Binary' nor 'Multiple'.
    """
    # Validate against the literal type names. Comparing against the module
    # global CLASSIFICATION_TYPE in both branches made the second branch
    # unreachable and rejected the other (valid) type.
    if classification_type not in ('Binary', 'Multiple'):
        raise ValueError("Bad input {0}. Must specify either 'Binary' or 'Multiple'.".format(classification_type))

    labels = pd.Series(labels)

    # The null accuracy is the relative frequency of the majority class. For
    # 0/1 labels this equals max(mean, 1 - mean), and it also works for labels
    # that cannot be cast to int (e.g. 'Yes'/'No') and for multi-class labels.
    return labels.value_counts().max() / float(len(labels))
        

##################################################################################################################
# Print Evaluation Metrics: Accuracy and Null Accuracy Scores
###################################################################################################################
def show_accuracy(y_actuals, y_predictions, classification_type):
    """Print the accuracy of the predictions next to the null-accuracy baseline.

    After the two scores, the first ten actual labels and the first ten
    predictions are printed for a quick side-by-side look.
    `y_predictions` must support `.tolist()` on a slice (e.g. a NumPy array).
    """
    divider = '----------------------------------------------------------------------------------'

    # Score of the model and score of the "always predict the majority" baseline.
    model_score = accuracy_score(y_actuals, y_predictions)
    baseline_score = null_accuracy_score(y_actuals, classification_type)

    print(divider)
    print('Accuracy (The Percentage of Correct Predictions): %0.3f' % model_score)
    print(divider)
    print('Null Accuracy (Achieved by Always Predicting the Most Frequent Class): %0.3f' % baseline_score)
    print(divider)

    # Sample of the first ten observations: truth versus prediction.
    print('True:', list(y_actuals[:10]))
    print('Pred:', y_predictions[:10].tolist())
    print(divider + '\n')
        

##################################################################################################################
# Print Scree Plots: Compare the Explained Variances per Component Between Train/Test and Hold-Out Datasets
###################################################################################################################
def show_skree_plots(components, variances_explained):
    """Draw two stacked scatter plots of explained variance per PCA component.

    `variances_explained[0]` is drawn in the upper panel (Train/Test title)
    and `variances_explained[1]` in the lower panel (Hold-Out title). Both
    panels share the x-axis given by `components`.
    """
    _, panels = plt.subplots(nrows=2, sharex=True, figsize=(7, 7))
    panel_titles = ("PCA Explained Variances: Train/Test Data",
                    "PCA Explained Variances: Hold-Out Data")

    # Both panels are configured the same way; only title and data differ.
    for position, (panel, title) in enumerate(zip(panels, panel_titles)):
        panel.set_title(title)
        panel.scatter(components, variances_explained[position])
        panel.set_ylabel('Explained Variance (Eigenvalues)')
        panel.grid(True)

    # The x-axis is shared, so only the bottom panel carries its label.
    panels[1].set_xlabel('Number of Components (Eigenvectors)')

    plt.tight_layout()
    plt.show()
        

###################################################################################################################
# Print a Confusion Matrix:  Accommodates Binary or Multiple Classification Type
###################################################################################################################
def show_confusion_matrix(y_actuals, y_predictions, y_probabilities, classification_type):
    """Print a confusion matrix and, for binary problems, the ROC AUC.

    Parameters
    ----------
    y_actuals : array-like
        The true labels (rows of the matrix).
    y_predictions : array-like
        The predicted labels (columns of the matrix).
    y_probabilities : 2-D array
        Per-class scores; only column 1 is read, and only when
        `classification_type` is 'Binary'.
    classification_type : str
        'Binary' enables the AUC line; any other value skips it.
    """
    divider = '-------------------------------------------------------'

    print(divider)
    print('Confusion Matrix')
    print(divider)

    cm = pd.crosstab(y_actuals, y_predictions, rownames=['Actuals'], colnames=['Predictions'])
    print(cm)

    # Compare against the literal 'Binary' rather than the module global
    # CLASSIFICATION_TYPE: callers pass that global in, so the old comparison
    # was always true and attempted a binary AUC on multi-class problems.
    if classification_type == 'Binary':
        print(divider)
        print('Area Under the Curve (AUC): %0.3f' % roc_auc_score(y_actuals, y_probabilities[:, 1]))

    print(divider + '\n')
    
    
###################################################################################################################
# Print a Classification Report
###################################################################################################################
def show_classification_report(y_actuals, y_predictions):
    """Print the classification report for the predictions, framed by rule lines."""
    divider = '-------------------------------------------------------'

    # Header block.
    for header_line in (divider, 'Classification Report', divider):
        print(header_line)

    # Body: the report text produced by `classification_report`.
    print(classification_report(y_actuals, y_predictions))
    print(divider + '\n')
    
    
###################################################################################################################
# Print a Classification Report Resulting from K-Fold Cross-Validation
###################################################################################################################
def show_cv_classification_report(classifier, X, y, K):
    """Print per-fold, mean and standard deviation of four cross-validated scores.

    The classifier is cross-validated with `cv=K` once per metric, in this
    order: accuracy, weighted precision, weighted recall, weighted F1.
    """
    divider = '--------------------------------------------------------------------------------------------'

    # One row per metric: (scoring name, per-fold label, mean format, std format).
    metric_table = (
        ('accuracy',
         "Accuracy per Fold: ",
         "Average Accuracy: %0.3f",
         "Standard Deviation of Accuracy: %0.3f"),
        ('precision_weighted',
         "Precision per Fold: ",
         "Average Precision: %0.3f",
         "Standard Deviation of Precision: %0.3f"),
        ('recall_weighted',
         "Recall per Fold: ",
         "Average Recall: %0.3f",
         "Standard Deviation of Recall: %0.3f"),
        ('f1_weighted',
         "F1 per Fold: ",
         "Average F1: %0.3f",
         "Standard Deviation of F1: %0.3f"),
    )

    for position, (scoring, fold_label, mean_format, std_format) in enumerate(metric_table):
        fold_scores = cross_val_score(classifier, X, y, scoring=scoring, cv=K)

        # The opening rule is printed once, after the first metric is computed.
        if position == 0:
            print(divider)

        print(fold_label, fold_scores)
        print(mean_format % fold_scores.mean())
        print(std_format % fold_scores.std())
        print(divider)
    

###################################################################################################################
# Print a Receiver Operating Characteristic (ROC) Curve Plot (Binary Classification Only).
###################################################################################################################
def show_roc_plot(y_actuals, y_probabilities, classifier_algorithm):
    """Plot the ROC curve for a binary classifier, with its AUC in the legend.

    Column 1 of `y_probabilities` is used as the score of the positive class.
    `classifier_algorithm` is a string that is inserted into the plot title.
    """
    plot_title = 'ROC Curve (' + classifier_algorithm + ')'
    plt.figure(figsize=(5,5))
    plt.title(plot_title)

    # Curve coordinates and the area under the curve.
    positive_scores = y_probabilities[:, 1]
    false_positive_rate, true_positive_rate, _ = roc_curve(y_actuals, positive_scores)
    area = roc_auc_score(y_actuals, positive_scores)

    # The model's curve, plus the dashed diagonal as a visual reference.
    plt.plot(false_positive_rate, true_positive_rate, color='darkorange',
             label='ROC Curve (area = %0.2f)' % area)
    plt.plot([0,1], [0,1], color='steelblue', lw=2, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()
    

###################################################################################################################
# Save a Trained Model to a Pickle File.
###################################################################################################################
def export_model(model, algorithm_name):
    """Pickle `model` to 'Customer_Churn_<algorithm_name>.pkl' in the working directory.

    Parameters
    ----------
    model : object
        The (trained) model to serialize; any picklable object.
    algorithm_name : str
        Inserted into the file name to identify the algorithm.
    """
    # Imported locally because no file-level import of pickle is visible.
    import pickle

    file_name = "Customer_Churn_{0}.pkl".format(algorithm_name)
    target_path = os.path.join(os.getcwd(), file_name)

    # The context manager closes the file even when pickling fails.
    with open(target_path, 'wb') as file_target:
        # Dump the model that was passed in, not the notebook-global `rfc`.
        pickle.dump(model, file_target, protocol=pickle.HIGHEST_PROTOCOL)

### Load Data

In [None]:
# The workbook is expected in a 'Data' folder below the working directory.
data_dir = os.path.join(os.getcwd(), 'Data')
source_file = os.path.join(data_dir, 'WA-Telco-Customer-Churn-ML.xlsx')

# Load the sheet (its first row holds the column names) and discard the
# unique customer identifier in one go.
df = pd.read_excel(source_file, header=0).drop(labels=['customerID'], axis=1)

# Report the (rows, columns) shape of the resulting DataSet.
print("DataSet Dimensions:", df.shape)

#### Make Appropriate Data Type Assignments.

In [None]:
# Columns that are treated as categorical variables.
categorical_columns = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'Churn',
]
for column in categorical_columns:
    df[column] = df[column].astype('category')

# Numeric columns: values that cannot be parsed become NaN (errors='coerce'),
# and the result is downcast to the smallest fitting integer/float type.
df['tenure'] = pd.to_numeric(df['tenure'], downcast='integer', errors='coerce')
for column in ('MonthlyCharges', 'TotalCharges'):
    df[column] = pd.to_numeric(df[column], downcast='float', errors='coerce')

# Show the resulting data type of every column.
df.dtypes

#### Separate the Independent (Predictor) Variables (X) from the Dependent (Target, Response, or Label) Variable (y)

In [None]:
# 'Churn' is the label (y); every remaining column is a predictor (X).
target_column = 'Churn'
y = df[target_column]
X = df.drop(labels=[target_column], axis=1)

#### Create a Hold-Out Data Set for Model Evaluation

In [None]:
# Divide into Train/Test and Evaluation (Hold-Out) Sets.
# Carve the hold-out (evaluation) share off the data; X and y are rebound to
# the remaining train/test portion.
X, X_eval, y, y_eval = train_test_split(X, y, test_size=hold_out_percent, random_state=42)

divider = '-------------------------------------------------------------------'

# Shapes of the two partitions: the train/test pool first, then the rows
# held out for the final evaluation.
print(divider)
print("Train/Test Observations:", X.shape)
print("Evaluation Observations:", X_eval.shape)
print(divider + '\n')

# Class counts in each partition, to compare their label distributions.
print("Train/Test Label Distribution:\n", y.value_counts())
print(divider)
print("Evaluation Label Distribution:\n", y_eval.value_counts())
print(divider)

#### Create Training and Testing Datasets 

In [None]:
# Split the train/test pool again: test_size=0.2 keeps 80% of the pool for
# training and uses the remaining 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

divider = '-------------------------------------------------------------------'

# Shapes of the training and the testing partition.
print(divider)
print("Training Observations:", X_train.shape)
print("Testing Observations:", X_test.shape)
print(divider + '\n')

# Class counts in each partition, to compare their label distributions.
print("Training Label Distribution:\n", y_train.value_counts())
print(divider)
print("Testing Label Distribution:\n", y_test.value_counts())
print(divider)

#### Perform Feature Scaling (Standardization) of Each Dataset.

In [None]:
sc = StandardScaler()

# Training/testing pair: the scaler is fitted on the training rows and the
# same fit is applied to the test rows.
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

# Train-test pool/hold-out pair: the scaler is re-fitted on the whole pool and
# that fit is applied to the hold-out rows. After this line `sc` holds the
# statistics of the pool, no longer those of the training rows.
X, X_eval = sc.fit_transform(X), sc.transform(X_eval)

### Use Principal Component Analysis (PCA) to Reduce Dimensionality

In [None]:
# Define the Number of Components to Target
# Number of principal components each PCA in this notebook is asked to keep.
n_components = 18

# Collects one array of explained-variance ratios per fitted PCA; the cells
# below append to it and the scree plots read entries 0 and 1.
# NOTE: re-running this cell empties the list again.
variances = []

##### Perform PCA Feature Reduction on the Train/Test Data

In [None]:
%%time

pca = PCA(n_components=n_components)

# Fit the projection on the training rows and apply the same projection to
# the test rows; both results are ndarrays.
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Record the share of variance explained by each component and keep a handle
# on it for printing.
variances.append(pca.explained_variance_ratio_)
explained_variance = variances[-1]

# Per-component ratios, followed by their total.
divider = '-------------------------------------------------------------------'
print(explained_variance)
print(divider)
print("Total Variance Explained: %0.4f" % sum(explained_variance))
print(divider)

##### Perform PCA Feature Reduction on the Hold-Out Data

In [None]:
%%time

pca_holdout = PCA(n_components=n_components)

# Fit the projection on the train/test pool (X) and apply the same projection
# to the hold-out rows; both results are ndarrays.
X, X_eval = pca_holdout.fit_transform(X), pca_holdout.transform(X_eval)

# Record the share of variance explained by each component and keep a handle
# on it for printing.
variances.append(pca_holdout.explained_variance_ratio_)
explained_variance_holdout = variances[-1]

# Per-component ratios, followed by their total.
divider = '-------------------------------------------------------------------'
print(explained_variance_holdout)
print(divider)
print("Total Variance Explained: %0.4f" % sum(explained_variance_holdout))
print(divider)

##### Use Scree Plots to Illustrate the Explained Variance of Each Component

In [None]:
# Get the range of components
# Component numbers 1..n_components serve as the x-axis values.
components = [index + 1 for index in range(n_components)]

# Draw the two scree plots so the variance distributions can be compared.
show_skree_plots(components=components, variances_explained=variances)

### Train a Classification Model Using the RandomForestClassifier

In [None]:
%%time

# Train using the Training data.
# Fit a random forest with default hyper-parameters on the training data
# (`fit` returns the estimator itself, so construction and fitting chain).
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions and per-class probabilities for the test data.
y_pred, y_probs = rfc.predict(X_test), rfc.predict_proba(X_test)

#### Evaluate the Model's Efficacy using Metrics, a Confusion Matrix and Classification Report

In [None]:
# Accuracy against the null-accuracy baseline.
show_accuracy(y_actuals=y_test, y_predictions=y_pred, classification_type=CLASSIFICATION_TYPE)

# Confusion matrix (plus AUC where applicable) and the per-class report.
show_confusion_matrix(y_actuals=y_test, y_predictions=y_pred,
                      y_probabilities=y_probs, classification_type=CLASSIFICATION_TYPE)
show_classification_report(y_actuals=y_test, y_predictions=y_pred)

# A ROC curve is only drawn for binary problems.
if CLASSIFICATION_TYPE == 'Binary':
    show_roc_plot(y_actuals=y_test, y_probabilities=y_probs,
                  classifier_algorithm='Random Forests Classifier')

#### Perform Hyper-Parameter Tuning for the Random Forests Classifier

In [None]:
%%time

# Define the parameter values to be searched
estimator_range = list(range(2, 21))
criterion_options = ['gini','entropy']
max_features_options = [None, 'auto','sqrt','log2']
max_depth_range = list(range(2, 21))

# Create parameter distributions
param_dist = dict(n_estimators = estimator_range
                  , max_features = max_features_options
                  , criterion = criterion_options
                  , max_depth = max_depth_range
                 )

# Instantiate the grid
rand = RandomizedSearchCV(rfc, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=42, n_jobs=-1) 
rand.return_train_score=False

# Finally, fit the grid
rand.fit(X_train, y_train)

In [None]:
# View the results
# Tabulate the search results and show the score summary for each candidate.
results_frame = pd.DataFrame(rand.cv_results_)
results_frame[['mean_test_score', 'std_test_score', 'params']]

In [None]:
param_estimators = rand.cv_results_['params']

# Pair every candidate's n_estimators with its own mean test score BEFORE
# sorting. Sorting the estimator counts on their own detaches them from the
# scores, so a plot of one against the other shows mismatched points.
# Sorting by the count only keeps candidates with equal counts in their
# original order.
paired = sorted(
    zip([params['n_estimators'] for params in param_estimators],
        rand.cv_results_['mean_test_score']),
    key=lambda pair: pair[0],
)

# Both sequences now follow ascending n_estimators and stay aligned.
estimators = [n_trees for n_trees, _ in paired]
dist_mean_scores = np.array([score for _, score in paired])

print('-------------------------------------------------------------------')
print('Mean Test Scores:', dist_mean_scores)
print('-------------------------------------------------------------------')
print('Estimator Values:', estimators)
print('-------------------------------------------------------------------')

In [None]:
# Plot the scores
plt.title('Mean Test Scores by Number of Estimators')
plt.plot(estimators, dist_mean_scores, color='steelblue')
plt.xlabel('Number of Estimators')
plt.ylabel('Mean Test Scores')

In [None]:
# Examine the best score
print(rand.best_score_)
print(rand.best_params_)
print(rand.best_estimator_)

#### Use the Best Hyper-Parameters to Make Predictions Using the Hold-Out Data

In [None]:
%%time

# Pull the winning hyper-parameters out of the randomized search.
best_params = rand.best_params_
n_estimators = best_params['n_estimators']
max_feature = best_params['max_features']
max_depth = best_params['max_depth']
criteria = best_params['criterion']

# Re-train a fresh forest with those hyper-parameters on the full train/test
# pool (X, y).
rfc = RandomForestClassifier(n_estimators = n_estimators
                             , max_features = max_feature
                             , criterion = criteria
                             , max_depth = max_depth
                             , random_state = 42
                            )
rfc.fit(X, y)

# Predict with the model that was just fitted. X and X_eval were projected by
# `pca_holdout`, whereas the estimator inside `rand` was fitted on X_train
# projected by `pca` -- predicting through `rand` would score the hold-out
# rows in the wrong feature space and ignore the refit above.
y_pred = rfc.predict(X_eval)
y_probs = rfc.predict_proba(X_eval)

#### Evaluate the Model's Efficacy using Metrics, a Confusion Matrix and Classification Report

In [None]:
# Accuracy against the null-accuracy baseline, on the hold-out data.
show_accuracy(y_actuals=y_eval, y_predictions=y_pred, classification_type=CLASSIFICATION_TYPE)

# Confusion matrix (plus AUC where applicable) and the per-class report.
show_confusion_matrix(y_actuals=y_eval, y_predictions=y_pred,
                      y_probabilities=y_probs, classification_type=CLASSIFICATION_TYPE)
show_classification_report(y_actuals=y_eval, y_predictions=y_pred)

# A ROC curve is only drawn for binary problems.
if CLASSIFICATION_TYPE == 'Binary':
    show_roc_plot(y_actuals=y_eval, y_probabilities=y_probs,
                  classifier_algorithm='Random Forests Classifier')

#### Use K-Fold Cross-Validation to Detect Overfitting

In [None]:
%%time

# 10-fold cross-validation of the tuned forest on the train/test pool.
show_cv_classification_report(classifier=rfc, X=X, y=y, K=10)

#### Export the Best Fit Model to a Pickle file.

In [None]:
# Persist the tuned forest; the algorithm name becomes part of the file name.
export_model(model=rfc, algorithm_name="RandomForestsClassifier")