In [None]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, precision_recall_curve, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import pickle
import timeit
import pprint
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Sections of the report 

-	Introduction
-	Methods
    -	Cleaning the data and creating new input features
    - Analysing and visualising the data
    - Preparing the inputs and choosing suitable features
    - Selecting and training a model
-	Evaluation

-	Conclusion

## 1. Introduction

The telecommunications sector has become one of the main industries in developed countries. Technical progress and the increasing number of operators globally have made the industry highly competitive. Companies are working hard to survive in this competitive market by relying on multiple strategies.

There are often three main strategies for generating more revenue within a business: 
1) Acquiring new customers
2) Upselling existing customers 
3) Increase the retention period of customers

However, comparing these strategies by their return on investment (RoI) shows that the third strategy is the most profitable. 
The reason is that retaining an existing customer costs much less than acquiring a new one, and it is also considered much easier than upselling. 
To apply the third strategy, we need to reduce the potential for customer churn by putting systems in place to do so. This is why exploring machine learning techniques for predicting customer churn can provide huge financial benefits to companies. 

## 2. Method

### 2.1 Loading and cleaning the data set

In [None]:
# Load the datasets
train = pd.read_csv('proposal_docs/dataset/train.csv', index_col= False)
test = pd.read_csv('proposal_docs/dataset/test.csv', index_col= False)



### Cleaning the dataset 

a) **Check for Null-values and inconsistent data types:** Looking at the dataset, we can see that each column has the same number of non-null values, indicating that there are **no instances** of missing data. 

However, the columns have the data types object, int64 and float64, indicating a mixture of categorical and numerical variables.

In [None]:
# Check for null values and column data types.
# `DataFrame.info` prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which only added a stray "None").
# `show_counts` is the current name of the keyword formerly called `null_counts`.
train.info(show_counts=True)
test.info(show_counts=True)

In [None]:
train.head()

In [None]:
train.describe()

b) **Check for duplicate values**: No duplicate values were found

In [None]:
datasets = [train, test]

# Print, for each dataset in turn, how many rows are exact duplicates of an earlier row.
for frame in datasets:
    duplicate_mask = frame.duplicated()
    print(int(duplicate_mask.sum()))

### 2.1.2 Create new input features
- Created additional features `total_minutes`, `total_calls` and `total_charge` to observe correlations and see whether they affect the model as well
- One-hot encoded the categorical features in order to create a correlation matrix of all features
- Label encoded the target variable 

In [None]:
def add_total_minutes_calls_charge(dataset):
    """
    Adds the columns `total_minutes`, `total_calls` and `total_charge` to a churn dataset.

    Each total is the sum of the matching day, evening, night and international
    columns (e.g. `total_day_minutes` + `total_eve_minutes` + ...).

    :dataset: churn dataframe holding the `total_<period>_<measure>` columns
    :return: the same dataframe object, modified in place with the three new columns
    """
    periods = ('day', 'eve', 'night', 'intl')
    for measure in ('minutes', 'calls', 'charge'):
        day, eve, night, intl = (dataset[f'total_{period}_{measure}'] for period in periods)
        dataset[f'total_{measure}'] = day + eve + night + intl
    return dataset

train = add_total_minutes_calls_charge(train)
test  = add_total_minutes_calls_charge(test)

In [None]:
# Convert the Y_train to 0 and 1 

le = LabelEncoder()
train['churn'] = le.fit_transform(train['churn'])

In [None]:
# Convert the categorical variables to one-hot encoding
cols = train.columns

# Split the columns into numerical and categorical ones.
# The categorical list is built by filtering `cols` in order rather than via a
# set difference: set iteration order for strings can change between Python
# sessions, which would shuffle the dummy columns (and therefore the feature
# order) from one run to the next.
num_cols = train.select_dtypes(include=['number', 'bool']).columns
cat_cols = [col for col in cols if col not in num_cols]

# One-hot encode the testing and training set
train_one_hot = pd.get_dummies(data=train, columns=cat_cols)
test_one_hot = pd.get_dummies(data=test, columns=cat_cols)

# Feature columns are every encoded training column except the target
feature_cols = [x for x in train_one_hot.columns if x != 'churn']


### 2.1.3 Create a validation set

Despite there already being a test set in place, a preliminary validation set was created to evaluate the performance of the model and to see how well the model could
generalise on unseen data. The training data was split with a 75/25 split (`test_size=0.25`) in stratified fashion.


In [None]:
# Create X and y datasets for machine learning purposes
X_train, y_train = train_one_hot[feature_cols], train['churn']

# Align the test set to the training feature columns. A category level that never
# occurs in the test data has no dummy column there; reindex adds it as all zeros
# instead of raising a KeyError, and drops any column that is not a training feature.
X_test = test_one_hot.reindex(columns=feature_cols, fill_value=0)

# Split the training set (stratified on the target so both parts keep the class ratio)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train, random_state=27)

# Reset the indexes so the train and validation parts are numbered from 0 again
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)


### 2.2 Analysing and Visualising the dataset

#### 2.2.1 Visualising the class distribution


In [None]:
class_value_counts = y_train.value_counts()
classes = np.array(class_value_counts.keys())
class_freq = class_value_counts.values
title = 'Churn'

def get_pct(array, value):
    """
    Formats `value` as a percentage of the total of `array`.

    :array: array-like exposing `.sum()`
    :value: the number whose share of the total is wanted
    :return: string of the form " (12.34%)", including the leading space
    """
    share = value / array.sum()
    return f" ({share:.2%})"
#
def plot_class_dist(data,classes, title):
    """
    Plots a horizontal bar chart of the frequency of each class and annotates
    every bar with its count and its share of the total.
    Plots the images seen in Figure 2 of the report.

    :data: array of class frequencies (one entry per class)
    :classes: class labels, in the same order as `data`
    :title: dataset name inserted into the figure title
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    width = 0.75  # the width of the bars
    ind = np.arange(len(data))  # the y locations for the groups
    ax.barh(ind, data, width, color="blue")
    # barh centres each bar on its y location by default, so the class label
    # belongs at `ind` itself; offsetting by width/2 pushed it to the bar's edge.
    ax.set_yticks(ind)
    ax.set_yticklabels(classes, minor=False)
    plt.title("Class Frequency Distribution of the " + title +  " Dataset")
    plt.xlabel('Frequency')
    plt.ylabel('Classes')
    for i, v in enumerate(data):
        # Count plus percentage, written just to the right of the end of the bar
        ax.text(v + 3, i + .25, str(v) + get_pct(data,v), color='green', fontweight='bold')
    plt.show()
    
plot_class_dist(class_freq, classes, title)

It is evident that the churn dataset is highly imbalanced, with about 85% of the customers not churned and only about 14% churned. 

#### 2.2.2 Visualising the distributions of the numerical features

Most of the features are approximately normally distributed, so they aren't heavily right-skewed or left-skewed. However, number_service_calls, total international calls and number_vmail_messages are more right-skewed than the others.

In [None]:
X_train.hist(figsize = (20,20))

#### 2.2.3 Correlation matrix 



In [None]:
corr = train_one_hot.corr()

top_features_corr = corr['churn'].sort_values(ascending = False).head(10)
top_features = top_features_corr.index.values
top_features_corr

In [None]:
bot_features_corr = corr['churn'].sort_values(ascending = True).head(10)
bot_features = list(bot_features_corr.index.values)
bot_features.append('churn')
bot_features_corr

In [None]:
def correlation_heatmap(train):
    """
    Draws an annotated heatmap of the pairwise correlations between the columns
    of the given dataframe and shows it.

    :train: dataframe whose column correlations are plotted
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        train.corr(),
        ax=ax,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=.5,
        annot=True,
        cbar_kws={"shrink": .70},
    )
    plt.show()
    
correlation_heatmap(train_one_hot[top_features])


In [None]:
correlation_heatmap(train_one_hot[bot_features])

#### 2.2.3 Plots vs churn

In [None]:
sns.countplot(data=train, x="international_plan", hue="churn")


In [None]:
sns.countplot(x=train["churn"])

In [None]:
sns.histplot(data=train, x="total_charge", hue="churn")

In [None]:
sns.histplot(data=train, x="total_minutes", hue="churn")


### 2.2.4 Plot the scatterplots - maybe consider plotting a pair plot as well

In [None]:
train.plot(kind='scatter', x='total_charge', y='total_day_minutes', alpha=0.5)


## 2.3 Preparing the inputs and choosing suitable features 

In order to pick the best features, I'll create a few datasets with subsets of features and then train a model using 5 fold cross validation to find 
the best dataset to use for hyperparameter tuning.

These are the datasets that will be created: 

- Full dataset
- Top 10 features that correlated with churn
- Recursive feature elimination (RFE)



In [None]:
# Base estimators combined by the soft-voting ensemble below
voters = [('lgbm', LGBMClassifier(verbosity = -1, num_threads = 8)),
          ('rf', RandomForestClassifier()),
          ('xgb', XGBClassifier(verbosity = 0)),
          ('cat', CatBoostClassifier(verbose=True))]


# Candidate models with their default parameters, keyed by a short name.
# The keys become column headers in the result CSVs and labels in the box plots,
# so the voting classifier is keyed 'vc' (the previous key carried a stray colon).
models = {'logreg': LogisticRegression(),
          'lgbm': LGBMClassifier(verbosity = -1, num_threads = 8),
          'xgb': XGBClassifier(verbosity = 0),
          'rf': RandomForestClassifier(n_jobs = -1),
          'cat': CatBoostClassifier(verbose=True),
          'vc': VotingClassifier(estimators=voters, voting='soft')}


In [None]:
# catboost, xgboost, logistic regression, lgbm, mlp classifier, random forest

def evaluate_rfe(RFEClassifier, X_train, y_train):
    """
    Scores a pipeline of standard scaling, cross-validated recursive feature
    elimination driven by `RFEClassifier`, and a final LGBM classifier.

    :RFEClassifier: estimator RFECV uses to rank and eliminate features
    :X_train: training features
    :y_train: training target
    :returns: array of ROC AUC scores, one per fold of 5-fold stratified CV repeated 3 times
    """
    steps = [
        ('scaler', StandardScaler()),
        ('feature_selection', RFECV(estimator=RFEClassifier)),
        ('model', LGBMClassifier()),
    ]
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    return cross_val_score(
        Pipeline(steps=steps),
        X_train,
        y_train,
        scoring='roc_auc',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

scores, names = list(), list()
eval_rfe_results = {}

for model_name, estimator in models.items():
    # RFECV ranks features through the estimator's `coef_` or
    # `feature_importances_`. A VotingClassifier exposes neither, so it cannot
    # drive the elimination and would abort the whole loop (error_score='raise').
    if isinstance(estimator, VotingClassifier):
        print('Skipping %s: it exposes no feature importances for RFE' % model_name)
        continue
    print(model_name)
    score = evaluate_rfe(estimator, X_train, y_train)
    scores.append(score)
    names.append(model_name)
    eval_rfe_results[model_name] = score
    print('>%s %.3f (%.3f)' % (model_name, mean(score), std(score)))

# One column of fold scores per evaluated model
results = pd.DataFrame(eval_rfe_results)
results.to_csv('results/eval_rfe_results.csv', index=False, header=True)


In [None]:
def plot_box_plot(X, labels, xlabel, ylabel, title, filename, figsize, t=None):
    """
    Plots a boxplot of values while annotating the mean and standard deviation
    of every column, saves the figure to `filename` and shows it.

    :X: The input data, a 2-D array with one column per box
    :labels: column labels of the input data
    :xlabel: x axis label
    :ylabel: y axis label
    :title: figure title
    :filename: Name of the figure file
    :figsize: size of the boxplot
    :t: optional vector of fit times, one per column; when omitted (None) the
        time line is left out of the annotation
    """

    # Plot boxplot
    fig, ax = plt.subplots(figsize=figsize)
    bp = plt.boxplot(X, labels=labels, showmeans=True)

    # Annotate the boxplot with mean and std (and fit time when available).
    # Local names avoid shadowing the `mean` / `std` imported from numpy.
    col_mean = X.mean(axis=0)
    col_std = X.std(axis=0)
    for i, line in enumerate(bp['medians']):
        # Second vertex of the median line, i.e. its right-hand end
        x, y = line.get_xydata()[1]
        text = ' μ={:.3f}\n σ={:.3f}'.format(col_mean[i], col_std[i])
        if t is not None:
            text += '\n t={:.2f}'.format(t[i])
        ax.annotate(text, xy=(x, y))

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    # Save before show(): showing first can leave an empty figure to be saved
    plt.savefig(filename)
    plt.show()

In [None]:
results_df = pd.read_csv('results/eval_rfe_results.csv')

plot_box_plot(X=results_df.values,  
              labels=results_df.columns, 
              xlabel = 'Models', 
              ylabel = 'AUC Score',
              title = 'Recursive feature elimination performance across 4 models',
              filename = 'images/eval_rfe_results.png',
              figsize = (10,5))

In [None]:
def rfe(X_train, y_train, classifier):
    """
    Runs recursive feature elimination with 5 fold cross validation (ROC AUC
    scoring, 3 features dropped per step) and applies the fitted selection to
    the training, validation and test sets.

    Note: the validation and test sets are read from the notebook-level
    variables `X_val` and `X_test`; they are not parameters.

    :X_train: training features the selector is fitted on
    :y_train: training target
    :classifier: estimator used by RFECV to rank the features
    :return: X_train_rfe, X_val_rfe, X_test_rfe, features_selected, where
             features_selected maps column position -> name of each kept feature
    """
    selector = RFECV(classifier, step=3, cv=5, scoring='roc_auc').fit(X_train, y_train)

    # All three sets are reduced with the selection learned on the training set
    X_train_rfe = selector.transform(X_train)
    X_val_rfe = selector.transform(X_val)
    X_test_rfe = selector.transform(X_test)

    features_selected = {
        position: selector.feature_names_in_[position]
        for position, keep in enumerate(selector.support_)
        if keep
    }

    return X_train_rfe,  X_val_rfe, X_test_rfe, features_selected

X_train_rfe, X_val_rfe, X_test_rfe, features_selected = rfe(X_train, y_train, models['xgb'])

In [None]:
features_selected

In [None]:
# Remove churn from the top features array as it is the target variable.
# Filtering by name (instead of deleting whatever sits at index 0) keeps this
# cell idempotent: re-running it cannot strip a genuine feature once 'churn'
# is already gone, and it does not rely on 'churn' being the first entry.
top_features = top_features[top_features != 'churn']

# Candidate training sets compared in the next section
X_train_list = {'full_dataset': X_train,
                'top_10_features': X_train[top_features],
                'RFE' : X_train_rfe}

In [None]:
"""
with open('X_train_list.pickle', 'wb') as f:
    pickle.dump(X_train_list, f)
"""

In [None]:
# Load the cached candidate datasets. If the cache file does not exist yet
# (e.g. on a fresh checkout), fall back to the datasets built above and write
# the cache so that later runs can reuse it.
# NOTE: only unpickle files produced by this notebook; pickle can execute
# arbitrary code when loading untrusted data.
try:
    with open('X_train_list.pickle', 'rb') as f:
        X_train_datasets = pickle.load(f)
except FileNotFoundError:
    X_train_datasets = X_train_list
    with open('X_train_list.pickle', 'wb') as f:
        pickle.dump(X_train_datasets, f)


## 2.4 Train the models


### 2.4.1 Find top performing dataset

Here we evaluate which of the candidate datasets performs best under cross validation.

In [None]:
def cross_validate(model, X_train, y_train):
    """
    Evaluates training performance of a model on a given dataset with
    10-fold stratified cross validation repeated 3 times.

    :model: estimator to evaluate
    :X_train: training features
    :y_train: training target
    :returns: array of ROC AUC scores, one per fold and repeat
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

In [None]:
scores, names = list(), list()
eval_dataset_results = {}

# The loop variables are deliberately NOT called `X_train`: reusing that name
# would overwrite the notebook-level training DataFrame with a dictionary key.
for dataset_name, dataset in X_train_list.items():
    print(dataset_name)
    # Score every candidate dataset with the same model (default LGBM)
    score = cross_validate(models['lgbm'], dataset, y_train)
    scores.append(score)
    names.append(dataset_name)
    eval_dataset_results[dataset_name] = score
    print('>%s %.3f (%.3f)' % (dataset_name, score.mean(), score.std()))

# One column of fold scores per dataset
results = pd.DataFrame(eval_dataset_results)
results.to_csv('results/eval_dataset_results.csv', index=False, header=True)

In [None]:
results_df = pd.read_csv('results/eval_dataset_results.csv')

plot_box_plot(X=results_df.values,  
              labels=results_df.columns, 
              xlabel = 'Datasets', 
              ylabel = 'AUC Score',
              title = 'Training  performance across the dataset with 5 fold cross validation',
              filename = 'images/eval_dataset_results.png',
              figsize = (10,5))

### 2.4.2 Find top performing models

In this section we compare the performance of the 5 models with their default parameters with 10 fold cross validation on the RFE dataset: 
- Logistic Regression
- CatBoost 
- XGBoost 
- Random forest
- LightGBM 

In [None]:

def evaluate_models(models, X, y , csv_name):
    """
    Cross-validates every model on the same dataset, records how long each
    evaluation took and writes the scores to a CSV file.

    :models: Dictionary mapping model name -> estimator
    :X: training features
    :y: training target
    :csv_name: path of the CSV file the scores are written to
    :return: (results, times) where results is a dataframe with one column of
             cross validation scores per model and times is a list with the
             evaluation time of each model in seconds, in the order of `models`
    """
    times = []
    eval_dataset_results = {}

    for model_name, estimator in models.items():
        # Run the evaluation and time it
        start = timeit.default_timer()
        print(f'Evaluation has started on {model_name}')
        eval_dataset_results[model_name] = cross_validate(estimator, X, y)
        times.append(timeit.default_timer() - start)

    # Store the final results
    results = pd.DataFrame(eval_dataset_results)
    results.to_csv(csv_name, index=False, header=True)

    return results, times
    
    


In [None]:
# Compare all candidate models on the RFE-selected features
# (a comma was missing after the `y` argument, which made this cell a SyntaxError)
results, times = evaluate_models(models=models,
                                 X=X_train_list['RFE'],
                                 y=y_train,
                                 csv_name='results/eval_model_results.csv')

In [None]:
# Each box is one model (the columns of `results`), scored by cross_validate,
# which uses 10 fold cross validation repeated 3 times.
plot_box_plot(X=results.values,
              labels=results.columns,
              xlabel = 'Models',
              ylabel = 'AUC Score',
              title = 'Training performance across the models with repeated 10 fold cross validation',
              filename = 'images/eval_model_results.png',
              figsize = (15,8),
              t=times)
### 2.4.3 Hyperparameter tuning of top models

#### Bayesian Optimisation

Bayesian optimization works by constructing a posterior distribution of functions (gaussian process) that best describes the function you want to optimize. As the number of observations grows, the posterior distribution improves, and the algorithm becomes more certain of which regions in parameter space are worth exploring and which are not, as seen in the picture below.

![bayesian_opt](images/bo_example.png)



First we create a wrapper function to report the results of each of the hyperparameter sessions

In [None]:
def report_perf(optimizer, X, y, model_name,  csv_name, callbacks=None):
    """
    Fits a search optimizer, saves its cross validation results to CSV and
    prints a short summary of the run.

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    model_name = a string label for the experiment, used in the printed summary
    csv_name = path of the CSV file the optimizer's cv_results_ are written to
    callbacks = optional callbacks forwarded to optimizer.fit as `callback`

    Returns (best_params, duration): the best parameters found and the elapsed
    time in seconds, measured from before the fit until after the CSV is written.
    """
    started_at = timeit.default_timer()

    # Only forward `callback` when callbacks were actually supplied
    fit_kwargs = {} if callbacks is None else {'callback': callbacks}
    optimizer.fit(X, y, **fit_kwargs)

    cv_table = pd.DataFrame(optimizer.cv_results_)
    cv_table.to_csv(csv_name, index=False, header=True)

    best_score = optimizer.best_score_
    best_score_std = cv_table.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    duration = timeit.default_timer() - started_at

    n_candidates = len(optimizer.cv_results_['params'])
    summary_format = (model_name + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
                      + u"\u00B1" + " %.3f")
    print(summary_format % (duration, n_candidates, best_score, best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params, duration

In [None]:
# Hyperparameter search spaces of each of the models

# Setting the search space
lgbm_search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),     # Boosting learning rate
    'n_estimators': Integer(30, 5000),                   # Number of boosted trees to fit
    'num_leaves': Integer(2, 512),                       # Maximum tree leaves for base learners
    'max_depth': Integer(-1, 256),                       # Maximum tree depth for base learners, <=0 means no limit
    'subsample': Real(0.01, 1.0, 'uniform'),             # Subsample ratio of the training instance
    'subsample_freq': Integer(1, 10),                    # Frequency of subsample, <=0 means disabled
    'colsample_bytree': Real(0.01, 1.0, 'uniform'),      # Subsample ratio of columns when constructing each tree
    'reg_lambda': Real(1e-9, 100.0, 'log-uniform'),      # L2 regularization
    'reg_alpha': Real(1e-9, 100.0, 'log-uniform'),       # L1 regularization
   }

rf_search_spaces = {
    'bootstrap': Categorical([True, False]),             # Method of selecting samples for training each tree
    'max_depth': Integer(1, 200),                        # Maximum number of levels in tree
    # Number of features to consider at every split. 'auto' is not offered: for
    # classifiers it was only an alias of 'sqrt' (so the two choices were the
    # same setting) and newer scikit-learn releases reject it.
    'max_features': Categorical(['sqrt', 'log2']),
    'min_samples_leaf': Integer(1, 5),                   # Minimum number of samples required at each leaf node
    'min_samples_split': Integer(2, 10),                 # Minimum number of samples required to split a node
    'n_estimators': Integer(200, 2000)}                  # Number of trees in the random forest

xgb_search_spaces = {
    'learning_rate': Real(0.01, 1.0, 'uniform'),
     'max_depth': Integer(2, 12),
     'subsample': Real(0.1, 1.0, 'uniform'),
     'colsample_bytree': Real(0.1, 1.0, 'uniform'), # subsample ratio of columns by tree
     'reg_lambda': Real(1e-9, 100., 'uniform'), # L2 regularization
     'reg_alpha': Real(1e-9, 100., 'uniform'), # L1 regularization
     'n_estimators': Integer(50, 5000)}

cat_search_spaces = {
    'iterations': Integer(10, 2000),
    'depth': Integer(1, 12),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'random_strength': Real(1e-9, 10, 'log-uniform'), # randomness for scoring splits
    'bagging_temperature': Real(0.0, 1.0), # settings of the Bayesian bootstrap
    'l2_leaf_reg': Integer(2, 100), # L2 regularization
   }

# Model name -> [estimator to tune, its search space]
models_search_spaces = {
    'lgbm': [LGBMClassifier(verbosity = -1, num_threads = 8, random_state=0, objective ='binary'), lgbm_search_spaces],
    'xgb': [XGBClassifier(verbosity = 0, random_state = 0), xgb_search_spaces],
    'rf': [RandomForestClassifier(n_jobs = -1, random_state=0), rf_search_spaces],
    'cat': [CatBoostClassifier(verbose=False, random_state=0), cat_search_spaces]}


In [None]:
def bayesian_search(model, search_spaces, model_name, X_train, y_train):
    """
    Tunes a model with Bayesian optimisation (BayesSearchCV, ROC AUC scoring,
    5 fold cross validation) and reports the outcome through report_perf.

    :model: estimator to tune
    :search_spaces: skopt search space of the estimator
    :model_name: label used in the printed report and in the results file name
                 (`hyperopt_results_<model_name>.csv`)
    :X_train: training features
    :y_train: training target
    :return: list [best_params, duration] as returned by report_perf
    """
    # Wrapping everything up into the Bayesian optimizer
    optimizer = BayesSearchCV(
        estimator=model,
        search_spaces=search_spaces,
        scoring='roc_auc',
        cv=5,
        n_iter=60,                                   # max number of trials
        n_points=3,                                  # number of hyperparameter sets evaluated at the same time
        n_jobs=-1,                                   # number of jobs
        iid=False,                                   # if not iid it optimizes on the cv score
        return_train_score=False,
        refit=False,
        optimizer_kwargs={'base_estimator': 'GP'},   # optimizer parameters: we use Gaussian Process (GP)
        random_state=0,                              # random state for replicability
    )

    stoppers = [
        DeltaYStopper(delta=0.0001),                 # stop if the gain of the optimization becomes too small
        DeadlineStopper(total_time=60 * 60 * 7),     # time limit in seconds (7 hours)
    ]

    tuned_params, elapsed = report_perf(
        optimizer=optimizer,
        X=X_train,
        y=y_train,
        model_name=model_name,
        csv_name=f'hyperopt_results_{model_name}.csv',
        callbacks=stoppers,
    )
    return [tuned_params, elapsed]

In [None]:
# Model name -> [best parameters, search duration] for every tuned model
hyperopt = {}

for model_name, (estimator, search_space) in models_search_spaces.items():
    hyperopt[model_name] = bayesian_search(
        model=estimator,
        search_spaces=search_space,
        model_name=model_name,
        X_train=X_train_datasets['RFE'],
        y_train=y_train,
    )
    

### 3. Evaluate on validation set and submit to kaggle

After hyperparameter tuning, I will instantiate the models with their newly tuned parameters and then evaluate them on the validation set. 

Evaluate the model based on other metrics as well
- Confusion matrix 
- Accuracy, f1 score, recall and precision
- AUC curves

In [None]:
# Re-create each model with the best parameters found by the Bayesian search
# (the first element of every hyperopt entry).
tuned_models = {}
tuned_models['lgbm'] = LGBMClassifier(verbosity = -1, num_threads = 8, random_state=0, objective ='binary',
                                      **hyperopt['lgbm'][0])
tuned_models['xgb'] = XGBClassifier(verbosity = 0, random_state = 0, **hyperopt['xgb'][0])
tuned_models['rf'] = RandomForestClassifier(n_jobs = -1, random_state=0, **hyperopt['rf'][0])
tuned_models['cat'] = CatBoostClassifier(verbose=False, random_state=0, **hyperopt['cat'][0])

# Soft-voting ensemble over the four tuned models
vclf = VotingClassifier(
    n_jobs=-1,
    estimators=[(name, tuned_models[name]) for name in ('lgbm', 'xgb', 'rf', 'cat')],
    voting='soft',
)
tuned_models['vclf'] = vclf

In [None]:
# Get all the methods ready 

def save_object(obj, file_name):
    """
    Pickles an object to `<file_name>.pickle`.

    :obj: object to save
    :file_name: name of the file without the `.pickle` extension
    """
    pickle_path = f'{file_name}.pickle'
    with open(pickle_path, 'wb') as handle:
        handle.write(pickle.dumps(obj))
        
def plot_confusion_matrix(clf, clf_name, X_val, y_val):
    """
    Plots the confusion matrix of a fitted classifier on the validation set,
    saves it as '<clf_name> confusion matrix on the validation set.png' and
    shows it.

    :clf: fitted classifier
    :clf_name: name of the classifier, used in the file name
    :X_val: validation features
    :y_val: validation target
    """
    ConfusionMatrixDisplay.from_estimator(clf, X_val, y_val)
    # Save before show(): once the figure has been shown, savefig would write
    # out a new, empty figure instead of the confusion matrix.
    plt.savefig(f'{clf_name} confusion matrix on the validation set.png')
    plt.show()
    



def plot_roc_curve(models, X_val, y_val):
    """
    Plots the receiver operating characteristic curve of every model.
    Corresponds to Figure 7 in the report

    :models: dictionary mapping model name -> fitted classifier with predict_proba
    :X_val: validation features
    :y_val: validation target (binary)
    """
    plt.figure(figsize=(8, 6))
    for model_name, model in models.items():
        # roc_curve needs one score per sample: take the probability of the
        # positive class (second column of predict_proba).
        y_score = model.predict_proba(X_val)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_val, y_score)
        plt.plot(fpr, tpr, linewidth=2, label=model_name)

    plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal
    # Full unit square; this is the only axis range applied (the earlier
    # xlim/ylim calls were overridden by it and have been removed).
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

def plot_precision_recall_curve(models, X_val, y_val):
    """
    Plots precision vs recall for every model.

    :models: dictionary mapping model name -> fitted classifier with predict_proba
    :X_val: validation features
    :y_val: validation target (binary)
    """
    plt.figure(figsize=(8, 6))
    for model_name, model in models.items():
        # precision_recall_curve needs one score per sample: take the
        # probability of the positive class (second column of predict_proba).
        y_score = model.predict_proba(X_val)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(y_val, y_score)
        # Recall on the x axis and precision on the y axis, matching the labels
        plt.plot(recalls, precisions, linewidth=2, label=model_name)

    # No diagonal reference line here: it is a chance baseline for ROC curves
    # only and has no such meaning on a precision-recall plot.
    plt.axis([0, 1, 0, 1])
    plt.xlabel('Recall', fontsize=16)
    plt.ylabel('Precision', fontsize=16)
    plt.grid(True)
    plt.title('Precision Recall curve')
    plt.legend(loc='best')
    plt.show()

#### 3.2 Confusion Matrices 


In [1]:
# Display names of the two classes in the report. They must be strings:
# classification_report measures their length to lay out the table, which
# fails for integers.
target_names = ['0', '1']

for model_name, model in tuned_models.items():
    # Fit on the RFE training set, then predict the RFE validation set
    model.fit(X_train_datasets['RFE'], y_train)
    y_pred = model.predict(X_val_rfe)

    # Get results
    plot_confusion_matrix(model, model_name, X_val_rfe, y_val)
    print(classification_report(y_val, y_pred, target_names=target_names))

NameError: name 'tuned_models' is not defined

In [None]:
save_object(tuned_models, 'tuned_models')

### 4. Deploy model to sagemaker endpoint