In [1]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, accuracy_score

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used. Note that inside this
        function the parameter name shadows the ``confusion_matrix`` function
        imported from sklearn.metrics.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
        Used for both the row index and the column labels.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for the tick labels on both axes. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure

    Raises
    ------
    ValueError
        If seaborn raises ValueError while annotating the cells with the integer
        format ("d"); the original error is attached as the cause.
    """
    # seaborn is not imported by any visible notebook cell, so import it here;
    # otherwise a fresh-kernel call fails with NameError on `sns`.
    import seaborn as sns

    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    # Explicit figure/axes pair so the heatmap and labels cannot land on some
    # other "current" figure left over from an earlier cell.
    fig, ax = plt.subplots(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
    except ValueError as err:
        # Do not leave the half-built, empty figure registered with pyplot.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    heatmap.set_ylabel('True label')
    heatmap.set_xlabel('Predicted label')
    return fig

In [2]:
import itertools   
import matplotlib.pyplot as plt

def evaluate_model(train_y, test_y, train_predictions, test_predictions):
    """Print classification metrics for the train split, then for the test split.

    For each split this prints, in order: accuracy, F1, precision and recall.
    F1, precision and recall are computed with ``average='weighted'``.

    Arguments
    ---------
    train_y, test_y:
        True labels of the train / test split.
    train_predictions, test_predictions:
        Predicted labels of the train / test split.

    Returns
    -------
    None
        The metrics are only printed.
    """
    weighted_scorers = (
        ("f1_score", f1_score),
        ("precision_score", precision_score),
        ("recall_score", recall_score),
    )
    splits = (
        ("train", train_y, train_predictions),
        ("test", test_y, test_predictions),
    )

    for split_name, y_true, y_pred in splits:
        print(split_name + " accuracy", accuracy_score(y_true, y_pred))
        for metric_name, scorer in weighted_scorers:
            print(split_name + " " + metric_name, scorer(y_true, y_pred, average='weighted'))


def display_confusion_matrix( train_y, test_y, train_predictions, test_predictions, class_names=None):
    """Plot the row-normalized confusion matrix of the test split.

    Arguments
    ---------
    train_y, train_predictions:
        Accepted for signature parity with ``evaluate_model``; not used here.
    test_y:
        True labels of the test split.
    test_predictions:
        Predicted labels of the test split.
    class_names: list, optional
        Tick labels for the matrix, in the order the classes index it. Defaults
        to the ten wine varieties below.

    Returns
    -------
    None
        The matrix is printed and plotted by ``plot_confusion_matrix``.
    """
    if class_names is None:
        # NOTE(review): assumes this order matches the encoded label values and
        # that all ten classes occur in the test data — TODO confirm.
        # NOTE(review): "ros" is kept verbatim; possibly a truncated "rosé" — confirm.
        class_names = ["pinot noir", "chardonnay", "cabernet sauvignon", "syrah", "malbec", "ros", "tempranillo", "nebbiolo", "sauvignon blanc", "zinfandel"]

    C = confusion_matrix(test_y, test_predictions)
    plot_confusion_matrix(C, class_names, normalize=True)


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Arguments
    ---------
    cm: array-like
        2-D confusion matrix; rows are treated as true labels and columns as
        predicted labels (see the axis labels below).
    classes: list
        Tick labels for both axes, in the order they index `cm`.
    normalize: bool
        When True, each row is divided by its row sum before printing and
        plotting, and cells are annotated with two decimals. When False the
        cells are annotated with the integer format 'd'.
    title: str
        Figure title.
    cmap:
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None
    """
    # Accept any array-like (e.g. nested lists), not only ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A true class with no samples has a zero row sum; plain division would
        # yield NaN cells (and a NaN colour threshold below). Leave such rows at 0.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.figure(figsize=(10,7))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text so the annotation stays
    # readable on the darker colours.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    # NOTE(review): `display` is not defined in this notebook; presumably the
    # notebook environment's built-in that renders the current figure — confirm.
    display()


In [3]:
import numpy as np
import pandas as pd

# Read the stored train/test prediction tables of the model being evaluated;
# change the table names below to evaluate a different run.
# NOTE(review): the variables are named `*_logis_*` while the tables read are
# `*_svc_600F_results` — presumably a leftover from an earlier run; later cells
# reference these names, so confirm before renaming.
# NOTE(review): `spark` is not defined in this notebook; presumably the
# SparkSession provided by the notebook environment — confirm.
test_logis_df=spark.table("test_svc_600F_results")
train_logis_df=spark.table("train_svc_600F_results")


# Convert both tables to pandas DataFrames; the later cells read their
# 'label' and 'prediction' columns.
train_logis_pd = train_logis_df.toPandas()
test_logis_pd = test_logis_df.toPandas()


In [4]:
# Print train and test accuracy plus weighted F1 / precision / recall, using the
# 'label' column as ground truth and the 'prediction' column as model output.
evaluate_model(train_logis_pd['label'], test_logis_pd['label'], train_logis_pd['prediction'], test_logis_pd['prediction'])

In [5]:
# Plot the normalized confusion matrix of the test split. The train columns are
# passed to satisfy the signature but display_confusion_matrix does not use them.
display_confusion_matrix(train_logis_pd['label'], test_logis_pd['label'], train_logis_pd['prediction'], test_logis_pd['prediction'])