# Precision - Recall 

#### Sources: 

http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html
       

In [None]:
# Print the module-level docstring (`__doc__`). No docstring is defined in the
# visible part of this file, so what gets shown depends on the environment
# that runs it.
print(__doc__)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle

In [None]:
from sklearn import svm, datasets
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

###### Import some data to play with

In [3]:
# Load the iris dataset and split it into the feature matrix X and the labels y.
# "target" is a synonym of label / class / name of the object
# (in this example, the classes are the types of flower).
iris = datasets.load_iris()
X, y = iris.data, iris.target

###### Set up plot details

In [7]:
# Line width shared by every curve drawn below.
lw = 2
# Endless iterator over the curve colors, consumed one color per plotted class.
colors = cycle(("navy", "turquoise", "darkorange", "cornflowerblue", "teal"))

# cycle() generates an infinitely repeating series of values.
# It receives an iterable collection and repeats its elements (in a cycle) endlessly,
# so it never runs out of colors, however many curves are plotted.

###### Binarize the output

In [8]:
# Turn the integer labels into a one-vs-all indicator matrix (one column per class).
y = label_binarize(y, classes=[0, 1, 2])
# Number of columns in y, i.e. the number of classes (see the notes below).
n_classes = y.shape[1]

# sklearn.preprocessing.label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False)
# Binarize labels in a one-vs-all fashion

# Several regression and binary classification algorithms are available in the scikit. 
# A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme.

# Returns:
# Y : numpy array or CSR matrix of shape [n_samples, n_classes]
# Shape will be [n_samples, 1] for binary problems.

# Source: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.label_binarize.html#sklearn.preprocessing.label_binarize

###### Add noisy features

In [18]:
# Dedicated, seeded random number generator so the run is reproducible.
random_state = np.random.RandomState(seed=0)

# np.random.RandomState()
# constructs a random number generator.
# It does not have any effect on the freestanding functions in np.random, but must be used explicitly
# (e.g. random_state.randn(...) rather than np.random.randn(...)).

# Rows of X are samples, columns are features.
n_samples, n_features = np.shape(X)

# The shape attribute for numpy arrays returns the dimensions of the array. 
# If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[0] is n.

"""In [46]: Y = np.arange(12).reshape(3,4)

In [47]: Y
Out[47]: 
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [48]: Y.shape
Out[48]: (3, 4)

In [49]: Y.shape[0]
Out[49]: 3

"""

'In [46]: Y = np.arange(12).reshape(3,4)\n\nIn [47]: Y\nOut[47]: \narray([[ 0,  1,  2,  3],\n       [ 4,  5,  6,  7],\n       [ 8,  9, 10, 11]])\n\nIn [48]: Y.shape\nOut[48]: (3, 4)\n\nIn [49]: Y.shape[0]\nOut[49]: 3\n\n'

In [24]:
 # Report the dimensions of X as measured before the noise columns are appended.
 print("Rows: %s" % (n_samples,))
 print("Columns: %s" % (n_features,))

Rows: 150
Columns: 4


In [30]:
# Concatenate the original data with 200 * n_features columns of Gaussian noise
# (the noise columns are appended to the right of the original features).
X = np.c_[
    X,
    random_state.randn(n_samples, 200 * n_features)
]

# numpy.random.randn(d0, d1, ..., dn):
# Return a sample (or samples) from the "standard normal" distribution.
# Parameters: d0, d1, ..., dn : int, optional. The dimensions of the returned array, should be all positive.
# If no argument is given a single Python float is returned.

# Example of concatenation with np.c_:

np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]

array([[1, 2, 3, 0, 0, 4, 5, 6]])

###### Split into training and test

In [40]:
# Split X and y in half: one half goes to the training set, the other to the test set.
# The shared random_state drives the shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=random_state
)

# http://scikit-learn.org/0.16/modules/generated/sklearn.cross_validation.train_test_split.html 

###### Run classifier

In [44]:
# One linear-kernel SVC per class, wrapped in a one-vs-rest meta-classifier.
classifier = OneVsRestClassifier(
    svm.SVC(kernel="linear", probability=True, random_state=random_state)
)

# SVC = Support Vector Classification

# class sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1)

# Also known as one-vs-all, this strategy consists in fitting one classifier per class. 
# For each classifier, the class is fitted against all the other classes. 
# In addition to its computational efficiency (only n_classes classifiers are needed), 
# one advantage of this approach is its interpretability. Since each class is represented by one 
# and one classifier only, it is possible to gain knowledge about the class by inspecting its corresponding classifier. 
# This is the most commonly used strategy for multiclass classification and is a fair default choice.

# Source: http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html

# Train on the training half (features X_train, labels y_train) ...
classifier.fit(X_train, y_train)
# ... then score the held-out half; y_score has one column per class.
y_score = classifier.decision_function(X_test)

# decision_function is used here because the base estimator is an SVM: it gives the signed distance of each
# data point from the hyperplane that separates the data.
# Source: http://stackoverflow.com/questions/38437845/onevsrestclassifier-and-random-forest

# The decision function tells us on which side of the hyperplane generated by the classifier we are 
# (and how far we are away from it). Based on that information, the estimator then label the examples 
# with the corresponding label.

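# When you call decision_function() on a multi-class SVC that uses the pairwise (one-vs-one) scheme, you get the
# output from each of the pairwise classifiers (n*(n-1)/2 numbers total).
# Here the SVC is wrapped in OneVsRestClassifier instead, so y_score has one column per class.
# See pages 127 and 128 of "Support Vector Machines for Pattern Classification".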

# Each classifier puts in a vote as to what the correct answer is (based on the sign of the output of that classifier); 
# predict() returns the class with the most votes.
# Source: http://stackoverflow.com/questions/20113206/scikit-learn-svc-decision-function-and-predict

###### Compute Precision - Recall and plot curve

In [55]:
# Per-class results, keyed by class index.
precision, recall, average_precision = {}, {}, {}

for i in range(n_classes):
    # Precision-recall pairs for different decision thresholds.
    # The third return value (the thresholds themselves) is not needed, hence "_".
    precision[i], recall[i], _ = precision_recall_curve(
        y_test[:, i], y_score[:, i]
    )
    # Average precision (AP) computed from the prediction scores: a single float
    # that summarises the precision-recall curve of class i.
    average_precision[i] = average_precision_score(
        y_test[:, i], y_score[:, i]
    )
    
# The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. 
  # The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
# The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. 
  # The recall is intuitively the ability of the classifier to find all the positive samples.

##### Compute micro-average Precision-Recall curve and average precision

In [58]:
# Micro-average: flatten the label and score matrices so that every
# (sample, class) entry counts as one binary decision.
# ravel() returns a contiguous flattened array.
precision["micro"], recall["micro"], _ = precision_recall_curve(
    y_test.ravel(), y_score.ravel()
)

average_precision["micro"] = average_precision_score(
    y_test, y_score, average="micro"
)

# average : If None, the scores for each class are returned. 
# Otherwise, this determines the type of averaging performed on the data:

# 'micro':
 # Calculate metrics globally by considering each element of the label indicator matrix as a label.


##### Plot Precision-Recall Curve

In [None]:
# Plot the precision-recall curve of class 0 only.
# clf() clears the figure with all its axes but leaves the window open,
# so it may be reused for other plots.
plt.clf()

plt.plot(recall[0], precision[0], lw=lw, color='navy',
         label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
# Leave a little headroom above precision = 1.0 on the y axis, and pin the
# x axis (recall) to [0, 1]. The second call used to be a duplicate ylim,
# which overrode the headroom and left the x limits unset.
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title("Precision-Recall example: AUC = {0:0.2f}".format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()

## Works in Spyder

##### Plot Precision - Recall curve for each class

In [None]:
# Plot the micro-averaged curve plus one precision-recall curve per class.
plt.clf()
plt.plot(recall['micro'], precision['micro'], color="gold", lw=lw,
         label="micro-average Precision - recall curve (area = {0:0.2f})"
               ''.format(average_precision['micro']))
# zip() stops after n_classes items even though `colors` cycles endlessly.
for i, color in zip(range(n_classes), colors):
    # {0} is the class index and {1} its average precision; the area used to
    # be formatted from {0}, which printed the class index as the area.
    plt.plot(recall[i], precision[i], color=color, lw=lw,
             label="Precision - recall curve of class {0} (area = {1:0.2f})"
                   ''.format(i, average_precision[i]))

plt.xlabel('Recall')
plt.ylabel('Precision')
# y axis gets a little headroom above 1.0; x axis (recall) is pinned to [0, 1].
# The second call used to be a duplicate ylim that overrode the first.
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title("Extension of Precision- Recall curve to multi-class")
plt.legend(loc="lower left")
plt.show()