# Lecture 37: Evaluating the Accuracy of Classifiers

In [1]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Functions from last time

In [2]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    pt1, pt2: arrays of coordinates that support element-wise subtraction.
    The result is the square root of the summed squared coordinate gaps.
    """
    squared_gaps = (pt1 - pt2) ** 2
    total = sum(squared_gaps)
    return np.sqrt(total)

In [3]:
def row_distance(row1, row2):
    """Return the distance between two numerical table rows.

    Each row is converted to a NumPy array and the pair is handed to
    `distance`.
    """
    first, second = np.array(row1), np.array(row2)
    return distance(first, second)

In [4]:
def distances(training, example):
    """
    Compute the distance between example and every row in training.

    training: table with a 'Class' column; all other columns are treated
        as attributes.
    example: a row of attribute values, in the same order as training's
        attribute columns.

    Return training augmented with a 'Distance_to_ex' column holding the
    distance from each row's attributes to example.
    """
    attributes_only = training.drop('Class')

    # Collect every distance first and build the array once: np.append copies
    # the whole array on each call, which makes an append-in-a-loop quadratic.
    # dtype=float keeps an empty training table producing an empty float array.
    dists = np.array(
        [row_distance(row, example) for row in attributes_only.rows],
        dtype=float,
    )

    return training.with_column('Distance_to_ex', dists)

In [5]:
def closest(training, example, k):
    """
    Return a table holding the k rows of training nearest to example.

    Rows are ordered by their 'Distance_to_ex' value, smallest first.
    """
    with_dist = distances(training, example)
    by_distance = with_dist.sort('Distance_to_ex')
    first_k = np.arange(k)
    return by_distance.take(first_k)

In [6]:
def majority_class(topk):
    """
    Return the 'Class' value that occurs most often in topk.

    The table is grouped by 'Class', the groups are sorted by 'count' in
    descending order, and the first entry of the first column is returned.
    """
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    labels = most_common_first.column(0)
    return labels.item(0)

In [7]:
def classify(training, example, k):
    """
    Predict the class of example by a vote of its k nearest neighbors.

    The k closest rows of training are found with `closest`, and the most
    frequent class among them (per `majority_class`) is returned.
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

## Returning to Google Science Fair example

In [8]:
# Data: read the breast-cancer table and drop its 'ID' column
patients = Table.read_table('breast-cancer.csv').drop('ID')
# Attribute columns only: everything except the 'Class' label
attributes = patients.drop('Class')
# Display the first 3 rows of the attributes
attributes.show(3)

Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
5,1,1,1,2,1,3,1,1
5,4,4,5,7,10,3,2,1
3,1,1,1,2,2,3,1,1


## Accuracy of a Classifier ##

In [9]:
patients.num_rows

683

In [10]:
shuffled = patients.sample(with_replacement=False) # Randomly permute the rows
# Split the shuffled rows: indices 0-341 (342 rows) become the training set,
# indices 342-682 (341 rows) become the test set
training_set = shuffled.take(np.arange(342))
test_set  = shuffled.take(np.arange(342, 683))

In [11]:
def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in test whose 'Class' is predicted
    correctly by a k-nearest-neighbor vote over training."""
    features = test.drop('Class')
    true_labels = test.column('Class')
    hits = 0
    for idx in np.arange(test.num_rows):
        predicted = classify(training, features.row(idx), k)
        # A correct prediction adds 1 (True), an incorrect one adds 0 (False)
        hits = hits + (predicted == true_labels.item(idx))
    return hits / test.num_rows

In [12]:
evaluate_accuracy(training_set, test_set, 5)

0.967741935483871

In [13]:
evaluate_accuracy(training_set, test_set, 3)

0.9794721407624634

In [14]:
evaluate_accuracy(training_set, test_set, 11)

0.9648093841642229

In [15]:
evaluate_accuracy(training_set, test_set, 1)

0.9530791788856305

## Standardize if Necessary ##

In [16]:
def standard_units(x):
    """Convert the array x to standard units.

    Each value has the average of x subtracted from it and is then divided
    by the standard deviation of x (as computed by np.std).
    """
    center = np.average(x)
    spread = np.std(x)
    return (x - center) / spread

In [17]:
# Read the ckd table
ckd = Table.read_table('ckd.csv')
# Shorten the 'Blood Glucose Random' label to 'Glucose', then keep only the
# three attributes used below plus the 'Class' label
ckd = ckd.relabeled('Blood Glucose Random', 'Glucose').select('Glucose', 'Hemoglobin', 'White Blood Cell Count', 'Class')
ckd.show(5)

Glucose,Hemoglobin,White Blood Cell Count,Class
117,11.2,6700,1
70,9.5,12100,1
380,10.8,4500,1
157,5.6,11000,1
173,7.7,9200,1


In [18]:
# Build a table with the 'Class' label plus each of the three attributes
# converted to standard units, so all attributes are on the same scale
ckd_new = ckd.select('Class').with_columns(
    'Glucose_su', standard_units(ckd.column('Glucose')),
    'Hemoglobin_su', standard_units(ckd.column('Hemoglobin')),
    'WBC_su', standard_units(ckd.column('White Blood Cell Count'))
)

In [19]:
ckd_new

Class,Glucose_su,Hemoglobin_su,WBC_su
1,-0.221549,-0.865744,-0.569768
1,-0.947597,-1.45745,1.16268
1,3.84123,-1.00497,-1.27558
1,0.396364,-2.81488,0.809777
1,0.643529,-2.08395,0.232293
1,-0.561402,-1.35303,-0.505603
1,2.04928,-0.413266,0.360623
1,-0.947597,-1.28342,3.34429
1,1.87936,-1.10939,-0.409356
1,0.489051,-1.35303,1.96475


In [20]:
# Shuffle the standardized table (ckd_new) and split it into a 74-row training
# set and a 74-row test set. The shuffle must be bound to `shuffled`, because
# the `take` calls below read that name; binding it to any other name would
# silently reuse the earlier breast-cancer shuffle.
shuffled = ckd_new.sample(with_replacement=False)
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [21]:
evaluate_accuracy(training_set, test_set, 3)

0.972972972972973

In [22]:
# Repeat the 74/74 split on ckd itself, whose attributes are NOT in standard
# units, to compare against the standardized version
shuffled = ckd.sample(with_replacement=False) 
training_set = shuffled.take(np.arange(74))
test_set  = shuffled.take(np.arange(74, 148))

In [23]:
evaluate_accuracy(training_set, test_set, 3)

0.7837837837837838

## Updating Probabilities ##

In [24]:
# Build a synthetic class of n students: 60% second years, 40% third years
n = 100
second = round(n * 0.6)
third = round(n * 0.4)

# One 'Year' label per student, second years first
year = np.array(['Second'] * second + ['Third'] * third)
# 'Major' labels in the same order as `year`: the second years are split
# 50% declared / 50% undeclared, the third years 80% declared / 20% undeclared
major = np.array(['Declared'] * (round(second * 0.5)) + ['Undeclared'] * (round(second * 0.5)) + \
                 ['Declared'] * (round(third * 0.8))  + ['Undeclared'] * (round(third * 0.2)))
                 
students = Table().with_columns(
    'Year', year,
    'Major', major
)

In [25]:
students.show(3)

Year,Major
Second,Declared
Second,Declared
Second,Declared


In [26]:
students.pivot('Major', 'Year')

Year,Declared,Undeclared
Second,30,30
Third,32,8


In [27]:
# Verify: 60% of students are Second years, 40% are Third years
60 / (60 + 40)

0.6

In [28]:
# Verify: 50% of Second years have Declared
30 / 60

0.5

In [29]:
# Verify: 80% of Third years have Declared
32 / 40

0.8

In [None]:
# Chance of second year, given that they have declared
# P(second year | declared)

30 / 62

In [None]:
# P(third year | declared)

32 / 62

## Tree Diagram Calculation

In [None]:
# P(second year | declared), from tree diagram

(0.6 * 0.5) / (0.6 * 0.5 + 0.4 * 0.8)

## Decisions ##

In [None]:
def create_population(prior_disease_prob, n, true_positive_rate=0.99, false_positive_rate=0.05):
    """Return a pivot table of disease status against test result for a
    synthetic population.

    prior_disease_prob: proportion of the population that has the disease.
    n: population size.
    true_positive_rate: proportion of people with the disease who test
        positive (default 0.99).
    false_positive_rate: proportion of people without the disease who test
        positive (default 0.05).

    Returns the result of pivoting 'Test Result' against 'Status': one row
    per status, one count column per test result.
    """
    total = round(n)
    disease = round(total * prior_disease_prob)
    # Derive each complementary count by subtraction instead of rounding it
    # independently: round() sends ties to the nearest even integer, so two
    # separately rounded halves need not add up to the whole (for example,
    # n=5 with prior 0.5 would give 2 + 2 = 4 people).
    no_disease = total - disease

    disease_pos = round(disease * true_positive_rate)
    disease_neg = disease - disease_pos
    healthy_pos = round(no_disease * false_positive_rate)
    healthy_neg = no_disease - healthy_pos

    # Both arrays list the 'Disease' group first, then the 'No disease' group,
    # so entry i of `status` and entry i of `result` describe the same person.
    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    result = np.array(
        ['Test +'] * disease_pos + ['Test -'] * disease_neg +
        ['Test +'] * healthy_pos + ['Test -'] * healthy_neg
    )

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

In [None]:
create_population(1/1000, 100000)

In [None]:
# P(disease | tested +) from the counts in create_population(1/1000, 100000):
# 99 people with the disease test + and 4995 people without it test +
99 / (99 + 4995)

In [None]:
#P(disease | tested +)

# = P(disease & tested +) / P(tested +)

(0.001 * 0.99) / (0.001*0.99 + 0.999*0.05)

## Subjective Probabilities

In [None]:
# P(disease | tested +)

# = P(disease & tested +) / P(tested +)

# if prior probability of disease is 1/10

(0.1 * 0.99) / (0.1*0.99 + 0.9*0.05)

In [None]:
create_population(1/10, 10000)

In [None]:
# P(disease | tested +) from the counts in create_population(1/10, 10000):
# 990 people with the disease test + and 450 people without it test +
990/(990+450)

In [None]:
# P(disease | tested +)
# if prior probability of disease is 0.5

(0.5 * 0.99) / (0.5*0.99 + 0.5*0.05)

In [None]:
create_population(0.5, 10000)

In [None]:
# P(disease | tested +) from the counts in create_population(0.5, 10000):
# 4950 people with the disease test + and 250 people without it test +
4950/(4950+250)