In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

1. Load the Wisconsin breast cancer data from sklearn (a binary classification problem), do a train/test split, and fit a logistic regression model and a 10-nearest-neighbors model. Instead of using any built-in sklearn scoring methods, write your own accuracy, precision, recall, and F1 evaluation functions that take arrays of actual and predicted target labels as arguments. Score your models on the test set.
    * e.g.  `def accuracy(actuals, preds)`

In [3]:
# Load the breast cancer data as a (features, target labels) pair rather than
# as a dataset object (return_X_y=True).
X, y = load_breast_cancer(return_X_y=True)

In [9]:
# Hold out 30% of the rows as the test set; random_state is pinned so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.30, random_state=4444)

In [11]:
# Unfitted estimators: logistic regression with default settings, and a
# k-nearest-neighbours classifier that votes over 10 neighbours.
# NOTE: `km` is the KNN classifier (not k-means), despite the name.
lr = LogisticRegression()
km = KNeighborsClassifier(n_neighbors=10)

In [13]:
# Fit both models on the training split only. The KNN fit is the cell's last
# expression, so its return value is what the notebook displays.
lr.fit(X_train, y_train)
km.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [15]:
# Predicted class labels on the held-out test set, one array per model
# (lr = logistic regression, km = 10-nearest-neighbours).
y_lr_pred = lr.predict(X_test)
y_km_pred = km.predict(X_test)

In [23]:
def recall(actuals, preds):
    """Return the recall (true positive rate) of binary predictions.

    Recall is TP / (TP + FN). Label ``1`` is the positive class; for a row
    whose actual label is 1, any predicted value other than 1 counts as a
    false negative.

    Args:
        actuals: Sequence of true labels.
        preds: Sequence of predicted labels, the same length as ``actuals``.

    Returns:
        The recall as a float in [0, 1]. Returns 0.0 when ``actuals``
        contains no positive label, where the ratio is otherwise undefined.

    Raises:
        ValueError: If ``actuals`` and ``preds`` differ in length.
    """
    if len(actuals) != len(preds):
        raise ValueError(
            "actuals and preds must have the same length "
            "(got {} and {})".format(len(actuals), len(preds))
        )

    tp = 0
    fn = 0
    # Walk the two sequences pairwise so the inputs only need to be iterable,
    # not positionally indexable with integers.
    for actual, pred in zip(actuals, preds):
        if actual == 1:
            if pred == 1:
                tp += 1
            else:
                fn += 1

    positives = tp + fn
    # Guard the 0/0 case instead of raising ZeroDivisionError.
    return tp / positives if positives else 0.0

In [25]:
# Test-set recall: logistic regression first, then the 10-nearest-neighbours model.
print(recall(y_test, y_lr_pred))
print(recall(y_test, y_km_pred))

0.9696969696969697
0.9595959595959596


In [26]:
def accuracy(actuals, preds):
    """Return the fraction of predictions that equal the true label.

    Any pair of equal labels counts as correct, so the function works for
    0/1 labels as well as for other label values.

    Args:
        actuals: Sequence of true labels.
        preds: Sequence of predicted labels, the same length as ``actuals``.

    Returns:
        The accuracy as a float in [0, 1]. Returns 0.0 for empty input,
        where the ratio is otherwise undefined.

    Raises:
        ValueError: If ``actuals`` and ``preds`` differ in length.
    """
    if len(actuals) != len(preds):
        raise ValueError(
            "actuals and preds must have the same length "
            "(got {} and {})".format(len(actuals), len(preds))
        )

    total = len(preds)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = sum(1 for actual, pred in zip(actuals, preds) if actual == pred)
    return correct / total
            

In [27]:
# Test-set accuracy: logistic regression first, then the 10-nearest-neighbours model.
print(accuracy(y_test, y_lr_pred))
print(accuracy(y_test, y_km_pred))

0.9532163742690059
0.935672514619883


In [28]:
def precision(actuals, preds):
    """Return the precision of binary predictions.

    Precision is TP / (TP + FP). Label ``1`` is the positive class; for a
    row predicted as 1, any actual value other than 1 counts as a false
    positive.

    Args:
        actuals: Sequence of true labels.
        preds: Sequence of predicted labels, the same length as ``actuals``.

    Returns:
        The precision as a float in [0, 1]. Returns 0.0 when nothing is
        predicted positive, where the ratio is otherwise undefined.

    Raises:
        ValueError: If ``actuals`` and ``preds`` differ in length.
    """
    if len(actuals) != len(preds):
        raise ValueError(
            "actuals and preds must have the same length "
            "(got {} and {})".format(len(actuals), len(preds))
        )

    tp = 0
    fp = 0
    # Walk the two sequences pairwise so the inputs only need to be iterable,
    # not positionally indexable with integers.
    for actual, pred in zip(actuals, preds):
        if pred == 1:
            if actual == 1:
                tp += 1
            else:
                fp += 1

    predicted_positives = tp + fp
    # Guard the 0/0 case instead of raising ZeroDivisionError.
    return tp / predicted_positives if predicted_positives else 0.0
            

In [29]:
# Test-set precision: logistic regression first, then the 10-nearest-neighbours model.
print(precision(y_test, y_lr_pred))
print(precision(y_test, y_km_pred))

0.9504950495049505
0.9313725490196079


In [31]:
def f1_score(actuals, preds):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as 2 * P * R / (P + R), with P and R obtained from this
    notebook's ``precision`` and ``recall`` functions.

    Args:
        actuals: Sequence of true labels.
        preds: Sequence of predicted labels, the same length as ``actuals``.

    Returns:
        The F1 score as a float in [0, 1]. Returns 0.0 when precision and
        recall are both zero (no true positives), where the formula would
        otherwise be 0/0.
    """
    precision_score = precision(actuals, preds)
    recall_score = recall(actuals, preds)

    denominator = precision_score + recall_score
    if denominator == 0:
        # Both components are zero; avoid the 0/0 ZeroDivisionError.
        return 0.0
    return 2 * precision_score * recall_score / denominator

In [32]:
# Test-set F1: logistic regression first, then the 10-nearest-neighbours model.
print(f1_score(y_test, y_lr_pred))
print(f1_score(y_test, y_km_pred))

0.96
0.945273631840796


2. Write your own function for generating an ROC curve plot from model predictions without using sklearn's assistance. Remember that ROC plots true positive rate (recall) vs. false positive rate for a given probability decision threshold. So you should loop over a range of probability cutoffs from 1 to 0, convert a model's predicted probabilities (`model.predict_proba()[:,1]`) to target labels using each cutoff, and plot the results as a curve.  

In [38]:
# Logistic-regression predicted probabilities for the test rows; column index 1
# is the one the exercise text says to threshold when building the ROC curve.
y_p = lr.predict_proba(X_test)[:, 1]

In [None]:
def roc_points(actuals, probs, n_thresholds=101):
    """Compute ROC curve points by sweeping a probability cutoff from 1 to 0.

    For each cutoff, a row is predicted positive when its probability is
    greater than or equal to the cutoff. Label ``1`` is the positive class;
    every other actual label is treated as negative.

    Args:
        actuals: Sequence of true labels.
        probs: Sequence of predicted positive-class probabilities, the same
            length as ``actuals``.
        n_thresholds: Number of evenly spaced cutoffs between 1 and 0,
            inclusive (use at least 2).

    Returns:
        A ``(fpr, tpr)`` tuple of numpy arrays with one entry per cutoff,
        ordered from cutoff 1 down to cutoff 0. A rate whose denominator is
        zero (no negatives or no positives in ``actuals``) is reported as 0.0.

    Raises:
        ValueError: If ``actuals`` and ``probs`` differ in shape.
    """
    actuals = np.asarray(actuals)
    probs = np.asarray(probs, dtype=float)
    if actuals.shape != probs.shape:
        raise ValueError(
            "actuals and probs must have the same shape "
            "(got {} and {})".format(actuals.shape, probs.shape)
        )

    is_positive = actuals == 1
    n_positive = int(is_positive.sum())
    n_negative = int(is_positive.size) - n_positive

    fpr = []
    tpr = []
    for threshold in np.linspace(1, 0, n_thresholds):
        predicted_positive = probs >= threshold
        tp = int(np.sum(predicted_positive & is_positive))
        fp = int(np.sum(predicted_positive & ~is_positive))
        tpr.append(tp / n_positive if n_positive else 0.0)
        fpr.append(fp / n_negative if n_negative else 0.0)
    return np.array(fpr), np.array(tpr)


def plot_roc_curve(actuals, probs, n_thresholds=101, label="model"):
    """Plot true positive rate against false positive rate for a model.

    The curve is drawn through pandas' ``.plot`` accessor, which needs a
    plotting backend (matplotlib by default) to be installed.

    Args:
        actuals: Sequence of true labels.
        probs: Sequence of predicted positive-class probabilities.
        n_thresholds: Number of cutoffs passed on to ``roc_points``.
        label: Legend label for the curve.

    Returns:
        The axes object returned by pandas, so callers can keep customising it.
    """
    fpr, tpr = roc_points(actuals, probs, n_thresholds)
    ax = pd.DataFrame({"fpr": fpr, "tpr": tpr}).plot(x="fpr", y="tpr", label=label)
    # Diagonal reference line: the expected curve of a random classifier.
    ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
    ax.set(
        xlabel="False positive rate",
        ylabel="True positive rate (recall)",
        title="ROC curve",
        xlim=(0, 1),
        ylim=(0, 1),
    )
    ax.legend()
    return ax


# Usage (run in its own cell):
#     plot_roc_curve(y_test, y_p, label="logistic regression");
