In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling and set the default figure size
# (width, height) used by every plot in this notebook.
sns.set(rc={'figure.figsize':(11,8)})

import warnings
# NOTE(review): with no category or module filter this silences every warning,
# including deprecation notices from pandas / scikit-learn. Consider narrowing
# it to specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

### How good is your model?
#### Classification metrics
- Measuring model performance with accuracy:
 - Fraction of correctly classified samples
 - Not always a useful metric

#### Class imbalance examples: Emails
- Spam classification 
 - 99% of emails are real; 1% of emails are spam
- Could build a classifier that predicts ALL emails as real
 - 99% accurate!!
 - But horrible at actually classifying spam
 - Fails at its original purpose
- Needs more nuanced metrics

#### Diagnosing classification predictions
- Confusion matrix

|          |    Predicted: Spam Email|    Predicted: Real Email|
| ---- | ---- | ---- |
|Actual: Spam Email|True Positive|False Negative|
|Actual: Real Email|False Positive|True Negative|

### Metrics from the confusion matrix
- Precision: $\displaystyle\frac{tp}{tp+fp}$
<br>
- Recall: $\displaystyle\frac{tp}{tp+fn}$
<br> 
- F1 score: $\displaystyle 2\cdot\frac{precision \cdot recall}{precision + recall}$
 - The harmonic mean of precision and recall
- High precision: Not many real emails predicted as spam
- High recall: Predicted most spam emails correctly 

In [2]:
# Column names for house-votes-84.csv: the party label followed by one column
# per recorded vote. The names are passed explicitly to read_csv below.
columns = ['party', 'infants', 'water', 'budget', 'physician', 'salvador', 'religious',
           'satellite', 'aid', 'missile', 'immigration', 'synfuels', 'education', 'superfund',
           'crime', 'duty_free_exports', 'eaa_rsa']
vote_columns = columns[1:]

# '?' marks a missing vote in the file, so parse it as NaN while reading.
df = pd.read_csv('house-votes-84.csv', names=columns, na_values='?')

# Impute each missing vote with the next recorded vote in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which newer pandas deprecates.
df = df.bfill()

# Back-fill cannot fill a gap that has no later value in its column, so this
# one cell is patched by hand (presumably it sits in the last row -- TODO confirm).
df.iloc[434, df.columns.get_loc('duty_free_exports')] = 'n'

# Fail here, not deep inside the model fit, if any gap survived imputation.
assert not df.isna().any().any(), 'missing votes remain after imputation'

# Encode votes as integers (n -> 0, y -> 1) with an explicit dtype instead of
# relying on replace() to downcast object columns implicitly.
df[vote_columns] = (
    df[vote_columns]
    .apply(lambda votes: votes.map({'n': 0, 'y': 1}))
    .astype('int64')
)
df.head()

Unnamed: 0,party,infants,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free_exports,eaa_rsa
0,republican,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,0
2,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1


In [4]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
def auto():
    """Print the ``train_test_split`` unpacking line as a copy-paste template.

    Returns:
        None. The template is written to standard output.
    """
    template = 'X_train, X_test, y_train, y_test = train_test_split()'
    print(template)

In [5]:
# Print the train_test_split unpacking line so it can be copied into a later cell.
auto()

X_train, X_test, y_train, y_test = train_test_split()


In [6]:
# Separate the frame into model inputs and target: every column except
# 'party' is a feature, and 'party' is the label to predict.
features = df.drop(labels='party', axis='columns')
X = features.values
y = df.loc[:, 'party'].values

In [7]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Fit an 8-nearest-neighbours classifier on the training rows, then predict
# the party for the held-out rows.
knn = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [8]:
# Rows are the actual parties and columns the predicted ones, in sorted label
# order (democrat, republican); each row sums to that class's support.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[107   8]
 [  5  54]]


In [9]:
# Per-class precision, recall, F1 and support for the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

   democrat       0.96      0.93      0.94       115
 republican       0.87      0.92      0.89        59

avg / total       0.93      0.93      0.93       174

