# MODELS

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn
from KNN import organize_and_split, standard_confusion_matrix, get_KNN, get_gridsearch_params, get_true_values, get_smaller_df
# Relative path to the cleaned heart-disease CSV that every split below is built from.
filename = '../Data/heart-disease-cleaned.csv'


### The organize_and_split function splits the data into training and test sets with training=0.7 and test=0.3

In [29]:
# organize_and_split is a project helper imported from KNN; it takes the CSV path
# and returns the train/test features and labels in this order.
X_train, X_test, y_train, y_test = organize_and_split(filename)

### KNN with K = 1

In [30]:
# Baseline run with a single neighbor. get_KNN prints K, accuracy, precision,
# recall and the confusion matrix (presumably fitting on the training split and
# scoring on the test split - confirm in KNN.py).
get_KNN(X_train, X_test, y_train, y_test, n_neighbors=1)

K =  1

Accuracy:  0.844444444444

Precision:  0.813953488372

Recal:  0.853658536585

Confusion Matrix:  [[ 35.   8.]
 [  6.  41.]]


### Using GridSearch to fit different parameters to the classifier
#### GridSearchCV performs an exhaustive, cross-validated search over a parameter grid: it is passed the grid and runs cross-validation for every possible combination of parameters. In this case, the only parameter I'm trying to optimize is the number of neighbors passed to the classifier.

In [31]:
# Unfitted base estimator handed to the grid search; n_jobs=-1 asks scikit-learn
# to use all available processors for the neighbor search.
clf = KNeighborsClassifier(n_jobs=-1)

In [32]:
# Display the estimator's repr to show the settings it starts with.
clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')

In [33]:
# Run the parameter search on the training split only, so the test split stays
# unseen. The returned object exposes best_params_ and best_score_ (presumably a
# fitted GridSearchCV - confirm in KNN.py).
gs = get_gridsearch_params(clf, X_train, y_train)

### The `best_params_` attribute gives the parameter combination that achieved the best score

In [34]:
# Parameter combination selected by the search on the full feature set.
gs.best_params_


{'n_neighbors': 8}

### The `best_score_` attribute gives the mean cross-validated score achieved by the best parameters

In [35]:
# Score the search reports for the selected parameters (computed on training data only).
gs.best_score_

0.81159420289855078

#### Applying this to our test data:

In [36]:
# Re-run the evaluation with the K chosen by the grid search (n_neighbors=8).
get_KNN(X_train, X_test, y_train, y_test, n_neighbors=8)

K =  8

Accuracy:  0.866666666667

Precision:  0.939393939394

Recal:  0.756097560976

Confusion Matrix:  [[ 31.   2.]
 [ 10.  47.]]


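#### Compared with K = 1, we have increased our accuracy by about 0.02 (0.844 → 0.867). Precision rose (0.814 → 0.939), but recall fell (0.854 → 0.756).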

## With a reduced feature space (the 6 variables most highly correlated with the target)

In [49]:
# Columns kept for the reduced model: six feature columns plus 'diagnosis'
# (presumably the target column - confirm against get_smaller_df in KNN.py).
corr_cols = ['cp', 'exang', 'oldpeak', 'ca', 'thal', 'thalach', 'diagnosis']
# Build a separate train/test split restricted to those columns. The "2" suffix
# keeps these variables distinct from the full-feature split above.
X2_train, X2_test, y2_train, y2_test = get_smaller_df(filename, corr_cols)

In [51]:
# Baseline on the reduced feature set with a single neighbor, for comparison
# with the full-feature K = 1 run.
get_KNN(X2_train, X2_test, y2_train, y2_test, n_neighbors=1)

K =  1

Accuracy:  0.744444444444

Precision:  0.725

Recal:  0.707317073171

Confusion Matrix:  [[ 29.  11.]
 [ 12.  38.]]


In [52]:
# Fresh, unfitted estimator with default settings for the reduced-feature search.
# NOTE(review): this rebinds `clf` and, unlike the earlier estimator, does not set
# n_jobs=-1.
clf = KNeighborsClassifier()


In [55]:
# Repeat the parameter search on the reduced-feature training split.
gs2 = get_gridsearch_params(clf, X2_train, y2_train)

In [56]:
# Parameter combination selected by the search on the reduced feature set.
gs2.best_params_


{'n_neighbors': 8}

In [57]:
# Score the search reports for the selected parameters on the reduced feature set.
gs2.best_score_

0.81642512077294682

### Reduced Features with optimal value for K:

In [59]:
# Evaluate the reduced-feature split with the K chosen by its grid search (n_neighbors=8).
get_KNN(X2_train, X2_test, y2_train, y2_test, n_neighbors=8)

K =  8

Accuracy:  0.866666666667

Precision:  1.0

Recal:  0.707317073171

Confusion Matrix: 
[[ 29.   0.]
 [ 12.  49.]]
