In [None]:
# 1. Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for data visualization
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this suppresses every warning for the rest of the session, including
# deprecation notices from numpy / pandas; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# 2. Import dataset
# header=None tells pandas the file has no header row: the first line is read as data
# and the columns get integer labels until they are renamed in a later cell.
# NOTE(review): if the CSV does start with a header row, that row is loaded as a
# data record — confirm against the file.
df = pd.read_csv("/content/heart_disease_data.csv", header=None)

In [None]:
# 3. Exploratory data analysis
# A notebook cell only displays its last expression, so the shape is printed
# explicitly; as a bare expression it was evaluated and silently discarded.
print(df.shape)
df.head()

In [None]:
# 4. Rename column names
# NOTE(review): these labels look like they describe a breast-cancer cytology dataset,
# while the file loaded above is named heart_disease_data.csv, and the last three are
# placeholders — confirm that the names actually match the data.
col_names = ['Id', 'Clump_thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape', 'Marginal_Adhesion',
             'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class','Colum12','Colum13','colum14']

# Assigning to df.columns raises ValueError unless len(col_names) equals the
# number of columns currently in the frame.
df.columns = col_names

# Print the new labels explicitly: a bare `df.columns` in the middle of a cell
# produces no output because only the last expression is displayed.
print(df.columns)
df.head()

In [None]:
# Drop redundant columns
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: the in-place drop raised KeyError on a second execution because 'Id'
# had already been removed.
df = df.drop(columns='Id', errors='ignore')

In [None]:
#View summary of dataset
df.info()

In [None]:
# Frequency distribution of values in variables
for var in df.columns:

    print(df[var].value_counts())


In [None]:
# Convert `Bare_Nuclei` to a numeric dtype.
# errors='coerce' replaces every value that cannot be parsed as a number with NaN,
# so the resulting column may be float rather than integer.
df['Bare_Nuclei'] = pd.to_numeric(df['Bare_Nuclei'], errors='coerce')

In [None]:
# Check the data type of each column of the dataframe
df.dtypes

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
# Count `na` values per column.
# NOTE: DataFrame.isnull is an alias of DataFrame.isna, so this repeats the previous cell.

df.isna().sum()

In [None]:
# Frequency distribution of the `Bare_Nuclei` column (value_counts omits NaN by default)

df['Bare_Nuclei'].value_counts()

In [None]:
# Distinct values in the `Bare_Nuclei` column (unlike value_counts, NaN is listed if present)

df['Bare_Nuclei'].unique()

In [None]:
# Number of NaN values in the `Bare_Nuclei` column

df['Bare_Nuclei'].isna().sum()

In [None]:
# Frequency distribution of the values in the `Class` variable

df['Class'].value_counts()

In [None]:
# View each `Class` value as a fraction of all rows.
# The np.float alias was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError; true division by the integer row count already yields floats,
# so no cast is needed.

df['Class'].value_counts() / len(df)

In [None]:
# Load the necessary python libraries
# (numpy, pandas and pyplot are already imported in the first cell; re-importing is harmless)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use matplotlib's 'ggplot' style for figures created from here on.
plt.style.use('ggplot')
# NOTE(review): this rebinds `df`, discarding the frame loaded from
# heart_disease_data.csv; every cell below operates on heart.csv instead.
df = pd.read_csv("/content/heart.csv")
# Preview up to the first 163 rows.
# NOTE(review): the reason for 163 is not evident from the code — confirm or shorten.
df.head(163)

In [None]:
#Let's create numpy arrays for features and target
X = df.drop('target',axis=1).values
y = df['target'].values

In [None]:
#importing train_test_split
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42, stratify=y)

In [None]:
# k-NN classifier used for the sweep over k
from sklearn.neighbors import KNeighborsClassifier

# Candidate values of k (1..8) and one accuracy slot per candidate
neighbors = np.arange(1, 9)
n_candidates = len(neighbors)
train_accuracy = np.empty(n_candidates)
test_accuracy = np.empty(n_candidates)

for idx, k in enumerate(neighbors):
    # fit() returns the estimator itself, so construction and fitting can be chained
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Record accuracy on the training split and on the test split for this k
    train_accuracy[idx] = knn.score(X_train, y_train)
    test_accuracy[idx] = knn.score(X_test, y_test)

In [None]:
# Plot test and training accuracy against the number of neighbors
fig, ax = plt.subplots()
ax.plot(neighbors, test_accuracy, label='Testing Accuracy')
ax.plot(neighbors, train_accuracy, label='Training accuracy')
ax.set_title('k-NN Varying number of neighbors')
ax.set_xlabel('Number of neighbors')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Set up a k-NN classifier with k = 7
# NOTE(review): 7 is presumably read off the accuracy plot above — confirm.
knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
# Fit the model on the training split
knn.fit(X_train,y_train)

In [None]:
# Accuracy on the held-out test split. Note: for classifiers, score() returns accuracy.
knn.score(X_test,y_test)

In [None]:
# import confusion_matrix
from sklearn.metrics import confusion_matrix

In [None]:
# Predict labels for the test split using the classifier fitted above
y_pred = knn.predict(X_test)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes
confusion_matrix(y_test,y_pred)

In [None]:
# import classification_report
from sklearn.metrics import classification_report

In [None]:
# Text report of per-class precision, recall, F1 score and support
print(classification_report(y_test,y_pred))

In [None]:
# ROC (Receiver Operating Characteristic) curve
from sklearn.metrics import roc_curve

# Column 1 of predict_proba is the probability of the second class in knn.classes_
# (presumably target == 1, the positive class — confirm against the data).
y_pred_proba = knn.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Dashed diagonal as a visual reference line from (0, 0) to (1, 1)
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# Read k from the fitted model so the title cannot drift from the classifier plotted
plt.title('Knn(n_neighbors={}) ROC curve'.format(knn.n_neighbors))
# Without legend() the label passed to plot() above is never rendered
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve, computed from the probabilities stored in y_pred_proba
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_pred_proba)

In [None]:
# Hyperparameter tuning
#import GridSearchCV
from sklearn.model_selection import GridSearchCV
#In case of classifier like knn the parameter to be tuned is n_neighbors
param_grid = {'n_neighbors':np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X,y)

In [None]:
knn_cv.best_score_

In [None]:
knn_cv.best_params_
# Thus a knn classifier with number of neighbors as 14 achieves the best score/accuracy of 0.7578 i.e about 76%