In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Content
* [Import Data](#1)
* [Data Cleaning](#2)
* [KNN](#3)

<a id = "1"></a>
#### Import Data

* First look at the data
* We must read our CSV files from the input directory

In [30]:
# Directory that holds the two CSV splits of the dataset.
data_dir = "../input/disease-prediction-using-machine-learning"

# Read the training and testing splits into separate frames.
train_df = pd.read_csv(f"{data_dir}/Training.csv")
test_df = pd.read_csv(f"{data_dir}/Testing.csv")

# Preview the first rows of the training frame.
train_df.head()

In [31]:
# Print a summary of the training frame (index range, column dtypes, memory usage).
train_df.info()

There are 4920 samples and 134 features in the dataset. First we have to dive deeper into the data so that we can understand it completely. The data is clean and balanced, so we don't need to handle missing values, outliers, etc. Just one column will be dropped in a later stage of this kernel.

In [32]:
# Display the column names at positions 30 through 59 (the slice end, 60, is exclusive).
train_df.columns[30:60]

In [33]:
# Inspect a single symptom column as an example.
# Author's note: all of the symptom columns hold encoded data with integer dtype.
train_df["indigestion"]

In [34]:
# List the distinct values that appear in the "prognosis" column.
train_df["prognosis"].unique()

In [35]:
# Check how balanced the classes are for classification training:
# draw one bar per prognosis value and print the per-class row counts.
sns.set_theme(style="darkgrid")
# Tall figure so that every prognosis label gets its own row on the y axis.
plt.figure(figsize = (12,30))
# NOTE(review): the x tick rotation is set before the plot is drawn — confirm it
# still applies to the count axis once countplot has rendered.
plt.xticks(rotation = 90)
# y="prognosis" puts the categories on the y axis, giving horizontal bars.
sns.countplot(y="prognosis", data=train_df)
print(train_df["prognosis"].value_counts())

In [36]:
# Pairwise linear correlation between a subset of the features, drawn as a
# heatmap: which symptoms tend to occur together?
# Only the columns at positions 10 through 39 are included.
df_corr = train_df.iloc[:, 10:40]
corr_matrix = df_corr.corr()

plt.figure(figsize=(30, 30))
# annot=True writes each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True)
plt.show()

Looking at the correlation map, we can infer the following:
* yellowish skin and abdominal pain have a high correlation coefficient, which means these features usually occur together (maybe a liver problem)
* cough and breathlessness also have a high correlation (this is expected, because lung diseases produce both symptoms)
* restlessness and irregular sugar level usually occur together

<a id = "2"></a>
#### Data Cleaning

In [37]:
# Drop the unnamed feature from the train data.
# (Presumably an artefact of a trailing delimiter in Training.csv — TODO confirm.)
#
# The result is assigned to a new frame object instead of mutating in place, and
# errors="ignore" keeps the cell idempotent: a plain drop would raise KeyError
# when the cell is run a second time, because the column is already gone.
train_df = train_df.drop(columns="Unnamed: 133", errors="ignore")

# Sanity check: the column must not be present from here on.
assert "Unnamed: 133" not in train_df.columns

In [38]:
# Print the frame summary again to confirm the column list after the drop.
train_df.info()

In [39]:
# Preview the first rows of the training frame after cleaning.
train_df.head()

OK. Now let's create our ML models
<a id = "3"></a>
### KNN

In [40]:
# Modelling: fit a k-nearest-neighbours classifier on the training split and
# evaluate it on the separate testing split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 5)  # k = 5

# Features are every column except the target label "prognosis".
x_train, y_train = train_df.drop(columns="prognosis"), train_df["prognosis"]

# Select the test features by NAME, using the training feature list.
# A boolean mask built from train_df.columns is purely positional when applied
# to test_df: it only works while both frames have the same number of columns
# in the same order, and would otherwise raise or silently pick the wrong
# columns. Label-based selection guarantees the same features in the same
# order, and fails loudly (KeyError) if the test file lacks one of them.
x_test, y_test = test_df[x_train.columns], test_df["prognosis"]

knn.fit(x_train, y_train)
prediction = knn.predict(x_test)
print("Prediction list: {}".format(prediction[0:20]))
# For a classifier, score() is the mean accuracy on the given data.
print("With KNN (K=5) accuracy is: ",knn.score(x_test, y_test))

<a id = "5"></a>
#### Cross Validation

In [41]:
from sklearn.model_selection import cross_val_score

# Fresh classifier with default hyper-parameters. It is fitted on the full
# training set here because a later cell calls knn.predict(x_test);
# cross_val_score itself fits clones of the estimator, one per fold.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

k = 5 # for K-fold cross validation
# Each fold is scored by classification accuracy. That is already the default
# score of a classifier (not R^2, which applies to regressors); it is spelled
# out here so the metric is unambiguous.
cv_result = cross_val_score(knn, x_train, y_train, cv = k, scoring = "accuracy")
print("CV scores: ", cv_result)
print("CV scores average: ", cv_result.mean())

<a id = "10"></a>
#### Metrics

In [42]:
from sklearn.metrics import classification_report, confusion_matrix

# Test-set predictions keyed by model name, so that further models can be
# reported by the same loop.
y_predictions = {"KNN": knn.predict(x_test)}

# Print the confusion matrix and the per-class report for every model.
for model_name, predicted in y_predictions.items():
    matrix = confusion_matrix(y_test, predicted)
    print(model_name,'Confusion matrix: \n',matrix)
    print("------------------")
    report = classification_report(y_test, predicted)
    print(model_name, 'Classification report: \n',report)