In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
df = pd.read_csv('Diabetes-Classification.csv')
df.head()

Unnamed: 0,Age,Gender,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis
0,45,Male,25.0,Normal,100,5.7,No,No,Healthy,Regular,No
1,55,Female,30.0,High,120,6.4,Yes,Yes,Poor,No,Yes
2,65,Male,35.0,High,140,7.1,Yes,Yes,Poor,No,Yes
3,75,Female,40.0,High,160,7.8,Yes,Yes,Poor,No,Yes
4,40,Male,20.0,Normal,80,5.0,No,No,Healthy,,No


In [9]:
df.info() #vizualise data type of each label and if they have any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         128 non-null    int64  
 1   Gender                      128 non-null    object 
 2   BMI                         125 non-null    float64
 3   Blood Pressure              128 non-null    object 
 4   FBS                         128 non-null    int64  
 5   HbA1c                       128 non-null    float64
 6   Family History of Diabetes  126 non-null    object 
 7   Smoking                     126 non-null    object 
 8   Diet                        126 non-null    object 
 9   Exercise                    127 non-null    object 
 10  Diagnosis                   128 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 11.1+ KB


In [10]:
df.isnull().sum()

Age                           0
Gender                        0
BMI                           3
Blood Pressure                0
FBS                           0
HbA1c                         0
Family History of Diabetes    2
Smoking                       2
Diet                          2
Exercise                      1
Diagnosis                     0
dtype: int64

In [11]:
df = df.dropna()
df.head()


Unnamed: 0,Age,Gender,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis
0,45,Male,25.0,Normal,100,5.7,No,No,Healthy,Regular,No
1,55,Female,30.0,High,120,6.4,Yes,Yes,Poor,No,Yes
2,65,Male,35.0,High,140,7.1,Yes,Yes,Poor,No,Yes
3,75,Female,40.0,High,160,7.8,Yes,Yes,Poor,No,Yes
5,50,Female,25.0,Normal,100,5.7,No,No,Healthy,Regular,No


In [14]:
#1: Categorical data is in string format
#2: Categorical variables confuse the model

#label encoding

from sklearn.preprocessing import LabelEncoder

cat_col = ['Gender','Blood Pressure', 'Family History of Diabetes', 'Smoking', 'Diet', 'Exercise', 'Diagnosis']

le = LabelEncoder()

for col in cat_col:
    df[col] = le.fit_transform(df[col])

In [15]:
df.head()

Unnamed: 0,Age,Gender,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis
0,45,1,25.0,2,100,5.7,0,0,0,1,0
1,55,0,30.0,0,120,6.4,1,1,1,0,1
2,65,1,35.0,0,140,7.1,1,1,1,0,1
3,75,0,40.0,0,160,7.8,1,1,1,0,1
5,50,0,25.0,2,100,5.7,0,0,0,1,0


In [27]:
#One hot encoding (important because many algorithms expect numeric input and may misinterpret integer-coded categorical data 
#as containing ordinal (ranked) information, which is not correct for unordered categories.)
#one hot encoding can increase the number of features (columns) in your dataset

#reload the dataset again
df = pd.read_csv('Diabetes-Classification.csv')
one_encoded = pd.get_dummies(df, columns = ['Gender'])
one_encoded.head()

Unnamed: 0,Age,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis,Gender_Female,Gender_Male
0,45,25.0,Normal,100,5.7,No,No,Healthy,Regular,No,False,True
1,55,30.0,High,120,6.4,Yes,Yes,Poor,No,Yes,True,False
2,65,35.0,High,140,7.1,Yes,Yes,Poor,No,Yes,False,True
3,75,40.0,High,160,7.8,Yes,Yes,Poor,No,Yes,True,False
4,40,20.0,Normal,80,5.0,No,No,Healthy,,No,False,True


In [30]:
one_encoded['Gender_Male'] = le.fit_transform(one_encoded['Gender_Male'])
one_encoded['Gender_Female'] = le.fit_transform(one_encoded['Gender_Female'])
one_encoded

Unnamed: 0,Age,BMI,Blood Pressure,FBS,HbA1c,Family History of Diabetes,Smoking,Diet,Exercise,Diagnosis,Gender_Female,Gender_Male
0,45,25.0,Normal,100,5.7,No,No,Healthy,Regular,No,0,1
1,55,30.0,High,120,6.4,Yes,Yes,Poor,No,Yes,1,0
2,65,35.0,High,140,7.1,Yes,Yes,Poor,No,Yes,0,1
3,75,40.0,High,160,7.8,Yes,Yes,Poor,No,Yes,1,0
4,40,20.0,Normal,80,5.0,No,No,Healthy,,No,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
123,17,15.0,Normal,100,5.7,No,Yes,Poor,No,Yes,1,0
124,22,19.0,Normal,120,6.4,No,Yes,Poor,No,Yes,0,1
125,27,24.0,High,140,7.1,No,Yes,Poor,No,Yes,1,0
126,32,29.0,High,160,7.8,No,Yes,Poor,No,Yes,0,1


In [None]:
#how to choose from either label encoding or onehot encoding:
#it depends on the type of categorical variables:

#label encoding: when values in features are ordinal: order Ex: Education - Bachelors > Masters > PhD
#onehot encoding: when categories are not in order(Nominal Values): Gender: Female, Male, Smoking etc.

In [None]:
#applying one hot encoding to all columns except Diagnosis which is the label

cat_col = ['Gender', 'Exercise', 'Blood Pressure', 'Family History of Diabetes', 'Smoking', 'Diet']  
one_encoded = pd.get_dummies(df, columns = cat_col)

#one_encoded['Gender_Female'] = le.fit_transform(one_encoded['Gender_Female'])

one_encoded

Unnamed: 0,Age,BMI,FBS,HbA1c,Diagnosis,Gender_Female,Gender_Male,Exercise_No,Exercise_Regular,Blood Pressure_High,Blood Pressure_Low,Blood Pressure_Normal,Family History of Diabetes_No,Family History of Diabetes_Yes,Smoking_No,Smoking_Yes,Diet_Healthy,Diet_Poor
0,45,25.0,100,5.7,No,False,True,False,True,False,False,True,True,False,True,False,True,False
1,55,30.0,120,6.4,Yes,True,False,True,False,True,False,False,False,True,False,True,False,True
2,65,35.0,140,7.1,Yes,False,True,True,False,True,False,False,False,True,False,True,False,True
3,75,40.0,160,7.8,Yes,True,False,True,False,True,False,False,False,True,False,True,False,True
4,40,20.0,80,5.0,No,False,True,False,False,False,False,True,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,17,15.0,100,5.7,Yes,True,False,True,False,False,False,True,True,False,False,True,False,True
124,22,19.0,120,6.4,Yes,False,True,True,False,False,False,True,True,False,False,True,False,True
125,27,24.0,140,7.1,Yes,True,False,True,False,True,False,False,True,False,False,True,False,True
126,32,29.0,160,7.8,Yes,False,True,True,False,True,False,False,True,False,False,True,False,True


In [33]:
one_encoded.columns


Index(['Age', 'BMI', 'FBS', 'HbA1c', 'Diagnosis', 'Gender_Female',
       'Gender_Male', 'Exercise_No', 'Exercise_Regular', 'Blood Pressure_High',
       'Blood Pressure_Low', 'Blood Pressure_Normal',
       'Family History of Diabetes_No', 'Family History of Diabetes_Yes',
       'Smoking_No', 'Smoking_Yes', 'Diet_Healthy', 'Diet_Poor'],
      dtype='object')

In [22]:
one_encoded.head()

Unnamed: 0,Age,BMI,FBS,HbA1c,Diagnosis,Gender_Female,Gender_Male,Exercise_No,Exercise_Regular,Blood Pressure_High,Blood Pressure_Low,Blood Pressure_Normal,Family History of Diabetes_No,Family History of Diabetes_Yes,Smoking_No,Smoking_Yes,Diet_Healthy,Diet_Poor
0,45,25.0,100,5.7,No,False,True,False,True,False,False,True,True,False,True,False,True,False
1,55,30.0,120,6.4,Yes,True,False,True,False,True,False,False,False,True,False,True,False,True
2,65,35.0,140,7.1,Yes,False,True,True,False,True,False,False,False,True,False,True,False,True
3,75,40.0,160,7.8,Yes,True,False,True,False,True,False,False,False,True,False,True,False,True
4,40,20.0,80,5.0,No,False,True,False,False,False,False,True,True,False,True,False,True,False


In [34]:
#label encoding the one hot encoded columns to convert all the string values in newly generated columns to integers
cat_col = ['Diagnosis', 'Gender_Female',
       'Gender_Male', 'Exercise_No', 'Exercise_Regular', 'Blood Pressure_High',
       'Blood Pressure_Low', 'Blood Pressure_Normal',
       'Family History of Diabetes_No', 'Family History of Diabetes_Yes',
       'Smoking_No', 'Smoking_Yes', 'Diet_Healthy', 'Diet_Poor', 'Diagnosis']

from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
for col in cat_col:
    one_encoded[col]=le.fit_transform(one_encoded[col])  


one_encoded.head()    

Unnamed: 0,Age,BMI,FBS,HbA1c,Diagnosis,Gender_Female,Gender_Male,Exercise_No,Exercise_Regular,Blood Pressure_High,Blood Pressure_Low,Blood Pressure_Normal,Family History of Diabetes_No,Family History of Diabetes_Yes,Smoking_No,Smoking_Yes,Diet_Healthy,Diet_Poor
0,45,25.0,100,5.7,0,0,1,0,1,0,0,1,1,0,1,0,1,0
1,55,30.0,120,6.4,1,1,0,1,0,1,0,0,0,1,0,1,0,1
2,65,35.0,140,7.1,1,0,1,1,0,1,0,0,0,1,0,1,0,1
3,75,40.0,160,7.8,1,1,0,1,0,1,0,0,0,1,0,1,0,1
4,40,20.0,80,5.0,0,0,1,0,0,0,0,1,1,0,1,0,1,0


In [24]:
#Descriptive analysis: 
#Exploratory data analysis df info, correlation matrices, pair plot, box plot etc
#Assignment: 

#Perform -
#Descriptive analysis
#Exploratory data analysis
#here.

#we will discuss about these in the mentoring session

In [25]:
#Feature Selection: Removing irrelevant features which do not contribute towards the label(Diagnosis)

from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func = f_classif, k = 5)

In [26]:
#splitting label and features
x = one_encoded.drop('Diagnosis', axis = 1)
y = one_encoded['Diagnosis']

In [None]:
#applying feature selection
x_selected=selector.fit_transform(x,y)
selected_features=x.columns[selector.get_support()]
print(selected_features)

In [None]:
#created df with selected features
x1 = one_encoded[selected_features]


In [None]:
#train, test, split
from sklearn.model_selection import train_test_split

#for all features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 10)

#for selected features
x1_train, x1_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 10)

In [None]:
#buiding knn models
from sklearn.neighbors import KNeighborsClassifier 

knn = KNeighborsClassifier(n_neighbors = 5)
knn_selected = KNeighborsClassifier(n_neighbors = 5)

In [None]:
#training moddels
#all features
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

#selected features
knn_selected.fit(x1_train, y_train)
y_pred_knn1 = knn_selected.predict(x1_test)

In [None]:
#classification report

from sklearn.metrics import classification_report

print("nKNN Performance:")
print(classification_report(y_test, y_pred_knn))

print("nKNN_Selected Performance:")
print(classification_report(y_test, y_pred_knn1))

In [None]:
#building svm models
from sklearn.svm import SVC

svm=SVC(kernel='rbf', probability=True,random_state=10 )
svm_selected=SVC(kernel='rbf', probability=True,random_state=10 )

In [None]:
#training models
#all features
svm.fit(x_train,y_train)
y_pred_svm=svm.predict(x_test)


#selcted features
svm_selected.fit(x1_train,y_train)
y_pred_svm1=svm_selected.predict(x1_test)

In [None]:
#classification report

from sklearn.metrics import classification_report

print("SVM Performance:")
print(classification_report(y_test, y_pred_svm))

print("nSVM_Selected Performance:")
print(classification_report(y_test, y_pred_svm))

In [None]:
#confusion: it show labelwise performance of the model

from sklearn.metrics import confusion_matrix

cm=confusion_matrix(y_test,y_pred_svm)
sns.heatmap(cm,annot=True)

In [None]:
#plot ROC curve for svm all features model
from sklearn.metrics import roc_auc_score, roc_curve

y_proba = svm.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f'{'svm'} (AUC = {roc_auc_score(y_test, y_proba):.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM(all features) model')
plt.legend()
plt.show()

In [None]:
#Assignment: 

#1:Plot confusion matrix and ROC curves for other models
#2:Learn more about evaluation metrics sucha as confusion matrix, accuracy, precision, recall, AUC, ROC etc
#3:Compare the performance(accuracy, f1 score etc) of SVM and KNN models we created using graph plots
#4: Re-do the entire analysis with label encoding instead of one hot encoding and notice performance difference

#we will discuss about these in mentoring session