In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ckdisease/kidney_disease.csv


# **Aim of the CKD Dataset**

The aim of the Chronic Kidney Disease (CKD) dataset is to facilitate the study and prediction of chronic kidney disease by using machine learning and statistical analysis techniques. This dataset typically contains medical and laboratory information about patients, including features such as age, blood pressure, specific laboratory tests (e.g., serum creatinine, blood urea, hemoglobin), and other relevant clinical data.

Key objectives include:

**Predicting Chronic Kidney Disease**: Developing predictive models to accurately classify whether a patient has CKD based on the available features.

**Understanding Risk Factors**: Identifying key factors or indicators that contribute to the onset and progression of CKD, which can aid in early detection and intervention.

**Improving Clinical Decision-Making**: Assisting healthcare professionals in making informed decisions regarding diagnosis, treatment, and management of CKD.

This dataset is widely used for research, education, and the development of machine learning models aimed at improving patient outcomes in the context of CKD.

# **Understanding about the CKD Dataset Features**

<table border="1" cellpadding="5" cellspacing="0">
  <thead>
    <tr>
      <th>Feature</th>
      <th>Description</th>
      <th>Type</th>
      <th>Example</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Age</td>
      <td>The age of the patient in years.</td>
      <td>Numerical</td>
      <td>48 years</td>
    </tr>
    <tr>
      <td>Blood Pressure (bp)</td>
      <td>The patient's blood pressure, typically measured in mm Hg.</td>
      <td>Numerical</td>
      <td>80 (diastolic pressure)</td>
    </tr>
    <tr>
      <td>Specific Gravity (sg)</td>
      <td>Measure of urine concentration, indicating kidney's ability to concentrate urine.</td>
      <td>Categorical (1.005-1.025)</td>
      <td>1.020</td>
    </tr>
    <tr>
      <td>Albumin (al)</td>
      <td>Presence of albumin in urine, indicating kidney damage.</td>
      <td>Categorical (0-5)</td>
      <td>2</td>
    </tr>
    <tr>
      <td>Sugar (su)</td>
      <td>Presence of sugar in urine, possibly indicating diabetes or kidney disease.</td>
      <td>Categorical (0-5)</td>
      <td>3</td>
    </tr>
    <tr>
      <td>Red Blood Cells (rbc)</td>
      <td>Presence of red blood cells in urine, abnormal levels may indicate kidney disease.</td>
      <td>Categorical (normal, abnormal)</td>
      <td>abnormal</td>
    </tr>
    <tr>
      <td>Pus Cell (pc)</td>
      <td>Presence of pus cells in urine, indicating infection or inflammation in kidneys.</td>
      <td>Categorical (normal, abnormal)</td>
      <td>normal</td>
    </tr>
    <tr>
      <td>Pus Cell Clumps (pcc)</td>
      <td>Clumps of pus cells in urine, suggesting a more severe infection.</td>
      <td>Categorical (present, not present)</td>
      <td>not present</td>
    </tr>
    <tr>
      <td>Bacteria (ba)</td>
      <td>Presence of bacteria in urine, possibly indicating a urinary tract infection.</td>
      <td>Categorical (present, not present)</td>
      <td>present</td>
    </tr>
    <tr>
      <td>Blood Glucose Random (bgr)</td>
      <td>Random measurement of blood glucose levels, indicating blood sugar control.</td>
      <td>Numerical (mg/dL)</td>
      <td>121 mg/dL</td>
    </tr>
    <tr>
      <td>Blood Urea (bu)</td>
      <td>Level of urea in blood, an indicator of kidney function.</td>
      <td>Numerical (mg/dL)</td>
      <td>36 mg/dL</td>
    </tr>
    <tr>
      <td>Serum Creatinine (sc)</td>
      <td>Level of creatinine in blood, a marker of kidney function.</td>
      <td>Numerical (mg/dL)</td>
      <td>1.2 mg/dL</td>
    </tr>
    <tr>
      <td>Sodium (sod)</td>
      <td>Level of sodium in blood, affecting kidney function.</td>
      <td>Numerical (mEq/L)</td>
      <td>140 mEq/L</td>
    </tr>
    <tr>
      <td>Potassium (pot)</td>
      <td>Level of potassium in blood, important for kidney function.</td>
      <td>Numerical (mEq/L)</td>
      <td>4.5 mEq/L</td>
    </tr>
    <tr>
      <td>Hemoglobin (hemo)</td>
      <td>Amount of hemoglobin in blood, can be low in kidney disease.</td>
      <td>Numerical (g/dL)</td>
      <td>13.5 g/dL</td>
    </tr>
    <tr>
      <td>Packed Cell Volume (pcv)</td>
      <td>Volume percentage of red blood cells in blood, related to kidney function.</td>
      <td>Numerical (Percentage)</td>
      <td>41%</td>
    </tr>
    <tr>
      <td>White Blood Cell Count (wc)</td>
      <td>Number of white blood cells in blood, indicating infection or inflammation.</td>
      <td>Numerical (cells/cumm)</td>
      <td>8400 cells/cumm</td>
    </tr>
    <tr>
      <td>Red Blood Cell Count (rc)</td>
      <td>Number of red blood cells in blood, related to overall blood health.</td>
      <td>Numerical (millions/cumm)</td>
      <td>5.2 millions/cumm</td>
    </tr>
    <tr>
      <td>Hypertension (htn)</td>
      <td>Indicates whether the patient has hypertension, often associated with CKD.</td>
      <td>Categorical (yes, no)</td>
      <td>yes</td>
    </tr>
    <tr>
      <td>Diabetes Mellitus (dm)</td>
      <td>Indicates whether the patient has diabetes mellitus, which is a risk factor for CKD.</td>
      <td>Categorical (yes, no)</td>
      <td>yes</td>
    </tr>
    <tr>
      <td>Coronary Artery Disease (cad)</td>
      <td>Indicates whether the patient has coronary artery disease, another risk factor for CKD.</td>
      <td>Categorical (yes, no)</td>
      <td>no</td>
    </tr>
    <tr>
      <td>Appetite</td>
      <td>Indicates the patient's appetite, which can be affected by CKD.</td>
      <td>Categorical (good, poor)</td>
      <td>good</td>
    </tr>
    <tr>
      <td>Pedal Edema</td>
      <td>Indicates whether the patient has swelling in the lower extremities, a common symptom in CKD.</td>
      <td>Categorical (yes, no)</td>
      <td>no</td>
    </tr>
    <tr>
      <td>Anemia</td>
      <td>Indicates whether the patient has anemia, which can be a complication of CKD.</td>
      <td>Categorical (yes, no)</td>
      <td>no</td>
    </tr>
    <tr>
      <td>Class</td>
      <td>Indicates whether the patient has CKD or not (the target variable).</td>
      <td>Categorical (ckd, notckd)</td>
      <td>ckd</td>
    </tr>
  </tbody>
</table>


# **Import Required Libraries**

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [3]:
dataset = pd.read_csv(r'/kaggle/input/ckdisease/kidney_disease.csv')

In [4]:
dataset.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# Find null values in our dataset
dataset.isnull().sum()/len(dataset)*100

id                 0.00
age                2.25
bp                 3.00
sg                11.75
al                11.50
su                12.25
rbc               38.00
pc                16.25
pcc                1.00
ba                 1.00
bgr               11.00
bu                 4.75
sc                 4.25
sod               21.75
pot               22.00
hemo              13.00
pcv               17.50
wc                26.25
rc                32.50
htn                0.50
dm                 0.50
cad                0.50
appet              0.25
pe                 0.25
ane                0.25
classification     0.00
dtype: float64

In [6]:
dataset.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [7]:
dataset.dtypes

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object

In [8]:
dataset.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


As we can see that 'packed_cell_volume', 'white_blood_cell_count' and 'red_blood_cell_count' are object type. We need to change them to numerical dtype.

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [10]:
imputer1 = SimpleImputer(strategy='mean')
imputer2 = SimpleImputer(strategy='median')
imputer3 = SimpleImputer(strategy='most_frequent')


In [11]:
dataset.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [12]:
trf = ColumnTransformer([('imputer1', imputer1,['age','bp','sg','al','su','sod','pot', 'hemo']),
                         ('imputer2', imputer2, ['bgr','bu','sc']),
                         ('imputer3', imputer3,['rbc', 'pc', 'pcc', 'ba','pcv', 'wc', 'rc', 'htn', 'dm', 'cad','appet', 'pe', 'ane']),
                         ],
                          remainder='passthrough')

In [13]:
dataset1 = dataset.copy()
dataset1.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [14]:
dataset1 = dataset1.drop(['id'], axis=1)

In [15]:
trf

In [16]:
df_imputed = pd.DataFrame(trf.fit_transform(dataset1))
#df_imputed.columns = dataset1.columns
df_imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,48.0,80.0,1.02,1.0,0.0,137.528754,4.627244,15.4,121.0,36.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,137.528754,4.627244,11.3,121.0,18.0,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,137.528754,4.627244,9.6,423.0,53.0,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,111.0,2.5,11.2,117.0,56.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,137.528754,4.627244,11.6,106.0,26.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.02,0.0,0.0,150.0,4.9,15.7,140.0,49.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,141.0,3.5,16.5,75.0,31.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.02,0.0,0.0,137.0,4.4,15.8,100.0,26.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,135.0,4.9,14.2,114.0,50.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [17]:
df_imputed.columns = ['age','bp','sg','al','su','sod','pot', 'hemo','bgr','bu','sc','rbc', 'pc', 'pcc', 'ba','pcv', 'wc', 'rc', 'htn', 'dm', 'cad','appet', 'pe', 'ane','classification']

In [18]:
df_imputed.head()

Unnamed: 0,age,bp,sg,al,su,sod,pot,hemo,bgr,bu,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,137.528754,4.627244,15.4,121.0,36.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,137.528754,4.627244,11.3,121.0,18.0,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,137.528754,4.627244,9.6,423.0,53.0,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,111.0,2.5,11.2,117.0,56.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,137.528754,4.627244,11.6,106.0,26.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [19]:
print(dataset1.columns)
print("*************"*10)
print(df_imputed.columns)

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')
**********************************************************************************************************************************
Index(['age', 'bp', 'sg', 'al', 'su', 'sod', 'pot', 'hemo', 'bgr', 'bu', 'sc',
       'rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')


In [20]:
dataset1.shape

(400, 25)

In [21]:
#df_imputed.columns = dataset.columns
df_imputed.shape

(400, 25)

In [22]:
df_imputed.isnull().sum()

age               0
bp                0
sg                0
al                0
su                0
sod               0
pot               0
hemo              0
bgr               0
bu                0
sc                0
rbc               0
pc                0
pcc               0
ba                0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

# **Finding the unique values in the columns**

In [23]:

for i in df_imputed.columns:
  print("***********************************************************************", i,
        "************************************************************")
  print()
  print(set(df_imputed[i].tolist()))
  print()

*********************************************************************** age ************************************************************

{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 14.0, 15.0, 17.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 51.48337595907928, 54.0, 55.0, 56.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 57.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 90.0, 66.0}

*********************************************************************** bp ************************************************************

{100.0, 70.0, 76.46907216494846, 140.0, 110.0, 80.0, 50.0, 180.0, 120.0, 90.0, 60.0}

*********************************************************************** sg ************************************************************

{1.0174079320113314, 1.02, 1.005

# **Steps to Clean and Standardize Categorical Features and Numerical Features**

# **bp,hemo,pcv,wc,rc,dm,cad,classification**


In [24]:
df_imputed['bp'] = np.where(df_imputed['bp']==0.00, df_imputed['bp'].median(), df_imputed['bp'])

In [25]:
df_imputed['hemo'] = np.where(df_imputed['hemo']==0.00, df_imputed['hemo'].median(), df_imputed['hemo'])

In [26]:
df_imputed['pcv'].value_counts()

pcv
41      91
52      21
48      19
44      19
40      16
43      14
45      13
42      13
28      12
36      12
33      12
32      12
50      12
37      11
34      11
30       9
35       9
29       9
46       9
31       8
39       7
24       7
26       6
38       5
47       4
54       4
53       4
49       4
51       4
22       3
25       3
27       3
23       2
19       2
16       1
14       1
17       1
18       1
\t?      1
15       1
20       1
21       1
9        1
\t43     1
Name: count, dtype: int64

In [27]:
# NOTE: There are 2 Inconsistent values in PCV feature
df_imputed['pcv'].mode()

0    41
Name: pcv, dtype: object

In [28]:
df_imputed['pcv'] = df_imputed['pcv'].apply(lambda x:'43' if x=='\t43' else x)

In [29]:
df_imputed['pcv'] = df_imputed['pcv'].apply(lambda x:'41' if x=='\t?' else x)

In [30]:
df_imputed['wc'].value_counts()

wc
9800     116
6700      10
7200       9
9600       9
9200       9
        ... 
4900       1
12000      1
15700      1
4100       1
11500      1
Name: count, Length: 92, dtype: int64

In [31]:
df_imputed['wc'] = df_imputed['wc'].apply(lambda x:'9800' if x=='\t?' else x)

In [32]:
df_imputed['rc'].value_counts()

rc
5.2    148
4.5     16
4.9     14
4.7     11
3.9     10
4.8     10
3.4      9
4.6      9
3.7      8
5.0      8
6.1      8
5.9      8
5.5      8
5.8      7
3.8      7
5.4      7
5.3      7
5.6      6
4.3      6
4.2      6
6.5      5
6.2      5
5.7      5
3.2      5
4.1      5
4.4      5
5.1      5
6.4      5
6.3      4
6.0      4
3.6      4
4.0      3
4        3
3.5      3
3.3      3
5        2
2.6      2
2.8      2
2.9      2
3.1      2
2.5      2
2.1      2
3.0      2
2.7      2
2.3      1
8.0      1
2.4      1
\t?      1
3        1
Name: count, dtype: int64

In [33]:
df_imputed['rc'] = df_imputed['rc'].apply(lambda x:'5.2' if x=='\t?' else x)

In [34]:
df_imputed['dm'].value_counts()

dm
no       260
yes      134
\tno       3
\tyes      2
 yes       1
Name: count, dtype: int64

In [35]:
df_imputed['dm'] = df_imputed['dm'].apply(lambda x:'no' if x=='\tno' else x)
df_imputed['dm'] = df_imputed['dm'].apply(lambda x:'yes' if x=='\tyes' else x)
df_imputed['dm'] = df_imputed['dm'].apply(lambda x:'yes' if x==' yes' else x)

In [36]:
df_imputed['cad'].value_counts()

cad
no      364
yes      34
\tno      2
Name: count, dtype: int64

In [37]:
df_imputed['cad'] = df_imputed['cad'].apply(lambda x:'no' if x=='\tno' else x)

In [38]:
df_imputed['classification'].value_counts()

classification
ckd       248
notckd    150
ckd\t       2
Name: count, dtype: int64

In [39]:
df_imputed['classification'] = df_imputed['classification'].apply(lambda x:'ckd' if x=='ckd\t' else x)

In [40]:
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             400 non-null    object
 1   bp              400 non-null    object
 2   sg              400 non-null    object
 3   al              400 non-null    object
 4   su              400 non-null    object
 5   sod             400 non-null    object
 6   pot             400 non-null    object
 7   hemo            400 non-null    object
 8   bgr             400 non-null    object
 9   bu              400 non-null    object
 10  sc              400 non-null    object
 11  rbc             400 non-null    object
 12  pc              400 non-null    object
 13  pcc             400 non-null    object
 14  ba              400 non-null    object
 15  pcv             400 non-null    object
 16  wc              400 non-null    object
 17  rc              400 non-null    object
 18  htn       

In [41]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

# **As we can see that 'PCV', 'WC' and 'RC' are object type. We need to change them to numerical dtype.**

In [42]:
for i in dataset1.select_dtypes(exclude=['object']).columns:
  df_imputed[i] = df_imputed[i].apply(lambda x:float(x))

In [43]:
df_imputed['pcv'] = pd.to_numeric(df_imputed['pcv'])
df_imputed['wc'] = pd.to_numeric(df_imputed['wc'])
df_imputed['rc'] = pd.to_numeric(df_imputed['rc'])

In [44]:
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   sod             400 non-null    float64
 6   pot             400 non-null    float64
 7   hemo            400 non-null    float64
 8   bgr             400 non-null    float64
 9   bu              400 non-null    float64
 10  sc              400 non-null    float64
 11  rbc             400 non-null    object 
 12  pc              400 non-null    object 
 13  pcc             400 non-null    object 
 14  ba              400 non-null    object 
 15  pcv             400 non-null    int64  
 16  wc              400 non-null    int64  
 17  rc              400 non-null    flo

In [45]:
df_imputed.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'sod', 'pot', 'hemo', 'bgr', 'bu', 'sc',
       'rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [46]:
# Extracting categorical and numerical columns

cat_cols = [col for col in df_imputed.columns if df_imputed[col].dtype == 'object']
num_cols = [col for col in df_imputed.columns if df_imputed[col].dtype != 'object']

# **All the missing values are handeled now, lets do ctaegorical features encding now**

# **Feature Encoding**

In [47]:
for col in cat_cols:
    print(f"{col} has {df_imputed[col].nunique()} categories \n")

rbc has 2 categories 

pc has 2 categories 

pcc has 2 categories 

ba has 2 categories 

htn has 2 categories 

dm has 2 categories 

cad has 2 categories 

appet has 2 categories 

pe has 2 categories 

ane has 2 categories 

classification has 2 categories 



# **As all of the categorical columns have 2 categories we can use label encoder**

In [48]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cat_cols:
    df_imputed[col] = le.fit_transform(df_imputed[col])

In [49]:
df_imputed.head()

Unnamed: 0,age,bp,sg,al,su,sod,pot,hemo,bgr,bu,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,137.528754,4.627244,15.4,121.0,36.0,...,44,7800,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,137.528754,4.627244,11.3,121.0,18.0,...,38,6000,5.2,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,137.528754,4.627244,9.6,423.0,53.0,...,31,7500,5.2,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,111.0,2.5,11.2,117.0,56.0,...,32,6700,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,137.528754,4.627244,11.6,106.0,26.0,...,35,7300,4.6,0,0,0,0,0,0,0


# **Check if there is any Imbalence data in our Target variable**

In [50]:
df_imputed['classification'].value_counts()

classification
0    250
1    150
Name: count, dtype: int64

In [51]:
# Feature Scaling
x = df_imputed.iloc[:,:-1]
y = df_imputed.iloc[:,-1]

In [52]:
scaler = MinMaxScaler((-1,1))
x = scaler.fit_transform(x)
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.045455,-0.538462,0.5,-0.6,-1.0,0.678596,-0.904394,0.673469,-0.576923,-0.82285,...,-1.0,0.555556,-0.53719,0.050847,1.0,1.0,-1.0,-1.0,-1.0,-1.0
1,-0.886364,-1.0,0.5,0.6,-1.0,0.678596,-0.904394,0.115646,-0.576923,-0.915276,...,-1.0,0.288889,-0.68595,0.050847,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0.363636,-0.538462,-0.5,-0.2,0.2,0.678596,-0.904394,-0.115646,0.713675,-0.735558,...,-1.0,-0.022222,-0.561983,0.050847,-1.0,1.0,-1.0,1.0,-1.0,1.0
3,0.045455,-0.692308,-1.0,0.6,-1.0,0.343849,-1.0,0.102041,-0.594017,-0.720154,...,-1.0,0.022222,-0.628099,-0.389831,1.0,-1.0,-1.0,1.0,1.0,1.0
4,0.113636,-0.538462,-0.5,-0.2,-1.0,0.678596,-0.904394,0.156463,-0.641026,-0.874198,...,-1.0,0.155556,-0.578512,-0.152542,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


#                **Model Building**

In [53]:
# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# **k-nearest neighbors (KNN) algorithm**

In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of knn

knn_acc = accuracy_score(y_test, knn.predict(x_test))

print(f"Training Accuracy of KNN is {accuracy_score(y_train, knn.predict(x_train))}")
print(f"Test Accuracy of KNN is {knn_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, knn.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, knn.predict(x_test))}")

Training Accuracy of KNN is 0.984375
Test Accuracy of KNN is 0.9875 

Confusion Matrix :- 
[[49  1]
 [ 0 30]]

Classification Report :- 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       0.97      1.00      0.98        30

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



# **Decision Tree Classifier**

In [55]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of decision tree

dtc_acc = accuracy_score(y_test, dtc.predict(x_test))

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc.predict(x_train))}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dtc.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc.predict(x_test))}")

Training Accuracy of Decision Tree Classifier is 1.0
Test Accuracy of Decision Tree Classifier is 0.9625 

Confusion Matrix :- 
[[50  0]
 [ 3 27]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        50
           1       1.00      0.90      0.95        30

    accuracy                           0.96        80
   macro avg       0.97      0.95      0.96        80
weighted avg       0.96      0.96      0.96        80



 # Hyper parameter tuning of decision tree 

# -----------------------------**IMPORTANT**-----------------------

# Deprecation of 'auto':

# Deprecation Warning: The warning indicates that the max_features='auto' option has been deprecated since scikit- learn version 1.1 and will be removed in version 1.3. The recommendation is to explicitly set 
# max_features='sqrt' to maintain the behavior similar to the old 'auto' setting.

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'splitter' : ['best', 'random'],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    'min_samples_split' : [2, 3, 5, 7],  # Removed 1 to avoid errors
    'max_features' : ['sqrt', 'log2']  # Removed 'auto' to avoid deprecation warning
}

# Initialize the DecisionTreeClassifier
dtc = DecisionTreeClassifier()

# Set up GridSearchCV with the corrected parameter grid
grid_search_dtc = GridSearchCV(dtc, grid_param, cv=5, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search_dtc.fit(x_train, y_train)

Fitting 5 folds for each of 640 candidates, totalling 3200 fits


# --------------------------------Next Steps--------------------------------
# **After running the grid search, you can find the best parameters and evaluate the performance of the resulting model:**

# **Best parameters and best score**

In [57]:
print(grid_search_dtc.best_params_)
print(grid_search_dtc.best_score_)

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'splitter': 'random'}
0.98125


#  **Best estimator**

In [58]:
dtc = grid_search_dtc.best_estimator_

# accuracy score, confusion matrix and classification report of decision tree

dtc_acc = accuracy_score(y_test, dtc.predict(x_test))

print(f"Training Accuracy of Decision Tree Classifier is {accuracy_score(y_train, dtc.predict(x_train))}")
print(f"Test Accuracy of Decision Tree Classifier is {dtc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dtc.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, dtc.predict(x_test))}")


Training Accuracy of Decision Tree Classifier is 0.984375
Test Accuracy of Decision Tree Classifier is 0.9625 

Confusion Matrix :- 
[[47  3]
 [ 0 30]]

Classification Report :- 
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        50
           1       0.91      1.00      0.95        30

    accuracy                           0.96        80
   macro avg       0.95      0.97      0.96        80
weighted avg       0.97      0.96      0.96        80



# -----------------------**Random Forest Classifier**-------------------

In [59]:
from sklearn.ensemble import RandomForestClassifier

rd_clf = RandomForestClassifier(criterion = 'entropy', max_depth = 11, max_features = 'auto', min_samples_leaf = 2, min_samples_split = 3, n_estimators = 130)
rd_clf.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of random forest

rd_clf_acc = accuracy_score(y_test, rd_clf.predict(x_test))

print(f"Training Accuracy of Random Forest Classifier is {accuracy_score(y_train, rd_clf.predict(x_train))}")
print(f"Test Accuracy of Random Forest Classifier is {rd_clf_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, rd_clf.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, rd_clf.predict(x_test))}")

Training Accuracy of Random Forest Classifier is 1.0
Test Accuracy of Random Forest Classifier is 0.9625 

Confusion Matrix :- 
[[50  0]
 [ 3 27]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        50
           1       1.00      0.90      0.95        30

    accuracy                           0.96        80
   macro avg       0.97      0.95      0.96        80
weighted avg       0.96      0.96      0.96        80



# --------------------------**Ada Boost Classifier**--------------------

In [60]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(base_estimator = dtc)
ada.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of ada boost

ada_acc = accuracy_score(y_test, ada.predict(x_test))

print(f"Training Accuracy of Ada Boost Classifier is {accuracy_score(y_train, ada.predict(x_train))}")
print(f"Test Accuracy of Ada Boost Classifier is {ada_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, ada.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, ada.predict(x_test))}")


Training Accuracy of Ada Boost Classifier is 1.0
Test Accuracy of Ada Boost Classifier is 1.0 

Confusion Matrix :- 
[[50  0]
 [ 0 30]]

Classification Report :- 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        30

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# -------------------------**Gradient Boosting Classifier**-------------------

In [61]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of gradient boosting classifier

gb_acc = accuracy_score(y_test, gb.predict(x_test))

print(f"Training Accuracy of Gradient Boosting Classifier is {accuracy_score(y_train, gb.predict(x_train))}")
print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb.predict(x_test))}")

Training Accuracy of Gradient Boosting Classifier is 1.0
Test Accuracy of Gradient Boosting Classifier is 0.975 

Confusion Matrix :- 
[[50  0]
 [ 2 28]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        50
           1       1.00      0.93      0.97        30

    accuracy                           0.97        80
   macro avg       0.98      0.97      0.97        80
weighted avg       0.98      0.97      0.97        80



# -----------------------**Stochastic Gradient Boosting (SGB)**------------

In [62]:
sgb = GradientBoostingClassifier(max_depth = 4, subsample = 0.90, max_features = 0.75, n_estimators = 200)
sgb.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of stochastic gradient boosting classifier

sgb_acc = accuracy_score(y_test, sgb.predict(x_test))

print(f"Training Accuracy of Stochastic Gradient Boosting is {accuracy_score(y_train, sgb.predict(x_train))}")
print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb.predict(x_test))}")

Training Accuracy of Stochastic Gradient Boosting is 1.0
Test Accuracy of Stochastic Gradient Boosting is 0.975 

Confusion Matrix :- 
[[50  0]
 [ 2 28]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        50
           1       1.00      0.93      0.97        30

    accuracy                           0.97        80
   macro avg       0.98      0.97      0.97        80
weighted avg       0.98      0.97      0.97        80



# -------------------------------**XgBoost**-----------------------

In [63]:
from xgboost import XGBClassifier

xgb = XGBClassifier(objective = 'binary:logistic', learning_rate = 0.5, max_depth = 5, n_estimators = 150)
xgb.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of xgboost

xgb_acc = accuracy_score(y_test, xgb.predict(x_test))

print(f"Training Accuracy of XgBoost is {accuracy_score(y_train, xgb.predict(x_train))}")
print(f"Test Accuracy of XgBoost is {xgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, xgb.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, xgb.predict(x_test))}")

Training Accuracy of XgBoost is 1.0
Test Accuracy of XgBoost is 0.9375 

Confusion Matrix :- 
[[48  2]
 [ 3 27]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.94      0.96      0.95        50
           1       0.93      0.90      0.92        30

    accuracy                           0.94        80
   macro avg       0.94      0.93      0.93        80
weighted avg       0.94      0.94      0.94        80



# --------------------------**Cat Boost Classifier**-----------------------

In [64]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(iterations=10)
cat.fit(x_train, y_train)

Learning rate set to 0.432149
0:	learn: 0.3217869	total: 54.1ms	remaining: 487ms
1:	learn: 0.1452301	total: 55.7ms	remaining: 223ms
2:	learn: 0.1059415	total: 56.4ms	remaining: 132ms
3:	learn: 0.0520325	total: 57.9ms	remaining: 86.8ms
4:	learn: 0.0405046	total: 59.2ms	remaining: 59.2ms
5:	learn: 0.0299071	total: 60.5ms	remaining: 40.4ms
6:	learn: 0.0208935	total: 61.9ms	remaining: 26.5ms
7:	learn: 0.0162274	total: 63.1ms	remaining: 15.8ms
8:	learn: 0.0135028	total: 64.7ms	remaining: 7.19ms
9:	learn: 0.0119991	total: 66.3ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7df6f9f3eef0>

In [65]:
# accuracy score, confusion matrix and classification report of cat boost

cat_acc = accuracy_score(y_test, cat.predict(x_test))

print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(x_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat.predict(x_test))}")

Training Accuracy of Cat Boost Classifier is 1.0
Test Accuracy of Cat Boost Classifier is 0.9625 

Confusion Matrix :- 
[[50  0]
 [ 3 27]]

Classification Report :- 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        50
           1       1.00      0.90      0.95        30

    accuracy                           0.96        80
   macro avg       0.97      0.95      0.96        80
weighted avg       0.96      0.96      0.96        80



# ---------------------------**Extra Trees Classifier**--------------------

In [66]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier()
etc.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of extra trees classifier

etc_acc = accuracy_score(y_test, etc.predict(x_test))

print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(x_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc.predict(x_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc.predict(x_test))}")

Training Accuracy of Extra Trees Classifier is 1.0
Test Accuracy of Extra Trees Classifier is 1.0 

Confusion Matrix :- 
[[50  0]
 [ 0 30]]

Classification Report :- 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        30

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ----------------------------**LGBM Classifier**-----------------------

In [67]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(boosting_type='gbdt')
lgbm.fit(x_train, y_train)

# accuracy score, confusion matrix and classification report of lgbm classifier

lgbm_acc = accuracy_score(y_test, lgbm.predict(x_test))

print(f"Training Accuracy of LGBM Classifier is {accuracy_score(y_train, lgbm.predict(x_train))}")
print(f"Test Accuracy of LGBM Classifier is {lgbm_acc} \n")

print(f"{confusion_matrix(y_test, lgbm.predict(x_test))}\n")
print(classification_report(y_test, lgbm.predict(x_test)))

[LightGBM] [Info] Number of positive: 120, number of negative: 200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 540
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.375000 -> initscore=-0.510826
[LightGBM] [Info] Start training from score -0.510826
Training Accuracy of LGBM Classifier is 1.0
Test Accuracy of LGBM Classifier is 1.0 

[[50  0]
 [ 0 30]]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        30

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# -----------------------Support Vector Machine ( SVM )--------------------

In [68]:
from sklearn.svm import SVC

In [69]:
# SVM -
# kernel = linear
svm_linear = SVC(kernel='linear')
linear = svm_linear.fit(x_train, y_train)
y_pred_linear = svm_linear.predict(x_test)
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# kernel = sigmoid
svm_sigmoid = SVC(kernel='sigmoid')
sigmoid = svm_sigmoid.fit(x_train, y_train)
y_pred_sigmoid = svm_sigmoid.predict(x_test)
accuracy_sigmoid = accuracy_score(y_test, y_pred_sigmoid)

# kernel = poly
svm_poly = SVC(kernel='poly')
poly = svm_poly.fit(x_train, y_train)
y_pred_poly = svm_poly.predict(x_test)
accuracy_poly = accuracy_score(y_test, y_pred_poly)

# kernel = rbf
svm_rbf = SVC(kernel='rbf')
rbf = svm_rbf.fit(x_train, y_train)
y_pred_rbf = svm_rbf.predict(x_test)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)

# **Always Prefer 'RBF' kernal for SVM Algorithem**

In [70]:
# Create an SVM classifier with an RBF kernel
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(x_train, y_train)

# Evaluate the model
svm_rbf_acc = accuracy_score(y_test, svm_rbf.predict(x_test))
print(f"Training Accuracy of SVM Classifier: {accuracy_score(y_train, svm_rbf.predict(x_train))}")
print(f"Test Accuracy of SVM Classifier: {svm_rbf_acc}\n")

print(f"Confusion Matrix:\n{confusion_matrix(y_test, svm_rbf.predict(x_test))}\n")
print(f"Classification Report:\n{classification_report(y_test, svm_rbf.predict(x_test))}")

Training Accuracy of SVM Classifier: 0.9875
Test Accuracy of SVM Classifier: 1.0

Confusion Matrix:
[[50  0]
 [ 0 30]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        30

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



# ------------------------**Models Comparison**-----------------------

In [71]:
models = pd.DataFrame({
    'Model' : [ 'KNN', 'Decision Tree Classifier', 'Random Forest Classifier','Ada Boost Classifier',
             'Gradient Boosting Classifier', 'Stochastic Gradient Boosting', 'XgBoost', 'Cat Boost', 'Extra Trees Classifier', 'SVC'],
    'Score' : [knn_acc, dtc_acc, rd_clf_acc, ada_acc, gb_acc, sgb_acc, xgb_acc, cat_acc, etc_acc, svm_rbf_acc]
})


models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
9,SVC,1.0
3,Ada Boost Classifier,1.0
8,Extra Trees Classifier,1.0
0,KNN,0.9875
5,Stochastic Gradient Boosting,0.975
4,Gradient Boosting Classifier,0.975
1,Decision Tree Classifier,0.9625
2,Random Forest Classifier,0.9625
7,Cat Boost,0.9625
6,XgBoost,0.9375


In [72]:
import plotly.express as px
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', 
       title = 'Models Comparison')

# --------- If you like my work, don't forget to leave an upvote!!------