## Importing Libraries

In [None]:
import pandas as pd
import numpy as np 
from collections import Counter as c 
import matplotlib.pyplot as plt
import seaborn as sns 
import missingno as msno 
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 
from sklearn.linear_model import LogisticRegression
import pickle 

## Reading the dataset

In [None]:
# Load the chronic kidney disease CSV into a DataFrame
dataset_path = r"/content/sample_data/chronickidneydisease.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows to check that the file was parsed as expected
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [None]:
# Preview the last rows as well, to see the other end of the table
data.tail()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
395,395,55.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd
399,399,58.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,53,6800,6.1,no,no,no,good,no,no,notckd


In [None]:
# "id" appears to be just a running row number (it mirrors the index in the
# previews above), so it is removed in place before any analysis.
data.drop(columns=["id"], inplace=True)

In [None]:
# List the current (abbreviated) column labels
data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

# Renaming the columns

In [None]:
# Replace the abbreviated headers with descriptive names.
# The list is positional: entry k renames the k-th existing column
# (the original abbreviation is noted next to each new name).
descriptive_names = [
    'age',                      # age
    'blood_pressure',           # bp
    'specific_gravity',         # sg
    'albumin',                  # al
    'sugar',                    # su
    'red_blood_cells',          # rbc
    'pus_cells',                # pc
    'pus_cell_clumps',          # pcc
    'bacteria',                 # ba
    'blood glucose random',     # bgr
    'blood_urea',               # bu
    'serum_creatinine',         # sc
    'sodium',                   # sod
    'potassium',                # pot
    'haemoglobin',              # hemo
    'packed_cell_volume',       # pcv
    'white_blood_cell_count',   # wc
    'red_blood_cell_count',     # rc
    'hypertension',             # htn
    'diabetesmellitus',         # dm
    'coronary_artery_disease',  # cad
    'appetite',                 # appet
    'pedal_edema',              # pe
    'anemia',                   # ane
    'class',                    # classification
]
data.columns = descriptive_names
data.columns

Index(['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
       'red_blood_cells', 'pus_cells', 'pus_cell_clumps', 'bacteria',
       'blood glucose random', 'blood_urea', 'serum_creatinine', 'sodium',
       'potassium', 'haemoglobin', 'packed_cell_volume',
       'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
       'diabetesmellitus', 'coronary_artery_disease', 'appetite',
       'pedal_edema', 'anemia', 'class'],
      dtype='object')

# Understanding datatype

In [None]:
# Summarise every column: non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cells                335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood glucose random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  haemoglobin              3

## Target column

In [None]:
# Inspect the distinct labels present in the target column
data['class'].unique()

array(['ckd', 'ckd\t', 'notckd'], dtype=object)

## Rectifying target column

In [None]:
# One label carries a stray trailing tab; map it onto the clean 'ckd' label
label_fixes = {"ckd\t": "ckd"}
data['class'] = data['class'].replace(label_fixes)
data['class'].unique()

array(['ckd', 'notckd'], dtype=object)

In [None]:
# Collect the name of every column stored with object ('O') dtype
categorical_columns = {name for name, dtype in data.dtypes.items() if dtype == 'O'}
print(categorical_columns)

{'white_blood_cell_count', 'red_blood_cell_count', 'pedal_edema', 'packed_cell_volume', 'pus_cell_clumps', 'pus_cells', 'diabetesmellitus', 'appetite', 'anemia', 'coronary_artery_disease', 'bacteria', 'red_blood_cells', 'class', 'hypertension'}


In [None]:
# Print the value frequencies of every candidate categorical column so that
# columns which are really numeric (many distinct values) can be spotted.
# Counter has to be *called* on the column: c(data[i]). Subscripting it
# (c[data[i]]) raises a TypeError before anything is counted.
for i in categorical_columns:
    print("Columns : ",i)
    print(c(data[i]))
    print("*"*120+'\n')

Columns :  white_blood_cell_count


TypeError: ignored

## Removing columns which are not categorical

In [None]:
# These three columns were read as object dtype but are treated as numeric
# measurements in this notebook, so take them out of the categorical set.
for numeric_like in ('red_blood_cell_count', 'packed_cell_volume', 'white_blood_cell_count'):
    categorical_columns.remove(numeric_like)
print(categorical_columns)

{'pedal_edema', 'pus_cell_clumps', 'pus_cells', 'diabetesmellitus', 'appetite', 'anemia', 'coronary_artery_disease', 'bacteria', 'red_blood_cells', 'class', 'hypertension'}


## Numerical columns

In [None]:
# Collect the name of every column whose dtype is not object ('O')
continuous_columns = {name for name, dtype in data.dtypes.items() if dtype != 'O'}
print(continuous_columns)

{'specific_gravity', 'age', 'haemoglobin', 'blood_urea', 'serum_creatinine', 'sodium', 'blood glucose random', 'sugar', 'potassium', 'blood_pressure', 'albumin'}


In [None]:
# Print the value frequencies of every candidate numeric column so that
# columns with only a handful of distinct levels can be spotted.
# Counter has to be *called* on the column: c(data[i]). Subscripting it
# (c[data[i]]) raises a TypeError before anything is counted.
for i in continuous_columns:
    print("Columns : ",i)
    print(c(data[i]))
    print("*"*120+'\n')

Columns :  specific_gravity


TypeError: ignored

## Removing columns which are not numerical

In [None]:
# These columns have a numeric dtype but are handled as categorical levels
# in this notebook, so take them out of the continuous set.
for discrete_like in ('specific_gravity', 'albumin', 'sugar'):
    continuous_columns.remove(discrete_like)
print(continuous_columns)


{'age', 'haemoglobin', 'blood_urea', 'serum_creatinine', 'sodium', 'blood glucose random', 'potassium', 'blood_pressure'}


## Adding columns which we found continuous

In [None]:
# Register the three columns that are to be treated as continuous
continuous_columns.update(
    ('red_blood_cell_count', 'packed_cell_volume', 'white_blood_cell_count')
)
print(continuous_columns)


{'white_blood_cell_count', 'red_blood_cell_count', 'age', 'packed_cell_volume', 'haemoglobin', 'blood_urea', 'serum_creatinine', 'sodium', 'blood glucose random', 'potassium', 'blood_pressure'}


## Adding columns which we found categorical

In [None]:
# Register the three columns that are to be treated as categorical
categorical_columns.update(('specific_gravity', 'albumin', 'sugar'))
print(categorical_columns)

{'specific_gravity', 'pedal_edema', 'pus_cell_clumps', 'pus_cells', 'diabetesmellitus', 'appetite', 'anemia', 'coronary_artery_disease', 'bacteria', 'red_blood_cells', 'class', 'sugar', 'hypertension', 'albumin'}


## Rectifying the categorical column classes

In [None]:
# Normalise the tab-prefixed 'no' entry, then show the resulting frequencies
cad_cleaned = data['coronary_artery_disease'].replace({'\tno': 'no'})
data['coronary_artery_disease'] = cad_cleaned
c(data['coronary_artery_disease'])

Counter({'no': 364, 'yes': 34, nan: 2})

In [None]:
# Map the whitespace-polluted variants onto the clean 'yes' / 'no' labels,
# then show the resulting frequencies
diabetes_fixes = {'\tyes': 'yes', '\tno': 'no', ' yes': 'yes'}
data['diabetesmellitus'] = data['diabetesmellitus'].replace(diabetes_fixes)
c(data['diabetesmellitus'])

Counter({'yes': 137, 'no': 261, nan: 2})

## Null values

In [None]:
data.isnull().any()          # True for each column that contains at least one missing value

age                         True
blood_pressure              True
specific_gravity            True
albumin                     True
sugar                       True
red_blood_cells             True
pus_cells                   True
pus_cell_clumps             True
bacteria                    True
blood glucose random        True
blood_urea                  True
serum_creatinine            True
sodium                      True
potassium                   True
haemoglobin                 True
packed_cell_volume          True
white_blood_cell_count      True
red_blood_cell_count        True
hypertension                True
diabetesmellitus            True
coronary_artery_disease     True
appetite                    True
pedal_edema                 True
anemia                      True
class                      False
dtype: bool

In [None]:
data.isnull().sum()  # number of missing values in each column

age                          9
blood_pressure              12
specific_gravity            47
albumin                     46
sugar                       49
red_blood_cells            152
pus_cells                   65
pus_cell_clumps              4
bacteria                     4
blood glucose random        44
blood_urea                  19
serum_creatinine            17
sodium                      87
potassium                   88
haemoglobin                 52
packed_cell_volume          70
white_blood_cell_count     105
red_blood_cell_count       130
hypertension                 2
diabetesmellitus             2
coronary_artery_disease      2
appetite                     1
pedal_edema                  1
anemia                       1
class                        0
dtype: int64

In [None]:
# Convert the object-typed count columns to numbers; any entry that cannot be
# parsed becomes NaN (errors='coerce') and is imputed later.
for count_column in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    data[count_column] = pd.to_numeric(data[count_column], errors='coerce')


##  Handling continuous/numeric columns null values

In [None]:
# Fill missing values in the continuous columns with the column mean.
#
# The result is assigned back to the frame instead of calling
# data[col].fillna(..., inplace=True): the in-place form operates on an
# intermediate Series (chained assignment), which pandas has deprecated and
# which does not update `data` when Copy-on-Write is enabled.
mean_imputed_columns = [
    'blood glucose random',
    'blood_pressure',
    'blood_urea',
    'haemoglobin',
    'packed_cell_volume',
    'potassium',
    'red_blood_cell_count',
    'serum_creatinine',
    'sodium',
    'white_blood_cell_count',
]
for column in mean_imputed_columns:
    data[column] = data[column].fillna(data[column].mean())


In [None]:
# Fill missing values in the remaining columns with the most frequent value.
# mode() returns a Series (there can be ties); [0] takes the first entry.
#
# The result is assigned back to the frame instead of calling
# data[col].fillna(..., inplace=True): the in-place form operates on an
# intermediate Series (chained assignment), which pandas has deprecated and
# which does not update `data` when Copy-on-Write is enabled.
mode_imputed_columns = [
    'age',
    'hypertension',
    'pus_cell_clumps',
    'appetite',
    'albumin',
    'pus_cells',
    'red_blood_cells',
    'coronary_artery_disease',
    'bacteria',
    'anemia',
    'sugar',
    'diabetesmellitus',
    'pedal_edema',
    'specific_gravity',
]
for column in mode_imputed_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode every categorical column (including the target 'class') as integer
# codes, printing the value frequencies before and after for a visual check.
#
# Each fitted encoder is kept in `label_encoders`, keyed by column name, so the
# codes can later be decoded (inverse_transform) or applied to new inputs
# (transform). Previously the encoder was overwritten on every iteration and
# the mapping was lost.
# Iterating in sorted order makes the printed log order reproducible; a bare
# set has no guaranteed iteration order.
label_encoders = {}
for i in sorted(categorical_columns):
    print("LABEL ENCODING OF:",i)
    Le=LabelEncoder()
    print(c(data[i]))
    data[i]=Le.fit_transform(data[i])
    label_encoders[i]=Le
    print(c(data[i]))
    print("*"*100)

LABEL ENCODING OF: specific_gravity
Counter({1.02: 153, 1.01: 84, 1.025: 81, 1.015: 75, 1.005: 7})
Counter({3: 153, 1: 84, 4: 81, 2: 75, 0: 7})
****************************************************************************************************
LABEL ENCODING OF: pedal_edema
Counter({'no': 324, 'yes': 76})
Counter({0: 324, 1: 76})
****************************************************************************************************
LABEL ENCODING OF: pus_cell_clumps
Counter({'notpresent': 358, 'present': 42})
Counter({0: 358, 1: 42})
****************************************************************************************************
LABEL ENCODING OF: pus_cells
Counter({'normal': 324, 'abnormal': 76})
Counter({1: 324, 0: 76})
****************************************************************************************************
LABEL ENCODING OF: diabetesmellitus
Counter({'no': 263, 'yes': 137})
Counter({0: 263, 1: 137})
**********************************************************************

## Creating dependent and independent variables

In [None]:
# Feature columns fed to the model; the order here is the column order of x
selected_columns = [
    'red_blood_cells',
    'pus_cells',
    'blood glucose random',
    'blood_urea',
    'pedal_edema',
    'anemia',
    'diabetesmellitus',
    'coronary_artery_disease',
]
x = data.loc[:, selected_columns]   # independent variables
y = data.loc[:, ['class']]          # dependent variable, kept as a one-column frame
for frame in (x, y):
    print(frame.shape)

(400, 8)
(400, 1)


## Splitting the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
split_parts = train_test_split(x, y, test_size=0.2, random_state=2)
x_train, x_test, y_train, y_test = split_parts
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

(320, 8)
(320, 1)
(80, 8)
(80, 1)


## Build a machine learning model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
#
# * max_iter is raised above the default because the solver stopped at its
#   iteration limit on these unscaled features (ConvergenceWarning). If the
#   warning persists, raise it further or scale the features.
# * y_train is a one-column frame; np.ravel flattens it to the 1-D shape the
#   estimator expects, which avoids the DataConversionWarning.
lgr=LogisticRegression(max_iter=1000)
lgr.fit(x_train,np.ravel(y_train))

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
# Predict the held-out rows, plus one hand-written sample.
y_pred=lgr.predict(x_test)

# The model was fitted on a DataFrame, so the ad-hoc sample is wrapped in a
# frame with the same column labels; a bare list triggers the
# "X does not have valid feature names" warning.
# NOTE(review): values are assigned positionally to x_test's columns. The
# leading 129 and 99 land on the first two columns, which look like 0/1-encoded
# flags, and look more like the glucose / urea readings - confirm the intended
# order of this sample.
manual_sample=pd.DataFrame([[129,99,1,0,0,1,0,1]],columns=x_test.columns)
y_pred1=lgr.predict(manual_sample)

print(y_pred)
c(y_pred)

[0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1
 0 0 1 0 1 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1
 0 1 0 1 1 0]


  "X does not have valid feature names, but"


Counter({0: 48, 1: 32})

In [None]:
# Share of test rows whose predicted label equals the true label
accuracy_score(y_test,y_pred)

0.925

## Confusion matrix

In [None]:
# Cross-tabulate true labels (rows) against predicted labels (columns)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion_mat

array([[48,  6],
       [ 0, 26]])

## Dumping our model into pickle form

In [None]:
# Serialise the fitted model to CKD.pkl in the current working directory.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) to pickle.dump leaves the handle open.
with open('CKD.pkl','wb') as model_file:
    pickle.dump(lgr,model_file)

In [None]:
# Reload the model from the same relative path it was dumped to ('CKD.pkl'),
# so the round trip works regardless of the working directory; the previous
# absolute '/content/...' path only matched when the notebook ran from /content.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code embedded in an untrusted file.
with open('CKD.pkl','rb') as f:
  mp=pickle.load(f)

In [None]:
# Sanity-check the reloaded model on one hand-written sample.
# The sample is wrapped in a DataFrame labelled with the feature names the
# model recorded at fit time; a bare list triggers the
# "X does not have valid feature names" warning.
# NOTE(review): values are assigned positionally to those features - confirm
# that 129 and 99 are meant for the first two features.
loaded_model_sample=pd.DataFrame([[129,99,1,1,1,1,0,1]],columns=mp.feature_names_in_)
mp.predict(loaded_model_sample)

  "X does not have valid feature names, but"


array([1])