In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
df.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [4]:
df = df.drop(columns = 'id', axis = 1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [6]:
df.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [7]:
df.isnull().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [8]:
NumericalColumns = ['age','bp','al','su','bgr','bu','sc','sod','pot','hemo']
CategoricalColumns = ['rbc','sg','pc','pcc','ba','pcv','wc','rc','htn','dm','cad','appet','pe','ane']

In [9]:
df['classification'].replace("ckd\t","ckd",inplace=True)

df['dm'].replace(["\tno","\tyes"," yes"],["no","yes","yes"],inplace=True)

df['cad'].replace(["\tno"],["no"],inplace=True)

In [10]:
df['rc'].replace("\t?",np.nan, inplace=True)

df.wc.replace("\t?",np.nan, inplace=True)

df['pcv'].replace(["\t?","\t43"],np.nan, inplace=True)

In [11]:
for columnName in CategoricalColumns:
  df[columnName].fillna(df[columnName].mode()[0], inplace=True)

In [12]:
for columnName in NumericalColumns:
  df[columnName].fillna(df[columnName].mean(), inplace=True)

In [13]:
df.isnull().sum()

age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [14]:
encodeColumn = ['rbc','pc' ,'pcc' ,'ba' ,'htn' ,'dm' ,'cad' ,'appet' ,'pe' ,'ane']
df = pd.get_dummies(df , columns=encodeColumn , prefix=encodeColumn , drop_first=True)

In [15]:
df['classification'].replace(["ckd","notckd"],[1,0], inplace=True)

In [16]:
df.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_poor,pe_yes,ane_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,4.627244,...,1,1,0,0,1,1,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,4.627244,...,1,1,0,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,4.627244,...,1,1,0,0,0,1,0,1,0,1
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,...,1,0,1,0,1,0,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,4.627244,...,1,1,0,0,0,0,0,0,0,0


In [17]:
df.to_csv("updated_df.csv")

In [18]:
df.classification.value_counts()

1    250
0    150
Name: classification, dtype: int64

In [19]:
X = df.loc[:, df.columns != 'classification']
y = df['classification']

In [20]:
min_max = MinMaxScaler()
min_max.fit(X)
X = min_max.transform(X)

## Train - Test split ##

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(400, 24) (320, 24) (80, 24)


In [23]:
model4 = LogisticRegression()

In [24]:
model4.fit(X_train, y_train)

LogisticRegression()

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
X_train_prediction = model4.predict(X_train)

In [27]:
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [28]:
print("Accuracy of the training data is : ", training_data_accuracy)

Accuracy of the training data is :  0.984375


## Now the accuracy prediction of test data. ##

In [29]:
X_test_predicted = model4.predict(X_test)
training_data_accuracy = accuracy_score(X_test_predicted, y_test)

In [30]:
print("Accuracy of the test data is : ", training_data_accuracy)

Accuracy of the test data is :  0.9875


In [31]:
import pickle

In [32]:
filename = 'chronic_kidney_disease_model.sav'
pickle.dump(model4, open(filename, 'wb'))

In [33]:
loaded_model = pickle.load(open('chronic_kidney_disease_model.sav', 'rb'))

## Now we will make a predictive system. ##

In [34]:
input = (68.0,70.0,1.01,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36,9800,5.2,1,1,0,0,0,0,0,0,0,0)

In [35]:
input_numpy = np.asarray(input)

In [36]:
input_data = input_numpy.reshape(1,-1)

In [38]:
mix_data = min_max.transform(input_data)



In [39]:
print(mix_data)

[[0.75       0.15384615 0.25       0.         0.         0.16666667
  0.13478819 0.31216931 0.62776025 0.03370787 0.63265306 0.6
  0.31404959 0.52542373 1.         1.         0.         0.
  0.         0.         0.         0.         0.         0.        ]]


In [40]:
prediction = model4.predict(mix_data)

In [41]:
print(prediction)

[1]


In [42]:
if (prediction[0] == 0):
    print("The person consists of chronic kidney")
else:
    print("The person do not consists of chronic kidney")

The person do not consists of chronic kidney
