In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("kidney_disease_cleaned.csv")
df.drop("Unnamed: 0",axis = 1, inplace =True)

In [3]:
pd.set_option('display.max_columns',30)

In [5]:
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,classification
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,36.0,1.2,137.0,5.0,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,79.0,18.0,0.8,147.0,4.7,11.3,38.0,6000.0,3.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,139.0,4.1,9.6,31.0,7500.0,5.0,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,135.0,4.7,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [5]:
df['classification'].value_counts()/len(df)

ckd       0.619718
notckd    0.380282
Name: classification, dtype: float64

In [14]:
cat_col = [col for col in df.columns if df[col].dtype == 'object']
cat_col

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'pedal_edema',
 'anemia',
 'classification']

In [15]:
cat_feature = df[cat_col]
cat_feature

Unnamed: 0,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,classification
0,normal,normal,notpresent,notpresent,yes,yes,no,good,no,no,ckd
1,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
2,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes,ckd
3,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes,ckd
4,normal,normal,notpresent,notpresent,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...
350,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
351,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
352,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd
353,normal,normal,notpresent,notpresent,no,no,no,good,no,no,notckd


In [16]:
for i in cat_col:
    print(f'{i} {df[i].unique()}')

red_blood_cells ['normal' 'abnormal']
pus_cell ['normal' 'abnormal']
pus_cell_clumps ['notpresent' 'present']
bacteria ['notpresent' 'present']
hypertension ['yes' 'no']
diabetes_mellitus ['yes' 'no']
coronary_artery_disease ['no' 'yes']
appetite ['good' 'poor']
pedal_edema ['no' 'yes']
anemia ['no' 'yes']
classification ['ckd' 'notckd']


all has only two categories so we can apply label encoder

In [17]:
cat_col[0:-1]

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'pedal_edema',
 'anemia']

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [48]:
oe = OneHotEncoder(sparse = False)

In [58]:
pd.get_dummies(df[cat_col[0:-1]] , drop_first=True)

Unnamed: 0,red_blood_cells_normal,pus_cell_normal,pus_cell_clumps_present,bacteria_present,hypertension_yes,diabetes_mellitus_yes,coronary_artery_disease_yes,appetite_poor,pedal_edema_yes,anemia_yes
0,1,1,0,0,1,1,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,1,0,1,0,1
3,1,0,1,0,1,0,0,1,1,1
4,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
350,1,1,0,0,0,0,0,0,0,0
351,1,1,0,0,0,0,0,0,0,0
352,1,1,0,0,0,0,0,0,0,0
353,1,1,0,0,0,0,0,0,0,0


In [57]:
cat_col[0:-1]

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'pedal_edema',
 'anemia']

In [50]:
oe.fit_transform(df[cat_col])

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [19]:
from sklearn.compose import ColumnTransformer

In [29]:
step1 = ColumnTransformer(transformers = [('OneHotEncoder' , OneHotEncoder() , cat_col[0:-1]) ]
                                            , remainder='passthrough') 

In [36]:
step2 = ColumnTransformer(transformers = [('LabelEnoder' , LabelEncoder() , ["classification"] )]
                                            , remainder='passthrough')  

In [34]:
step1.fit_transform(x)

array([[0.0e+00, 1.0e+00, 0.0e+00, ..., 4.4e+01, 7.8e+03, 5.2e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 3.8e+01, 6.0e+03, 3.2e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 3.1e+01, 7.5e+03, 5.0e+00],
       ...,
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 4.9e+01, 6.6e+03, 5.4e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 5.1e+01, 7.2e+03, 5.9e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 5.3e+01, 6.8e+03, 6.1e+00]])

In [39]:
le = LabelEncoder()

In [40]:
le.fit_transform(y)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [42]:
y

0         ckd
1         ckd
2         ckd
3         ckd
4         ckd
        ...  
350    notckd
351    notckd
352    notckd
353    notckd
354    notckd
Name: classification, Length: 355, dtype: object

In [31]:
from sklearn.pipeline import Pipeline

In [43]:
pipe = Pipeline([('step1' , step1) , ("step2" , le)])

In [47]:
pipe.fit(x_train,y_train)

TypeError: fit() takes 2 positional arguments but 3 were given

In [13]:
step1 = ColumnTransformer(transformers = [('step1' , le , cat_col[0:-1])]
                                            , remainder='passthrough') 

In [14]:
step1.fit_transform(df)

NameError: name 'x_train' is not defined

In [None]:
step1.fit()

In [None]:
step1 = ColumnTransformer(transformers = [("step1" ,LabelEncoder() ,cat_col[0:-1])]
                        , remainder ='passthrough')

In [None]:
le.fit_transform(y_train)

In [None]:
df

In [None]:
for col in cat_col:
    df[col] = le.fit_transform(df[col])

In [None]:
df[['red_blood_cells']]

In [None]:
x_train

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x = df.drop('classification',axis = 1)
y = df['classification']

In [27]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2 , random_state = 123)

In [None]:
step1.fit(x_train,y_train)

In [None]:
num_feature = [col for col in df.columns if df[col].dtype != 'object']
num_feature

In [None]:
df.to_csv('encoded_df.csv' , index = False)

## Feature Importance

##### for categorical

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
ms = SelectKBest(score_func = chi2 , k = 6)

In [None]:
df.dtypes

In [None]:
cat_x = df[cat_feature]
y = df[['classification']]

In [None]:
fs = ms.fit(cat_x,y)

In [None]:
fs

In [None]:
fs.scores_

In [None]:
scoredf = pd.DataFrame({"columns":cat_feature , "score":fs.scores_})

In [None]:
scoredf.sort_values(by = 'score',ascending = False)

#### for num

In [None]:
from sklearn.feature_selection import f_classif

In [None]:
ms1 = SelectKBest(score_func = f_classif , k = 6)

In [None]:
num_x = df[num_feature]

In [None]:
fs1 = ms.fit(num_x , y)

In [None]:
fs1.scores_

In [None]:
scoredf1 = pd.DataFrame({"columns":num_feature , "score":fs1.scores_})
scoredf1

In [None]:
num_feature

In [None]:
final_df = scoredf.append(scoredf1)

In [None]:
final_df.sort_values(by = 'score',ascending = False)

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

In [None]:
!pip install mlxtend --user

In [None]:
sfs = SequentialFeatureSelector(LogisticRegression(C=10,
max_iter=10000), k_features=8, 
                 forward=True, 
                 floating=False,
                 verbose=2,               
                 scoring='f1_macro',
                 cv=scv,
                 n_jobs=-1)
