In [86]:
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Importing Dataset

In [12]:
df=pd.read_csv('hepatitis.data')
df.head(5)

Unnamed: 0,2,30,2.1,1,2.2,2.3,2.4,2.5,1.1,2.6,2.7,2.8,2.9,2.10,1.00,85,18,4.0,?,1.2
0,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
1,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
2,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [13]:
column_names=[ 'Class'
,'AGE'
, 'SEX'
,'STEROID'
, 'ANTIVIRALS'
, 'FATIGUE'
, 'MALAISE'
, 'ANOREXIA'
, 'LIVER BIG'
, 'LIVER FIRM'
, 'SPLEEN PALPABLE'
, 'SPIDERS'
, 'ASCITES'
, 'VARICES'
,'BILIRUBIN'
,'ALK PHOSPHATE'
,'SGOT'
,'ALBUMIN'
, 'PROTIME'
,'HISTOLOGY']

In [14]:
df.columns=column_names
df.keys()

Index(['Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE',
       'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS',
       'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN',
       'PROTIME', 'HISTOLOGY'],
      dtype='object')

In [17]:
df.columns=df.columns.str.lower().str.replace(' ','_')
df.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
1,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
2,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [20]:
df.shape

(154, 20)

In [26]:
df.isnull().sum().sum()




0

- Importing '?' with NaN for further steps

In [30]:
df=df.replace('?',np.nan)
df.head(3)

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,50,1,1.0,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,,1
1,2,78,1,2.0,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,,1
2,2,31,1,,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80.0,1


In [31]:
df.isnull().sum().sum()

166

In [35]:
df_sum_check=df.isnull().sum()
df_sum_check

class               0
age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            66
histology           0
dtype: int64

In [39]:

num_vars=df.select_dtypes(include=['int64','float64']).columns
num_var_miss=[]
for var in num_vars:
    #print(var)
    if df_sum_check[var]>0:
        num_var_miss.append(var)
num_var_miss

[]

- this shows that no numerical variable has null values

In [60]:
df.select_dtypes(include=['int64','float64']).columns

Index(['class', 'age', 'sex', 'antivirals', 'histology'], dtype='object')

In [67]:
num_var=['class', 'age', 'sex', 'antivirals', 'histology']

In [38]:

cat_vars=df.select_dtypes(include=['object']).columns
cat_var_miss=[]
for var in cat_vars:
    if df_sum_check[var]>0:
        cat_var_miss.append(var)
cat_var_miss

['steroid',
 'fatigue',
 'malaise',
 'anorexia',
 'liver_big',
 'liver_firm',
 'spleen_palpable',
 'spiders',
 'ascites',
 'varices',
 'bilirubin',
 'alk_phosphate',
 'sgot',
 'albumin',
 'protime']

- above list  shows all the object variable has null values

In [64]:
##values not null

cat_vars=df.select_dtypes(include=['object']).columns
cat_var_notnull=[]
for var in cat_vars:
    if df_sum_check[var]<0:
        cat_var_notnull.append(var)
cat_var_notnull

[]

In [73]:
cat_var_missing=['steroid',
 'fatigue',
 'malaise',
 'anorexia',
 'liver_big',
 'liver_firm',
 'spleen_palpable',
 'spiders',
 'ascites',
 'varices',
 'bilirubin',
 'alk_phosphate',
 'sgot',
 'albumin',
 'protime']

## Creating Pipeline

In [47]:
cat_var_mode_imputer=Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])

In [48]:
preprocessor=ColumnTransformer(transformers=[("mode_imputer",cat_var_mode_imputer,cat_var_missing)])
                              

In [49]:
preprocessor.fit(df)

ColumnTransformer(transformers=[('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['steroid', 'fatigue', 'malaise', 'anorexia',
                                  'liver_big', 'liver_firm', 'spleen_palpable',
                                  'spiders', 'ascites', 'varices', 'bilirubin',
                                  'alk_phosphate', 'sgot', 'albumin',
                                  'protime'])])

In [50]:
preprocessor.transform


<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['steroid', 'fatigue', 'malaise', 'anorexia',
                                  'liver_big', 'liver_firm', 'spleen_palpable',
                                  'spiders', 'ascites', 'varices', 'bilirubin',
                                  'alk_phosphate', 'sgot', 'albumin',
                                  'protime'])])>

- which value is imputed in Null columns

In [51]:
preprocessor.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array(['2', '1', '2', '2', '2', '2', '2', '2', '2', '2', '1.00', '85',
       '20', '4.0', '100'], dtype=object)

#### now to impute all above values in our datase

In [58]:
dataset_clean=preprocessor.transform(df)
dataset_clean

array([['1', '1', '2', ..., '42', '3.5', '100'],
       ['2', '1', '2', ..., '32', '4.0', '100'],
       ['2', '2', '2', ..., '52', '4.0', '80'],
       ...,
       ['1', '1', '1', ..., '20', '4.1', '100'],
       ['1', '1', '2', ..., '19', '4.1', '48'],
       ['2', '1', '2', ..., '19', '3.1', '42']], dtype=object)

- one disadvantage of this methodwe give it dataframe and it returns 2d-array
- but we convert that dataset again in Dataframe by just one line of code

In [59]:

preprocessor.transformers_

[('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['steroid',
   'fatigue',
   'malaise',
   'anorexia',
   'liver_big',
   'liver_firm',
   'spleen_palpable',
   'spiders',
   'ascites',
   'varices',
   'bilirubin',
   'alk_phosphate',
   'sgot',
   'albumin',
   'protime']),
 ('remainder', 'drop', [0, 1, 2, 4, 19])]

In [69]:
num_var=['class', 'age', 'sex', 'antivirals', 'histology']

In [71]:
len(df.columns)

20

In [72]:
df.keys()

Index(['class', 'age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise',
       'anorexia', 'liver_big', 'liver_firm', 'spleen_palpable', 'spiders',
       'ascites', 'varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin',
       'protime', 'histology'],
      dtype='object')

#### now checking if there is any null value or not

In [76]:
dataset_clean_miss_var =pd.DataFrame(dataset_clean,columns=cat_var_missing)
dataset_clean_miss_var.isnull().sum()

steroid            0
fatigue            0
malaise            0
anorexia           0
liver_big          0
liver_firm         0
spleen_palpable    0
spiders            0
ascites            0
varices            0
bilirubin          0
alk_phosphate      0
sgot               0
albumin            0
protime            0
dtype: int64

#### clean dataframe of categorical variable

In [79]:
dataset_clean_miss_var.head(3)

Unnamed: 0,steroid,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime
0,1,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,100
1,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,100
2,2,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80


#### dataframe of numerical variable

In [81]:
df_numerical=df.select_dtypes(include=['int64','float64'])
df_numerical.head()

Unnamed: 0,class,age,sex,antivirals,histology
0,2,50,1,2,1
1,2,78,1,2,1
2,2,31,1,1,1
3,2,34,1,2,1
4,2,34,1,2,1


##### Concatenating both dataframe together

In [83]:
concatenated_df = pd.concat([df_numerical, dataset_clean_miss_var], axis=1)
concatenated_df.head(3)

Unnamed: 0,class,age,sex,antivirals,histology,steroid,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime
0,2,50,1,2,1,1,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,100
1,2,78,1,2,1,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,100
2,2,31,1,1,1,2,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80


#### checking null values in in our newly concatenated dataframe

In [84]:
concatenated_df.isnull().sum().sum()

0