In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno 


In [125]:
hepatitis_data = pd.read_csv("hepatitis_data.csv")
hepatitis_data.describe()


Unnamed: 0,age,bilirubin,alk_phosphate,sgot,albumin,protime
count,155.0,149.0,126.0,151.0,139.0,88.0
mean,41.2,1.427517,105.325397,85.89404,3.817266,61.852273
std,12.565878,1.212149,51.508109,89.65089,0.651523,22.875244
min,7.0,0.3,26.0,14.0,2.1,0.0
25%,32.0,0.7,74.25,31.5,3.4,46.0
50%,39.0,1.0,85.0,58.0,4.0,61.0
75%,50.0,1.5,132.25,100.5,4.2,76.25
max,78.0,8.0,295.0,648.0,6.4,100.0


In [126]:
df= hepatitis_data

In [127]:
df.head

<bound method NDFrame.head of      age     sex steroid  antivirals  ... albumin protime histology class
0     30    male   False       False  ...     4.0     NaN     False  live
1     50  female   False       False  ...     3.5     NaN     False  live
2     78  female    True       False  ...     4.0     NaN     False  live
3     31  female     NaN        True  ...     4.0    80.0     False  live
4     34  female    True       False  ...     4.0     NaN     False  live
..   ...     ...     ...         ...  ...     ...     ...       ...   ...
150   46  female    True       False  ...     3.3    50.0      True   die
151   44  female    True       False  ...     4.3     NaN      True  live
152   61  female   False       False  ...     4.1     NaN      True  live
153   53    male   False       False  ...     4.1    48.0      True  live
154   43  female    True       False  ...     3.1    42.0      True   die

[155 rows x 20 columns]>

In [128]:
df.dtypes


age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
protime            float64
histology             bool
class               object
dtype: object

In [129]:
df.shape

(155, 20)

In [130]:
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows:", duplicate_rows_df.shape)
df.count() 


number of duplicate rows: (0, 20)


age                155
sex                155
steroid            154
antivirals         155
fatigue            154
malaise            154
anorexia           154
liver_big          145
liver_firm         144
spleen_palpable    150
spiders            150
ascites            150
varices            150
bilirubin          149
alk_phosphate      126
sgot               151
albumin            139
protime             88
histology          155
class              155
dtype: int64

In [131]:
df.boxplot(column=['age'])
plt.show()

In [132]:
labels = {'no': 0,'yes': 1,'DIE': 0,'LIVE': 1,'?': np.nan,'female': 0,'male': 1}

hepatitis_data.replace(labels, inplace = True)
hepatitis_data.isnull().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

In [133]:
sns.heatmap(df.isnull(), cbar=False)


<matplotlib.axes._subplots.AxesSubplot at 0x7f6d4caafd30>

In [134]:
msno.heatmap(df) 


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f6d4d0f0ac8>

In [135]:
for column in df.columns:
    df[column].fillna(df[column].mode()[0], inplace=True)
    

In [136]:
df.isnull().sum()

age                0
sex                0
steroid            0
antivirals         0
fatigue            0
malaise            0
anorexia           0
liver_big          0
liver_firm         0
spleen_palpable    0
spiders            0
ascites            0
varices            0
bilirubin          0
alk_phosphate      0
sgot               0
albumin            0
protime            0
histology          0
class              0
dtype: int64

In [137]:
df.plot(kind='box', subplots=True, layout=(2,7),
sharex=False,sharey=False, figsize=(20, 10), 
color='deeppink');

<IPython.core.display.Javascript object>

In [138]:
sns.boxplot(x=df['liver_big'])


<matplotlib.axes._subplots.AxesSubplot at 0x7f6d4c420e48>

In [139]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3-Q1
print(IQR)

age                18.0
sex                 0.0
steroid             1.0
antivirals          0.0
fatigue             1.0
malaise             1.0
anorexia            0.0
liver_big           0.0
liver_firm          1.0
spleen_palpable     0.0
spiders             1.0
ascites             0.0
varices             0.0
bilirubin           0.7
alk_phosphate      41.5
sgot               68.5
albumin             0.7
protime            43.0
histology           1.0
dtype: float64


In [140]:
df = df[~((df < (Q1-1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape

(50, 20)