In [54]:
import numpy as np
import pandas as pd
import sklearn.decomposition
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
from sklearn.model_selection import train_test_split

# 1 Load and explore the dataset

In [4]:
# Read the hepatitis dataset from disk and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='hepatitis.csv')
df.head(n=10)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live
5,34,female,True,False,False,False,False,True,False,False,False,False,False,0.9,95.0,28.0,4.0,75.0,False,live
6,51,female,False,False,True,False,True,True,False,True,True,False,False,,,,,,False,die
7,23,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,,,,False,live
8,39,female,True,False,True,False,False,True,True,False,False,False,False,0.7,,48.0,4.4,,False,live
9,30,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,120.0,3.9,,False,live


In [5]:
# Show the frame's dimensions as (number of samples, number of columns).
print((len(df.index), len(df.columns)))

(155, 20)


We can easily identify the type of each variable:
- int / float: numerical data
- object: categorical data
- bool: boolean data

In [6]:
# Display the dtype pandas inferred for every column when reading the CSV.
df.dtypes

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
protime            float64
histology             bool
class               object
dtype: object

In [7]:
# Convert every column to the best-matching pandas nullable dtype
# (so columns containing missing values can still be typed as boolean / integer),
# then print the resulting dtypes.
df = df.convert_dtypes()
print(df.dtypes)

age                  Int64
sex                 string
steroid            boolean
antivirals         boolean
fatigue            boolean
malaise            boolean
anorexia           boolean
liver_big          boolean
liver_firm         boolean
spleen_palpable    boolean
spiders            boolean
ascites            boolean
varices            boolean
bilirubin          Float64
alk_phosphate        Int64
sgot                 Int64
albumin            Float64
protime              Int64
histology          boolean
class               string
dtype: object


Let's identify how many missing values there are

In [8]:
# Count the missing entries in each column.
df.isnull().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

We want to convert these counts into percentages

In [9]:
# Share of missing entries per column, expressed in percent.
df.isna().sum().div(len(df)).mul(100)

age                 0.000000
sex                 0.000000
steroid             0.645161
antivirals          0.000000
fatigue             0.645161
malaise             0.645161
anorexia            0.645161
liver_big           6.451613
liver_firm          7.096774
spleen_palpable     3.225806
spiders             3.225806
ascites             3.225806
varices             3.225806
bilirubin           3.870968
alk_phosphate      18.709677
sgot                2.580645
albumin            10.322581
protime            43.225806
histology           0.000000
class               0.000000
dtype: float64

# 2 Clean the dataset

There are NaN values!

We have three options:
1. drop samples (rows) with missing values
2. drop variables (columns) with missing values
3. replace missing values with synthetic values

## Case 1 - Drop samples with missing values

In [10]:
# axis='index' (same as axis=0) removes every row, i.e. sample, that has at least one missing value.
df.dropna(axis='index')
# To drop a row only when a specific column is missing:
# df.dropna(subset=['liver_big'], axis=0)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,female,True,False,False,False,False,True,False,False,False,False,False,0.9,95,28,4.0,75,False,live
10,39,female,False,True,False,False,False,False,True,False,False,False,False,1.3,78,30,4.4,85,False,live
11,32,female,True,True,True,False,False,True,True,False,True,False,False,1.0,59,249,3.7,54,False,live
12,41,female,True,True,True,False,False,True,True,False,False,False,False,0.9,81,60,3.9,52,False,live
13,30,female,True,False,True,False,False,True,True,False,False,False,False,2.2,57,144,4.9,78,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,45,female,True,True,False,False,False,True,False,False,False,False,False,1.3,85,44,4.2,85,True,live
143,49,female,False,False,True,True,False,True,False,True,True,False,False,1.4,85,70,3.5,35,True,die
145,31,female,False,False,True,False,False,True,False,False,False,False,False,1.2,75,173,4.2,54,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


## Case 2 - Drop variables with missing values

In [11]:
# axis='columns' (same as axis=1) removes every column, i.e. variable, that has at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,age,sex,antivirals,histology,class
0,30,male,False,False,live
1,50,female,False,False,live
2,78,female,False,False,live
3,31,female,True,False,live
4,34,female,False,False,live
...,...,...,...,...,...
150,46,female,False,True,die
151,44,female,False,True,live
152,61,female,False,True,live
153,53,male,False,True,live


In [12]:
# Keep only the columns whose number of non-missing values reaches 80% of the rows;
# sparser columns are dropped.
df.dropna(axis=1, thresh=0.8 * len(df))

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200,4.0,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242,3.3,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,True,live


## Case 3 (preferred) - we want to insert generated values

- For numerical values, we can replace a missing value with the average value of the column
- For categorical values, we can replace a missing value with the most frequent value of the column

.fillna(value) replaces all the NaNs with values passed as argument


#### Boolean columns

In [13]:
# Names of all boolean-typed columns, as a plain Python list.
boolean_cols = list(df.select_dtypes(include=bool).columns)
boolean_cols

['steroid',
 'antivirals',
 'fatigue',
 'malaise',
 'anorexia',
 'liver_big',
 'liver_firm',
 'spleen_palpable',
 'spiders',
 'ascites',
 'varices',
 'histology']

In [14]:
# mode() returns a frame; its first row holds the most frequent value of each column.
most_frequent_values = df[boolean_cols].mode().iloc[0]

# Fill the gaps of every boolean column with that column's mode.
df[boolean_cols] = df[boolean_cols].fillna(value=most_frequent_values)

df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200,4.0,,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


#### Integer values

In [15]:
# Select the integer-typed columns
numeric = df.select_dtypes(include=int)
# Keep only their names (a pandas Index)
integer_columns = numeric.columns
# Display the selected column names
integer_columns

Index(['age', 'alk_phosphate', 'sgot', 'protime'], dtype='object')

In [16]:
# Impute the integer columns with their column median.
# The median of an integer column can be fractional (e.g. 84.5); filling a nullable
# Int64 column with such a value raises, so round it to a whole number and cast it
# back to Int64 before filling.
median_of_integer_cols = (
    df[integer_columns].median().astype(float).round().astype('Int64')
)
df[integer_columns] = df[integer_columns].fillna(median_of_integer_cols)

#### Float columns

In [17]:
# Select the float-typed columns
# (include=np.number would instead select every numeric column)
numeric = df.select_dtypes(include=float)
# Keep only their names (a pandas Index)
numeric_columns = numeric.columns
# Display the selected column names
numeric_columns

Index(['bilirubin', 'albumin'], dtype='object')

In [18]:
# Cast the nullable Float64 columns to plain float so that missing entries become NaN,
# then fill the gaps by linear interpolation between neighbouring rows.
# limit_direction='both' also fills gaps at the very start of a column, which the
# default forward-only direction would leave as NaN.
# NOTE(review): interpolation depends on row order; if the rows are unordered
# patients, a per-column median may be more defensible. Confirm the intent.
df[numeric_columns] = (
    df[numeric_columns]
    .astype(float)
    .interpolate(method='linear', limit_direction='both')
)

#### Categorical columns

In [19]:
# Names of all string-typed (categorical) columns, as a plain Python list.
categorical_columns = list(df.select_dtypes(include='string').columns)
print(categorical_columns)

['sex', 'class']


In [20]:
# Most frequent value of each categorical column.
# mode() returns a DataFrame (one row per mode), not a Series.
most_frequent_values = df[categorical_columns].mode()
most_frequent_values

Unnamed: 0,sex,class
0,female,live


In [21]:
 # Fill the gaps of each categorical column with that column's most frequent value.
# `most_frequent_values` is a DataFrame; passing it to fillna directly aligns on the
# row index as well, so only row 0 could ever be filled. Its first row, taken as a
# Series, is matched by column name and applied to every row.
df[categorical_columns] = df[categorical_columns].fillna(most_frequent_values.iloc[0])

df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [22]:
# Sanity check: True if any missing value is left anywhere in the frame.
df.isna().to_numpy().any()

False

# 2 Encode the features

We need to encode the target variable through label encoding

In [23]:
# Build the encoder that maps class names to integer codes.
le = sklearn.preprocessing.LabelEncoder()

# Learn the label vocabulary, then overwrite the target column with its integer codes.
le.fit(df['class'])
df['class'] = le.transform(df['class'])

We need to encode categorical features through one-hot encoding

In [24]:
# Names of the remaining string-typed (categorical) feature columns.
categorical_features = list(df.select_dtypes(include='string').columns)
categorical_features

['sex']

In [25]:
# One-hot encode the categorical features.
# `columns` is passed explicitly so that exactly these columns are encoded and the
# `prefix` list always matches them one-to-one, instead of relying on get_dummies'
# implicit dtype-based column detection.
df = pd.get_dummies(df, columns=categorical_features, prefix=categorical_features)
df

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
0,30,False,False,False,False,False,False,False,False,False,...,False,1.0,85,18,4.0,61,False,1,0,1
1,50,False,False,True,False,False,False,False,False,False,...,False,0.9,135,42,3.5,61,False,1,1,0
2,78,True,False,True,False,False,True,False,False,False,...,False,0.7,96,32,4.0,61,False,1,1,0
3,31,True,True,False,False,False,True,False,False,False,...,False,0.7,46,52,4.0,80,False,1,1,0
4,34,True,False,False,False,False,True,False,False,False,...,False,1.0,85,200,4.0,61,False,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,True,False,True,True,True,True,False,False,True,...,True,7.6,85,242,3.3,50,True,0,1,0
151,44,True,False,True,False,False,True,True,False,False,...,False,0.9,126,142,4.3,61,True,1,1,0
152,61,False,False,True,True,False,False,True,False,True,...,False,0.8,75,20,4.1,61,True,1,1,0
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,0,1


# 3 Let's find outliers

### Isolation forests

In [26]:
# Isolation forest with 1000 trees, expecting 1% of the samples to be outliers.
isoforest = sklearn.ensemble.IsolationForest(
    n_estimators=1000,
    contamination=0.01,
    random_state=0,
)
# fit_predict labels each sample: -1 for an outlier, +1 for an inlier.
res = isoforest.fit_predict(df.to_numpy())

In [27]:
# Rows that the isolation forest labelled as outliers (-1).
df.loc[res == -1]

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
37,20,False,False,True,True,True,False,True,True,True,...,False,2.3,150,68,3.9,61,False,1,0,1
125,34,True,False,True,True,True,False,True,False,True,...,False,0.7,70,24,4.1,100,True,1,0,1


### PCA

In [28]:
# Keep as many principal components as needed to explain 99.99% of the variance.
pca = sklearn.decomposition.PCA(n_components=0.9999)
# Project the samples onto the principal components ...
X_pca = pca.fit_transform(df)
# ... and map them back to the original feature space.
X_ori = pca.inverse_transform(X_pca)
# Anomaly score of a sample = sum of absolute reconstruction errors over its features.
anomaly_score = np.abs(df.to_numpy() - X_ori).sum(1)
# Samples scoring above the 99th percentile are flagged as anomalous.
threshold = np.quantile(anomaly_score, 0.99)
# Positional ids of the flagged samples. flatnonzero always returns a 1-D array,
# whereas argwhere(...).squeeze() collapses to a 0-d array when exactly one sample
# is flagged, which makes df.iloc return a single row as a Series.
anomalous_ids = np.flatnonzero(anomaly_score > threshold)

In [29]:
# Rows whose PCA reconstruction error exceeds the threshold (selected by position).
df.iloc[anomalous_ids]

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
6,51,False,False,True,False,True,True,False,True,True,...,False,0.95,85,58,4.133333,61,False,0,1,0
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,0,1


# 4 Normalize features

In [30]:
# Display the cleaned and encoded frame before normalization.
df

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
0,30,False,False,False,False,False,False,False,False,False,...,False,1.0,85,18,4.0,61,False,1,0,1
1,50,False,False,True,False,False,False,False,False,False,...,False,0.9,135,42,3.5,61,False,1,1,0
2,78,True,False,True,False,False,True,False,False,False,...,False,0.7,96,32,4.0,61,False,1,1,0
3,31,True,True,False,False,False,True,False,False,False,...,False,0.7,46,52,4.0,80,False,1,1,0
4,34,True,False,False,False,False,True,False,False,False,...,False,1.0,85,200,4.0,61,False,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,True,False,True,True,True,True,False,False,True,...,True,7.6,85,242,3.3,50,True,0,1,0
151,44,True,False,True,False,False,True,True,False,False,...,False,0.9,126,142,4.3,61,True,1,1,0
152,61,False,False,True,True,False,False,True,False,True,...,False,0.8,75,20,4.1,61,True,1,1,0
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,0,1


In [33]:
# Separate the input features from the target.
# drop() keeps the remaining columns in their original order; building the list from
# a set difference would give an arbitrary feature order that can change between runs.
x = df.drop(columns=['class'])
y = df['class']

In [37]:
# Standardize every feature to zero mean and unit variance.
scaler = sklearn.preprocessing.StandardScaler()
# NOTE(review): the scaler is fitted on all samples before the train/test split, so
# test-set statistics influence the transform. Consider fitting on the training
# split only.
scaler.fit(x)
x = scaler.transform(x)

# 5 Split train / test

In [45]:
# Hold out 20% of the samples for testing; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [46]:
# Shapes of the training and test feature matrices, one per line.
print(x_train.shape, x_test.shape, sep='\n')

(124, 20)
(31, 20)


# 6 train

In [48]:
# Random forest with 100 trees.
# random_state is fixed, as for the isolation forest and the train/test split, so the
# trained model and the reported metrics are reproducible across runs.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)

In [49]:
# Fit the forest on the training split.
rf.fit(X=x_train, y=y_train)

In [51]:
# Predicted class labels for the held-out test samples.
y_estim = rf.predict(X=x_test)

In [78]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_mat = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_estim)
conf_mat

array([[ 4,  5],
       [ 0, 22]])

In [79]:
# compute accuracy
acc = sklearn.metrics.accuracy_score(y_test, y_estim)
acc

0.8387096774193549