# L03 15/03/24

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv

In [5]:
df = pd.read_csv('datasets/hepatitis.csv')
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242.0,3.3,50.0,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126.0,142.0,4.3,,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75.0,20.0,4.1,,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81.0,19.0,4.1,48.0,True,live


In [6]:
df.shape

(155, 20)

In [7]:
df.dtypes # we need to convert some variables

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
protime            float64
histology             bool
class               object
dtype: object

In [8]:
df = df.convert_dtypes() # Do what i mean function
df.dtypes

age                         Int64
sex                string[python]
steroid                   boolean
antivirals                boolean
fatigue                   boolean
malaise                   boolean
anorexia                  boolean
liver_big                 boolean
liver_firm                boolean
spleen_palpable           boolean
spiders                   boolean
ascites                   boolean
varices                   boolean
bilirubin                 Float64
alk_phosphate               Int64
sgot                        Int64
albumin                   Float64
protime                     Int64
histology                 boolean
class              string[python]
dtype: object

In [9]:
df.isna().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

In [10]:
(df.isna().sum()/len(df))*100

age                 0.000000
sex                 0.000000
steroid             0.645161
antivirals          0.000000
fatigue             0.645161
malaise             0.645161
anorexia            0.645161
liver_big           6.451613
liver_firm          7.096774
spleen_palpable     3.225806
spiders             3.225806
ascites             3.225806
varices             3.225806
bilirubin           3.870968
alk_phosphate      18.709677
sgot                2.580645
albumin            10.322581
protime            43.225806
histology           0.000000
class               0.000000
dtype: float64

## Dataset cleaning
### Case 1 - Drop variables iwth missing values

In [11]:
df.dropna(axis = 1) # axis 1 drops columns
# df.dropna(axis = 0) # axis 0 drops rows

Unnamed: 0,age,sex,antivirals,histology,class
0,30,male,False,False,live
1,50,female,False,False,live
2,78,female,False,False,live
3,31,female,True,False,live
4,34,female,False,False,live
...,...,...,...,...,...
150,46,female,False,True,die
151,44,female,False,True,live
152,61,female,False,True,live
153,53,male,False,True,live


### Case 2

In [12]:
df.dropna(thresh=.80*len(df), axis=1).isna().sum()/len(df)*100 # threshold works as a retaining rate. In this case: keep the vars that are NOT na for at least 80%

age                 0.000000
sex                 0.000000
steroid             0.645161
antivirals          0.000000
fatigue             0.645161
malaise             0.645161
anorexia            0.645161
liver_big           6.451613
liver_firm          7.096774
spleen_palpable     3.225806
spiders             3.225806
ascites             3.225806
varices             3.225806
bilirubin           3.870968
alk_phosphate      18.709677
sgot                2.580645
albumin            10.322581
histology           0.000000
class               0.000000
dtype: float64

### Case 3 (preferred) - insert "synthetic" values

In [13]:
# df.fillna(value), replaces nan values with value
bool_cols = df.select_dtypes(include=bool).columns # find boolean columns

In [14]:
bool_mode = df[bool_cols].mode().iloc[0]
bool_mode

steroid             True
antivirals         False
fatigue             True
malaise            False
anorexia           False
liver_big           True
liver_firm         False
spleen_palpable    False
spiders            False
ascites            False
varices            False
histology          False
Name: 0, dtype: boolean

In [15]:
df[bool_cols] = df[bool_cols].fillna(bool_mode)
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200,4.0,,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [16]:
int_cols = df.select_dtypes(include=int).columns
int_cols

Index(['age', 'alk_phosphate', 'sgot', 'protime'], dtype='object')

In [17]:
median_int_cols = df[int_cols].median()
median_int_cols

age              39.0
alk_phosphate    85.0
sgot             58.0
protime          61.0
dtype: Float64

In [18]:
df[int_cols] = df[int_cols].fillna(median_int_cols)
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [19]:
float_cols = df.select_dtypes(include=float).columns
df[float_cols] = df[float_cols].astype(float)
float_cols

Index(['bilirubin', 'albumin'], dtype='object')

In [20]:
mean_float_cols = df[float_cols].interpolate(method='linear')
mean_float_cols

Unnamed: 0,bilirubin,albumin
0,1.0,4.0
1,0.9,3.5
2,0.7,4.0
3,0.7,4.0
4,1.0,4.0
...,...,...
150,7.6,3.3
151,0.9,4.3
152,0.8,4.1
153,1.5,4.1


In [21]:
df[float_cols] = df[float_cols].fillna(mean_float_cols)
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [22]:
## Categorical
# categorical_cols = df.select_dtypes(include='string').columns
# categorical_mode_cols = df[categorical_cols].mode()

# categorical_cols

In [23]:
# df[categorical_cols] = df[categorical_cols].fillna(categorical_mode_cols)
df.isna().sum()/len(df)*100

age                0.0
sex                0.0
steroid            0.0
antivirals         0.0
fatigue            0.0
malaise            0.0
anorexia           0.0
liver_big          0.0
liver_firm         0.0
spleen_palpable    0.0
spiders            0.0
ascites            0.0
varices            0.0
bilirubin          0.0
alk_phosphate      0.0
sgot               0.0
albumin            0.0
protime            0.0
histology          0.0
class              0.0
dtype: float64

In [24]:
df.isna().values.any()

False

## Encode the features

In [25]:
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.metrics
from  sklearn.model_selection import train_test_split

In [26]:
label_encoder = sklearn.preprocessing.LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])
# df['sex'] = label_encoder.fit_transform(df['sex'])
df['class']

0      1
1      1
2      1
3      1
4      1
      ..
150    0
151    1
152    1
153    1
154    0
Name: class, Length: 155, dtype: int64

In [27]:
categorical_feat = df.select_dtypes(include='string').columns.tolist()
df = pd.get_dummies(df, prefix=['sex'])
df

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
0,30,False,False,False,False,False,False,False,False,False,...,False,1.0,85,18,4.0,61,False,1,False,True
1,50,False,False,True,False,False,False,False,False,False,...,False,0.9,135,42,3.5,61,False,1,True,False
2,78,True,False,True,False,False,True,False,False,False,...,False,0.7,96,32,4.0,61,False,1,True,False
3,31,True,True,False,False,False,True,False,False,False,...,False,0.7,46,52,4.0,80,False,1,True,False
4,34,True,False,False,False,False,True,False,False,False,...,False,1.0,85,200,4.0,61,False,1,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,True,False,True,True,True,True,False,False,True,...,True,7.6,85,242,3.3,50,True,0,True,False
151,44,True,False,True,False,False,True,True,False,False,...,False,0.9,126,142,4.3,61,True,1,True,False
152,61,False,False,True,True,False,False,True,False,True,...,False,0.8,75,20,4.1,61,True,1,True,False
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,False,True


In [28]:
#find outliers
isorforest = sklearn.ensemble.IsolationForest(n_estimators=1000, contamination=0.01, random_state=0)
res = isorforest.fit_predict(df.to_numpy())
res

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1])

In [29]:
# PCA
pca = sklearn.decomposition.PCA(n_components=0.9999)
x_pca = pca.fit_transform(df)
x_orig = pca.inverse_transform(x_pca)

anomaly_score = np.abs(df.to_numpy() - x_orig).sum(1)

# get last quantile
threshold = np.quantile(anomaly_score, 0.99) # keep 99% fo the sample
anomalous_ids = np.argwhere(anomaly_score > threshold).squeeze() #which are the indecs of the elements that are above the anomaly threshold
df.iloc[anomalous_ids]
df = df.drop(anomalous_ids)

In [30]:
# split
x =df[list(set(df.columns) - set(['class']))]
y = df['class']
# Normalization
scaler = sklearn.preprocessing.StandardScaler()
x = scaler.fit_transform(x)


In [31]:
# split in train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=0)

In [32]:
x.shape, x_train.shape, x_test.shape

((153, 20), (122, 20), (31, 20))

# Test