# Data preparation

In [1]:
import pandas as pd

In [2]:
clmn_names = 'Class, AGE, SEX, STEROID, ANTIVIRALS, FATIGUE, MALAISE, ANOREXIA, LIVER BIG, LIVER FIRM, SPLEEN PALPABLE, SPIDERS, ASCITES, VARICES, BILIRUBIN, ALK PHOSPHATE, SGOT, ALBUMIN, PROTIME, HISTOLOGY'.split(', ')

In [3]:
df = pd.read_csv('data/hepatitis.data', header=None, names=clmn_names, na_values='?')

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, stratify=df.Class)

In [5]:
len(train), len(test)

(116, 39)

In [6]:
arrays = [['number_of_nan_value', 'number_of_nan_value'], ['Live', 'Dead']]
df_info = pd.DataFrame(columns=pd.MultiIndex.from_arrays(arrays, names=('param', 'class',)))

df_info[('number_of_nan_value', 'Live')] = train.loc[train.Class == 1].isna().sum()
df_info[('number_of_nan_value', 'Dead')] = train.loc[train.Class == 2].isna().sum()

df_info[('nunique', 'Live')] = train.loc[train.Class == 1].nunique()
df_info[('nunique', 'Dead')] = train.loc[train.Class == 2].nunique()

df_info[('unique', 'Live')] = train.loc[train.Class == 1].apply(pd.unique).apply(sorted)
df_info[('unique', 'Dead')] = train.loc[train.Class == 2].apply(pd.unique).apply(sorted)

df_info[('min', 'Live')] = train.loc[train.Class == 1].min()
df_info[('min', 'Dead')] = train.loc[train.Class == 2].min()

df_info[('max', 'Live')] = train.loc[train.Class == 1].max()
df_info[('max', 'Dead')] = train.loc[train.Class == 2].max()

df_info[('mode', 'Live')] = train.loc[train.Class == 1].mode().iloc[0].T
df_info[('mode', 'Dead')] = train.loc[train.Class == 2].mode().iloc[0].T

df_info

param,number_of_nan_value,number_of_nan_value,nunique,nunique,unique,unique,min,min,max,max,mode,mode
class,Live,Dead,Live,Dead,Live,Dead,Live,Dead,Live,Dead,Live,Dead
Class,0,0,1,1,[1],[2],1.0,2.0,1.0,2.0,1.0,2.0
AGE,0,0,22,38,"[30, 31, 33, 34, 35, 37, 38, 39, 41, 42, 44, 4...","[7, 20, 22, 23, 24, 25, 26, 27, 28, 30, 32, 33...",30.0,7.0,70.0,78.0,47.0,34.0
SEX,0,0,1,2,[1],"[1, 2]",1.0,1.0,1.0,2.0,1.0,1.0
STEROID,0,0,2,2,"[1.0, 2.0]","[1.0, 2.0]",1.0,1.0,2.0,2.0,1.0,2.0
ANTIVIRALS,0,0,2,2,"[1, 2]","[1, 2]",1.0,1.0,2.0,2.0,2.0,2.0
FATIGUE,0,1,2,2,"[1.0, 2.0]","[nan, 1.0, 2.0]",1.0,1.0,2.0,2.0,1.0,1.0
MALAISE,0,1,2,2,"[1.0, 2.0]","[nan, 1.0, 2.0]",1.0,1.0,2.0,2.0,1.0,2.0
ANOREXIA,0,1,2,2,"[1.0, 2.0]","[nan, 1.0, 2.0]",1.0,1.0,2.0,2.0,2.0,2.0
LIVER BIG,4,5,2,2,"[1.0, 2.0, nan]","[nan, 1.0, 2.0]",1.0,1.0,2.0,2.0,2.0,2.0
LIVER FIRM,4,6,2,2,"[1.0, 2.0, nan]","[nan, 1.0, 2.0]",1.0,1.0,2.0,2.0,1.0,2.0


In [7]:
num_clmns = df_info[df_info[('nunique', 'Live')] > 2].index.to_list()
num_clmns

['AGE', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME']

In [8]:
cat_clmns = df_info[df_info[('nunique', 'Live')] == 2].index.to_list()
cat_clmns

['STEROID',
 'ANTIVIRALS',
 'FATIGUE',
 'MALAISE',
 'ANOREXIA',
 'LIVER BIG',
 'LIVER FIRM',
 'SPLEEN PALPABLE',
 'SPIDERS',
 'ASCITES',
 'VARICES',
 'HISTOLOGY']

In [9]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline, make_union
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression

In [10]:
x_train = train.drop('Class', axis=1)
y_train = train.Class

In [11]:
x_test = test.drop('Class', axis=1)
y_test = test.Class

In [12]:
feature_transform = make_column_transformer(
    (
        make_union(
            MissingIndicator(),
            OneHotEncoder()
        ), cat_clmns),
    (
        make_union(
            MissingIndicator(),
            RobustScaler(unit_variance=True)
        ), num_clmns),
)

In [24]:
pipe_model = make_pipeline(
    feature_transform,
    SimpleImputer(strategy='median'),
    LogisticRegression(class_weight='balanced')
)

In [25]:
pipe_model.fit(x_train, y_train)

In [26]:
pipe_model.score(x_train, y_train)

0.8793103448275862

In [27]:
y_pred_train = pipe_model.predict(x_train)

In [28]:
from sklearn.metrics import classification_report

In [29]:
print(classification_report(y_true=y_train, y_pred=y_pred_train))

              precision    recall  f1-score   support

           1       0.65      0.92      0.76        24
           2       0.98      0.87      0.92        92

    accuracy                           0.88       116
   macro avg       0.81      0.89      0.84       116
weighted avg       0.91      0.88      0.89       116



In [128]:
y_pred_test = pipe_model.predict(x_test)

ValueError: The features [0] have missing values in transform but have no missing values in fit.

# Live people feature distribution

In [37]:
from sklearn.svm import OneClassSVM

In [38]:
pipe_model = make_pipeline(
    feature_transform,
    SimpleImputer(strategy='median'),
    OneClassSVM()
)

In [32]:
x_train_mix = train[train.Class == 2].drop('Class', axis=1)

In [39]:
pipe_model.fit(x_train_mix)

In [43]:
pipe_model.predict(x_train_mix)

array([ 1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1, -1,
       -1,  1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1,  1,
        1, -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1,
        1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1,
       -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,
        1,  1, -1,  1, -1, -1,  1], dtype=int64)

# clustering

In [44]:
from sklearn.cluster import KMeans

In [114]:
pipe_model = make_pipeline(
    feature_transform,
    SimpleImputer(strategy='median'),
    KMeans(n_clusters=2, n_init='auto')
)

In [115]:
pipe_model.fit(x_train)

In [116]:
clsters = pipe_model.predict(x_train)

In [117]:
res = pd.DataFrame()
res['cluster'] = clsters
res['label'] = y_train.values - 1
res

Unnamed: 0,cluster,label
0,0,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
111,0,1
112,0,1
113,0,0
114,0,1


In [118]:
res.groupby(['cluster', 'label']).size()

cluster  label
0        0        14
         1        82
1        0        10
         1        10
dtype: int64

In [119]:
clsters

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1])

In [120]:
x_train_0 = x_train.loc[clsters == 0]
x_train_1 = x_train.loc[clsters == 1]

y_train_0 = y_train.loc[clsters == 0]
y_train_1 = y_train.loc[clsters == 1]

In [121]:
model0 = make_pipeline(
    feature_transform,
    SimpleImputer(strategy='median'),
    LogisticRegression(class_weight='balanced')
).fit(x_train_0, y_tarin_0)

In [122]:
model0.score(x_train_0, y_tarin_0)

0.9270833333333334

In [123]:
model1 = make_pipeline(
    feature_transform,
    SimpleImputer(strategy='median'),
    LogisticRegression(class_weight='balanced')
).fit(x_train_1, y_tarin_1)

In [124]:
model1.score(x_train_1, y_tarin_1)

0.95

In [None]:
def with_clster_mode(x):
    clstrs = pipe_model.predict(x)
    mask = clstrs == 0
    x0 = x.loc[mask]
    x1 = x.loc[~mask]
    clstrs[mask] = model0.predict(x0)
    clstrs[~mask] = model1.predict(x1)
    return clstrs
    

In [126]:
clstrs = clsters.copy()
mask = clstrs == 0
x0 = x_train.loc[mask]
x1 = x_train.loc[~mask]
clstrs[mask] = model0.predict(x0)
clstrs[~mask] = model1.predict(x1)

ValueError: The features [2 3 4] have missing values in transform but have no missing values in fit.

In [83]:
clstrs

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1])

In [84]:
mask = clstrs == 0

In [86]:
x0 = x_train.loc[mask]
x0

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
109,33,1,1.0,2,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.7,63.0,80.0,3.0,31.0,2
78,34,1,1.0,2,1.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,72.0,46.0,4.4,57.0,1
73,36,1,1.0,2,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,,45.0,4.0,57.0,1
11,32,1,2.0,1,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,59.0,249.0,3.7,54.0,1
75,32,1,1.0,1,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,55.0,45.0,4.1,56.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,51,1,2.0,2,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,,33.0,4.5,,2
92,33,1,2.0,2,2.0,2.0,2.0,,,2.0,2.0,2.0,2.0,1.0,,60.0,4.0,,2
40,52,1,1.0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,75.0,55.0,4.0,21.0,1
88,38,1,1.0,2,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.2,118.0,16.0,2.8,,2


In [87]:
x1 = x_train.loc[~mask]
x1

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
103,51,1,1.0,2,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,4.6,215.0,269.0,3.9,51.0,2
102,27,1,1.0,2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.4,168.0,227.0,3.0,66.0,2
41,23,1,2.0,2,2.0,2.0,2.0,,,,,,,4.6,56.0,16.0,4.6,,1
127,50,1,2.0,2,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.8,155.0,75.0,2.4,32.0,2
141,54,1,1.0,2,1.0,1.0,2.0,,,1.0,2.0,1.0,2.0,3.9,120.0,28.0,3.5,43.0,2
131,31,1,1.0,2,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,8.0,,101.0,2.2,,2
100,48,1,1.0,2,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,4.8,123.0,157.0,2.7,31.0,2
150,46,1,2.0,2,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,,242.0,3.3,50.0,2
119,54,1,2.0,2,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,3.2,85.0,28.0,3.8,,2
129,57,1,1.0,2,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,4.6,82.0,55.0,3.3,30.0,2


In [94]:
model0.predict(x0)

ValueError: The features [2 3 4] have missing values in transform but have no missing values in fit.

In [92]:
model1.predict(x1)

array([2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1],
      dtype=int64)

# nan value feature upsampling

In [31]:
import numpy as np

In [32]:
new_train = train.copy()
for clm in train.columns[1:]:
    train2 = train.copy()
    train2[clm] = np.nan
    new_train = pd.concat([new_train, train2])

In [34]:
new_train.isna().sum()

Class                 0
AGE                 116
SEX                 116
STEROID             116
ANTIVIRALS          116
FATIGUE             135
MALAISE             135
ANOREXIA            135
LIVER BIG           287
LIVER FIRM          306
SPLEEN PALPABLE     192
SPIDERS             192
ASCITES             192
VARICES             192
BILIRUBIN           192
ALK PHOSPHATE       515
SGOT                173
ALBUMIN             306
PROTIME            1066
HISTOLOGY           116
dtype: int64

In [35]:
from sklearn.impute import KNNImputer

In [36]:
x_train_new = new_train.drop('Class', axis=1)
y_train_new = new_train.Class

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
pipe_model_new = make_pipeline(
    feature_transform,
    KNNImputer(weights='distance'),
    # LogisticRegression(penalty=None, class_weight='balanced')
    RandomForestClassifier()
)

In [51]:
pipe_model_new.fit(x_train_new, y_train_new)

In [52]:
pipe_model_new.score(x_train, y_train)

1.0

In [53]:
pipe_model_new.score(x_test, y_test)

0.9230769230769231