In [1]:
import pandas as pd

In [13]:
# Column labels applied to the headerless data file, in positional order.
# 'Class' is the prediction target; the remaining 19 entries are features.
clmn_names = [
    'Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE',
    'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS',
    'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN',
    'PROTIME', 'HISTOLOGY',
]

In [14]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv(
    'data/hepatitis.data',
    header=None,       # no header row in the file: first line is data
    names=clmn_names,  # label the columns positionally
    na_values='?',     # parse '?' cells as NaN
)

In [23]:
# List the distinct values found in each column (NaN shows up as its own value).
df.apply(pd.unique)

Class                                                         [2, 1]
AGE                [30, 50, 78, 31, 34, 51, 23, 39, 32, 41, 47, 3...
SEX                                                           [2, 1]
STEROID                                              [1.0, 2.0, nan]
ANTIVIRALS                                                    [2, 1]
FATIGUE                                              [2.0, 1.0, nan]
MALAISE                                              [2.0, 1.0, nan]
ANOREXIA                                             [2.0, 1.0, nan]
LIVER BIG                                            [1.0, 2.0, nan]
LIVER FIRM                                           [2.0, 1.0, nan]
SPLEEN PALPABLE                                      [2.0, 1.0, nan]
SPIDERS                                              [2.0, 1.0, nan]
ASCITES                                              [2.0, 1.0, nan]
VARICES                                              [2.0, nan, 1.0]
BILIRUBIN          [1.0, 0.9, 0.7,

In [24]:
# Number of distinct values per column; nunique() ignores NaN by default.
df.nunique()

Class               2
AGE                49
SEX                 2
STEROID             2
ANTIVIRALS          2
FATIGUE             2
MALAISE             2
ANOREXIA            2
LIVER BIG           2
LIVER FIRM          2
SPLEEN PALPABLE     2
SPIDERS             2
ASCITES             2
VARICES             2
BILIRUBIN          34
ALK PHOSPHATE      83
SGOT               84
ALBUMIN            29
PROTIME            44
HISTOLOGY           2
dtype: int64

In [61]:
# Count of missing (NaN) entries in each column of the raw frame.
nan_val_clmn = df.isna().sum()
nan_val_clmn

Class               0
AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
ASCITES             5
VARICES             5
BILIRUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64

In [26]:
# Complete-case subset: dropna() with default arguments removes every row that
# contains at least one NaN. The raw frame `df` is left untouched.
df_drop = df.dropna()

In [27]:
# How many rows survive the complete-case filter.
df_drop.shape[0]

80

In [28]:
# Split the complete-case frame into the feature matrix (X) and the target (y).
target_col = 'Class'
X = df_drop.drop(columns=target_col)
y = df_drop[target_col]

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Fit a random forest on the complete-case data.
# random_state pins the bootstrap sampling and feature sub-sampling so that a
# Restart & Run All reproduces the same fitted model.
mdl = RandomForestClassifier(random_state=0).fit(X, y)

In [31]:
# Mean accuracy on the same X, y the forest was fitted on, i.e. training accuracy.
# NOTE(review): this is not a held-out estimate; use a train/test split or
# cross-validation before drawing conclusions about generalisation.
mdl.score(X, y)

1.0

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [84]:
# Standardise every feature column; the scaler is fitted on X and applied to X.
scaler = StandardScaler()
scl_x = scaler.fit_transform(X)

In [94]:
# Unpenalised, class-weighted logistic regression on the raw (unscaled) features.
lg = LogisticRegression(
    penalty=None,
    max_iter=1000,
    class_weight='balanced',
)
lg.fit(X, y)

In [95]:
# Same estimator settings as the unscaled model, fitted on the standardised features.
lg_s = LogisticRegression(
    penalty=None,
    max_iter=1000,
    class_weight='balanced',
)
lg_s.fit(scl_x, y)

In [96]:
# Mean accuracy of the unscaled logistic model on its own training data
# (not a held-out estimate).
lg.score(X, y)

1.0

In [97]:
# Mean accuracy of the scaled logistic model on its own training data
# (not a held-out estimate).
lg_s.score(scl_x, y)

1.0

In [98]:
# Distinct class labels present in the target.
y.unique()

array([2, 1], dtype=int64)

In [99]:
# Spot-check predictions on two randomly drawn training rows.
# random_state fixes which rows are drawn so the displayed output is reproducible.
lg.predict(X.sample(2, random_state=0))

array([2, 2], dtype=int64)

In [100]:
# Class distribution of the target in the complete-case data.
y.value_counts()

2    67
1    13
Name: Class, dtype: int64

In [101]:
# Minority-to-majority class ratio, derived from the labels themselves so the
# figure stays correct if the data or the NaN filtering changes.
class_counts = y.value_counts()
class_counts.min() / class_counts.max()

0.19402985074626866

In [102]:
# Coefficient matrix of the logistic model fitted on the unscaled features.
# NOTE(review): with penalty=None and 100% training accuracy the classes may be
# linearly separable, in which case the coefficient scale is not well determined
# -- confirm before reading magnitudes as feature importance.
a = lg.coef_

In [103]:
# Element-wise absolute value of the unscaled model's coefficients.
abs(a)

array([[  60.92156347, 3161.74894271, 4009.2902206 ,  417.33787469,
         899.40429066, 2501.35868654, 4352.90257517, 2567.75617259,
        1084.1966244 ,  184.42159961, 1631.77957375,   81.19868709,
         555.71472476, 1672.35366986,    9.65488504,   49.61846025,
         827.06006108,   38.49148229, 5880.57818645]])

In [104]:
# Shape check: a single row of coefficients, one entry per feature column.
a.shape

(1, 19)

In [105]:
# Rank table: |coefficient| of the unscaled logistic model, one row per feature.
# Row labels are taken from X.columns -- the frame the model was fitted on --
# instead of a positional slice of the raw header list, so the labels stay
# aligned with coef_ even if the target column moves or features are dropped.
lg_rank = pd.DataFrame(abs(a[0]), index=X.columns, columns=['lg_rank'])

In [106]:
# Rank table: |coefficient| of the logistic model fitted on standardised features.
# scl_x was produced from X, so X.columns gives the matching feature order;
# using it avoids relying on a positional slice of the raw header list.
lgs_rank = pd.DataFrame(abs(lg_s.coef_[0]), index=X.columns, columns=['lgs_rank'])
lgs_rank

Unnamed: 0,lgs_rank
AGE,9.326354
SEX,23.13216
STEROID,17.202013
ANTIVIRALS,1.88466
FATIGUE,5.481661
MALAISE,7.223471
ANOREXIA,27.629428
LIVER BIG,13.481189
LIVER FIRM,0.120297
SPLEEN PALPABLE,8.881991


In [107]:
# Put each feature's missing-value count next to its coefficient magnitude.
# pandas aligns nan_val_clmn to lg_rank by index label.
lg_rank = lg_rank.assign(nan_count=nan_val_clmn)
lg_rank

Unnamed: 0,lg_rank,nan_count
AGE,60.921563,0
SEX,3161.748943,0
STEROID,4009.290221,1
ANTIVIRALS,417.337875,0
FATIGUE,899.404291,1
MALAISE,2501.358687,1
ANOREXIA,4352.902575,1
LIVER BIG,2567.756173,10
LIVER FIRM,1084.196624,11
SPLEEN PALPABLE,184.4216,5


In [108]:
# Display features ordered by unscaled |coefficient|, largest first.
# sort_values returns a new frame; lg_rank itself keeps its order.
lg_rank.sort_values(by='lg_rank', ascending=False)

Unnamed: 0,lg_rank,nan_count
HISTOLOGY,5880.578186,0
ANOREXIA,4352.902575,1
STEROID,4009.290221,1
SEX,3161.748943,0
LIVER BIG,2567.756173,10
MALAISE,2501.358687,1
BILIRUBIN,1672.35367,6
SPIDERS,1631.779574,5
LIVER FIRM,1084.196624,11
FATIGUE,899.404291,1


In [109]:
# |correlation| of every column with the target, computed on the raw frame
# (rows with missing values included).
class_corr_full = df.corr()['Class'].abs()
res = class_corr_full.to_frame(name='non_droped')

In [110]:
# Same |correlation| with the target, this time on the complete-case subset,
# stored beside the raw-frame figures for comparison.
res = res.assign(droped=df_drop.corr()['Class'].abs())
res

Unnamed: 0,non_droped,droped
Class,1.0,1.0
AGE,0.219647,0.212769
SEX,0.173051,0.175876
STEROID,0.1347,0.12383
ANTIVIRALS,0.130196,0.108776
FATIGUE,0.309269,0.181151
MALAISE,0.337859,0.275595
ANOREXIA,0.132172,0.185042
LIVER BIG,0.07763,0.19403
LIVER FIRM,0.063155,0.055978


In [111]:
# Combine the correlation table and the coefficient table column-wise, then
# remove the target's own row (its self-correlation carries no information).
combined = pd.concat([res, lg_rank], axis='columns')
fin = combined.drop(index='Class')
fin

Unnamed: 0,non_droped,droped,lg_rank,nan_count
AGE,0.219647,0.212769,60.921563,0.0
SEX,0.173051,0.175876,3161.748943,0.0
STEROID,0.1347,0.12383,4009.290221,1.0
ANTIVIRALS,0.130196,0.108776,417.337875,0.0
FATIGUE,0.309269,0.181151,899.404291,1.0
MALAISE,0.337859,0.275595,2501.358687,1.0
ANOREXIA,0.132172,0.185042,4352.902575,1.0
LIVER BIG,0.07763,0.19403,2567.756173,10.0
LIVER FIRM,0.063155,0.055978,1084.196624,11.0
SPLEEN PALPABLE,0.238734,0.135643,184.4216,5.0


In [112]:
# Display features ordered by number of missing values, most first.
# Returns a sorted copy; fin itself is not reordered here.
fin.sort_values('nan_count', ascending=False)

Unnamed: 0,non_droped,droped,lg_rank,nan_count
PROTIME,0.40916,0.395386,38.491482,67.0
ALK PHOSPHATE,0.161204,0.18936,9.654885,29.0
ALBUMIN,0.503272,0.477404,827.060061,16.0
LIVER FIRM,0.063155,0.055978,1084.196624,11.0
LIVER BIG,0.07763,0.19403,2567.756173,10.0
BILIRUBIN,0.463752,0.351557,1672.35367,6.0
SPLEEN PALPABLE,0.238734,0.135643,184.4216,5.0
SPIDERS,0.398308,0.287839,1631.779574,5.0
ASCITES,0.477882,0.479211,81.198687,5.0
VARICES,0.368846,0.345785,555.714725,5.0


In [113]:
# Element-wise absolute value of the scaled model's coefficients.
abs(lg_s.coef_)

array([[ 9.3263544 , 23.13216009, 17.20201284,  1.88466035,  5.48166122,
         7.22347112, 27.6294279 , 13.48118852,  0.12029684,  8.8819914 ,
         3.34961467,  1.94109006,  5.29441419, 14.3828287 ,  7.10503654,
        25.23084368,  4.67532783,  9.11951568, 16.65819391]])

In [114]:
# Add the scaled-model coefficient magnitudes to the summary table.
# Assign the Series explicitly instead of the whole one-column DataFrame, so the
# assignment does not depend on pandas unwrapping a frame into a single column.
fin['lgs_rank'] = lgs_rank['lgs_rank']

In [116]:
# Order the summary table by scaled |coefficient|, largest first.
# Rebinding the name replaces inplace=True: same resulting `fin`, but the call
# can be chained and no existing frame object is mutated behind the reader's back.
fin = fin.sort_values('lgs_rank', ascending=False)

In [117]:
# Show the summary table in its current (lgs_rank-descending) order.
fin

Unnamed: 0,non_droped,droped,lg_rank,nan_count,lgs_rank
ANOREXIA,0.132172,0.185042,4352.902575,1.0,27.629428
SGOT,0.077678,0.078731,49.61846,4.0,25.230844
SEX,0.173051,0.175876,3161.748943,0.0,23.13216
STEROID,0.1347,0.12383,4009.290221,1.0,17.202013
HISTOLOGY,0.337856,0.456856,5880.578186,0.0,16.658194
BILIRUBIN,0.463752,0.351557,1672.35367,6.0,14.382829
LIVER BIG,0.07763,0.19403,2567.756173,10.0,13.481189
AGE,0.219647,0.212769,60.921563,0.0,9.326354
PROTIME,0.40916,0.395386,38.491482,67.0,9.119516
SPLEEN PALPABLE,0.238734,0.135643,184.4216,5.0,8.881991


# Modeling

In [118]:
# Labels of the first eight rows of fin, i.e. the eight features with the
# largest scaled |coefficient| given the preceding descending sort.
fin.index[:8]

Index(['ANOREXIA', 'SGOT', 'SEX', 'STEROID', 'HISTOLOGY', 'BILIRUBIN',
       'LIVER BIG', 'AGE'],
      dtype='object')

In [119]:
# Pull the eight top-ranked feature columns out of the raw (NaN-containing) frame.
top_features = fin.index[:8]
df[top_features]

Unnamed: 0,ANOREXIA,SGOT,SEX,STEROID,HISTOLOGY,BILIRUBIN,LIVER BIG,AGE
0,2.0,18.0,2,1.0,1,1.0,1.0,30
1,2.0,42.0,1,1.0,1,0.9,1.0,50
2,2.0,32.0,1,2.0,1,0.7,2.0,78
3,2.0,52.0,1,,1,0.7,2.0,31
4,2.0,200.0,1,2.0,1,1.0,2.0,34
...,...,...,...,...,...,...,...,...
150,1.0,242.0,1,2.0,2,7.6,2.0,46
151,2.0,142.0,1,2.0,2,0.9,2.0,44
152,2.0,20.0,1,1.0,2,0.8,1.0,61
153,2.0,19.0,2,1.0,2,1.5,2.0,53
