In [88]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

In [41]:
df = pd.read_csv('/content/sample_data/hepatitis.csv')

In [42]:
print(df.head(5))
print(df.shape)

   age     sex steroid  antivirals fatigue malaise anorexia liver_big  \
0   30    male   False       False   False   False    False     False   
1   50  female   False       False    True   False    False     False   
2   78  female    True       False    True   False    False      True   
3   31  female     NaN        True   False   False    False      True   
4   34  female    True       False   False   False    False      True   

  liver_firm spleen_palpable spiders ascites varices  bilirubin  \
0      False           False   False   False   False        1.0   
1      False           False   False   False   False        0.9   
2      False           False   False   False   False        0.7   
3      False           False   False   False   False        0.7   
4      False           False   False   False   False        1.0   

   alk_phosphate   sgot  albumin  protime  histology class  
0           85.0   18.0      4.0      NaN      False  live  
1          135.0   42.0      3.5    

In [43]:
print(df.isnull().sum())

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64


In [44]:
# First imputation. Mode for cat data and Mean for num
null_cat_cols = [col for col in df.columns if df[col].dtype == 'object']
null_num_cols = [col for col in df.columns if df[col].dtype != 'object']

In [45]:
print(null_cat_cols)

['sex', 'steroid', 'fatigue', 'malaise', 'anorexia', 'liver_big', 'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices', 'class']


In [46]:
print(null_num_cols)

['age', 'antivirals', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime', 'histology']


In [47]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              155 non-null    int64  
 1   sex              155 non-null    object 
 2   steroid          154 non-null    object 
 3   antivirals       155 non-null    bool   
 4   fatigue          154 non-null    object 
 5   malaise          154 non-null    object 
 6   anorexia         154 non-null    object 
 7   liver_big        145 non-null    object 
 8   liver_firm       144 non-null    object 
 9   spleen_palpable  150 non-null    object 
 10  spiders          150 non-null    object 
 11  ascites          150 non-null    object 
 12  varices          150 non-null    object 
 13  bilirubin        149 non-null    float64
 14  alk_phosphate    126 non-null    float64
 15  sgot             151 non-null    float64
 16  albumin          139 non-null    float64
 17  protime         

In [50]:
for col in null_num_cols:
  df[col] = df[col].fillna(df[col].mean())

for col in null_cat_cols:
  df[col] = df[col].fillna(df[col].mode()[0])

  df[col] = df[col].fillna(df[col].mode()[0])


In [51]:
print(df.isnull().sum())

age                0
sex                0
steroid            0
antivirals         0
fatigue            0
malaise            0
anorexia           0
liver_big          0
liver_firm         0
spleen_palpable    0
spiders            0
ascites            0
varices            0
bilirubin          0
alk_phosphate      0
sgot               0
albumin            0
protime            0
histology          0
class              0
dtype: int64


In [52]:
print(df.head(10))

   age     sex  steroid  antivirals  fatigue  malaise  anorexia  liver_big  \
0   30    male    False       False    False    False     False      False   
1   50  female    False       False     True    False     False      False   
2   78  female     True       False     True    False     False       True   
3   31  female     True        True    False    False     False       True   
4   34  female     True       False    False    False     False       True   
5   34  female     True       False    False    False     False       True   
6   51  female    False       False     True    False      True       True   
7   23  female     True       False    False    False     False       True   
8   39  female     True       False     True    False     False       True   
9   30  female     True       False    False    False     False       True   

   liver_firm  spleen_palpable  spiders  ascites  varices  bilirubin  \
0       False            False    False    False    False   1.000000 

In [60]:
# Convert object cols to boolean
df['sex'] = df['sex'] == 'male'
boolean_cols = ['steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia',  'liver_big',
'liver_firm',  'spleen_palpable',  'spiders',  'ascites',  'varices', 'histology']

for col in boolean_cols:
  df[col] = df[col] == 'True'

df['class'] = df['class'] == 'live'

In [61]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              155 non-null    int64  
 1   sex              155 non-null    bool   
 2   steroid          155 non-null    bool   
 3   antivirals       155 non-null    bool   
 4   fatigue          155 non-null    bool   
 5   malaise          155 non-null    bool   
 6   anorexia         155 non-null    bool   
 7   liver_big        155 non-null    bool   
 8   liver_firm       155 non-null    bool   
 9   spleen_palpable  155 non-null    bool   
 10  spiders          155 non-null    bool   
 11  ascites          155 non-null    bool   
 12  varices          155 non-null    bool   
 13  bilirubin        155 non-null    float64
 14  alk_phosphate    155 non-null    float64
 15  sgot             155 non-null    float64
 16  albumin          155 non-null    float64
 17  protime         

In [86]:
num_cols = ['age', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime']

In [92]:
pipeline = Pipeline([
    ('standard', StandardScaler()),
    ('robust', RobustScaler())
])

df[num_cols] = pipeline.fit_transform(df[num_cols])

In [93]:
# split the data
X = df.drop('class', axis=1)
y = df['class']

In [94]:
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

In [99]:
sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train)

In [100]:
sgd_preds = sgd_clf.predict(X_test)

In [101]:
print(classification_report(y_test, sgd_preds))

              precision    recall  f1-score   support

       False       0.67      0.44      0.53         9
        True       0.80      0.91      0.85        22

    accuracy                           0.77        31
   macro avg       0.73      0.68      0.69        31
weighted avg       0.76      0.77      0.76        31

