In [167]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

In [168]:
# Load data

In [169]:
df = pd.read_csv('./cow.csv')
df

Unnamed: 0,surgery,age,hospital_number,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome Class,surgical_lesion,lesion
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,,decreased,distend_large,45.0,8.4,,,died,no,11300
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,,normal,normal,33.0,6.7,,,lived,no,0
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,,,74.0,7.4,,,died,no,4300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,,,distend_large,55.0,65.0,,,euthanized,no,3205
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,,absent,distend_small,44.0,,serosanguious,3.3,euthanized,yes,2208
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,,decreased,distend_large,60.0,6.8,,,died,yes,3205
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,,absent,distend_small,50.0,6.0,serosanguious,3.4,lived,yes,2208


In [170]:
def extract_lesion_column(lesion):
    site, type_l, subtype, code = 'None', 'None', 'None', 'None'
    if lesion != 0:
        types = re.findall('^([1-9]|11|00)([1-4])([0-2])([1-9]|10|0)',str(lesion))
        if types:
            site, type_l, subtype, code = types[0]
    return pd.Series({'lesion_site': site, 'lesion_type': type_l, 'lesion_subtype': subtype, 'lesion_code': code}, dtype='category')

df.loc[:, ['lesion_site', 'lesion_type', 'lesion_subtype', 'lesion_code']] = df.lesion.apply(extract_lesion_column).astype('category')
df.drop(columns=['lesion', 'hospital_number'], inplace=True)

In [171]:
categorical_columns = []
for col in df:
    if str(df[col].dtype) != 'category' and df[col].unique().size < 10:
        df[col].fillna('[NAN]', inplace=True)
        categorical_columns.append(col)
        df[col] = df[col].astype('category')

In [172]:
X = []
X_labels = []
Y = None
for col in df:
    if col == 'outcome Class':
        Y = df[col].cat.codes.values
    elif str(df[col].dtype) != 'category':
        X.append(df[col].fillna(df[col].mean()).values,)
        X_labels.append(col)
    else:
        X.append(df[col].cat.codes.values)
        X_labels.append(col)
X = np.array(X).T
x_label2idx = {l:i for i,l in enumerate(X_labels)}

In [173]:
import sklearn.feature_selection as fs

In [174]:
selector = fs.SelectPercentile(fs.chi2,percentile=50).fit(X, Y)
X_new = selector.transform(X)
keep_cols = selector.scores_.argsort()[::-1][:X_new.shape[1]]
keep_cols_label = [X_labels[i] for i in keep_cols]
keep_cols_label

['total_protein',
 'pulse',
 'lesion_subtype',
 'packed_cell_volume',
 'lesion_site',
 'lesion_code',
 'abdomen',
 'abdomo_appearance',
 'respiratory_rate',
 'rectal_exam_feces',
 'lesion_type',
 'temp_of_extremities',
 'surgical_lesion']

In [175]:
col2types = {l:df[l].dtype.name for l in X_labels}

In [176]:
from sklearn.model_selection import train_test_split
x_train_, x_test_, y_train_, y_test_ = train_test_split(X[:,keep_cols], Y, test_size=0.2)

In [177]:
np.unique(y_test_, return_counts=1),np.unique(y_train_, return_counts=1)

((array([0, 1, 2], dtype=int8), array([18,  9, 33])),
 (array([0, 1, 2], dtype=int8), array([ 59,  35, 145])))

In [178]:
num_classes = np.unique(y_test_).size
num_classes

3

In [179]:
x_train_.T.shape

(13, 239)

In [180]:
x_train = x_train_ #{l:v for l, v in zip(keep_cols_label, x_train_.T)}
x_test = x_test_ #{l:v for l, v in zip(keep_cols_label, x_test_.T)}

y_train = y_train_ # tf.keras.utils.to_categorical(y_train_)
y_test = y_test_ # tf.keras.utils.to_categorical(y_test_)

# Buid model

In [181]:
from sklearn import svm
from sklearn.metrics import classification_report

In [182]:
outcome_classes = df['outcome Class'].cat.categories.tolist()

In [183]:
svm_modes = ['LinearSVC', 'SVC',]

In [196]:
def train_model(model):
    print(model)
    model.fit(x_train, y_train)
    test_hypo = model.predict(x_test)
    return classification_report(y_test, test_hypo, target_names=outcome_classes,output_dict=True)

In [240]:
results = {}
for loss in ['hinge', 'squared_hinge']:
    for multi_class in ['ovr', 'crammer_singer']:
        for C in range(1, 3):
            out = train_model(svm.LinearSVC(C=2, loss=loss))
            data = {}
            for k, v in out.items():
                if not isinstance(v, dict): continue
                if k not in ['lived', 'euthanized', 'died']: continue
                for k1, v1 in v.items():
                    if k1 in ['precision', 'recall']:
                        data[k1+'_'+k] = v1
            results[f'loss={loss}, multi_class={multi_class}, C={C}'] = data

LinearSVC(C=2, loss='hinge')
LinearSVC(C=2, loss='hinge')
LinearSVC(C=2, loss='hinge')
LinearSVC(C=2, loss='hinge')
LinearSVC(C=2)
LinearSVC(C=2)
LinearSVC(C=2)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LinearSVC(C=2)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [241]:
pd.DataFrame(results).T

Unnamed: 0,precision_died,recall_died,precision_euthanized,recall_euthanized,precision_lived,recall_lived
"loss=hinge, multi_class=ovr, C=1",0.583333,0.777778,1.0,0.111111,0.8,0.848485
"loss=hinge, multi_class=ovr, C=2",0.75,0.5,0.555556,0.555556,0.769231,0.909091
"loss=hinge, multi_class=crammer_singer, C=1",0.0,0.0,1.0,0.333333,0.578947,1.0
"loss=hinge, multi_class=crammer_singer, C=2",0.4,1.0,1.0,0.111111,0.928571,0.393939
"loss=squared_hinge, multi_class=ovr, C=1",0.45,0.5,0.272727,0.666667,0.888889,0.484848
"loss=squared_hinge, multi_class=ovr, C=2",1.0,0.055556,0.25,0.888889,0.814815,0.666667
"loss=squared_hinge, multi_class=crammer_singer, C=1",1.0,0.055556,0.0,0.0,0.559322,1.0
"loss=squared_hinge, multi_class=crammer_singer, C=2",0.8,0.222222,0.0,0.0,0.6,1.0
