In [77]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

In [78]:
# Load data

In [79]:
df = pd.read_csv('./cow.csv')
df

Unnamed: 0,surgery,age,hospital_number,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome Class,surgical_lesion,lesion
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,,decreased,distend_large,45.0,8.4,,,died,no,11300
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,,normal,normal,33.0,6.7,,,lived,no,0
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,,,74.0,7.4,,,died,no,4300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,,,distend_large,55.0,65.0,,,euthanized,no,3205
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,,absent,distend_small,44.0,,serosanguious,3.3,euthanized,yes,2208
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,,decreased,distend_large,60.0,6.8,,,died,yes,3205
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,,absent,distend_small,50.0,6.0,serosanguious,3.4,lived,yes,2208


In [80]:
def extract_lesion_column(lesion):
    site, type_l, subtype, code = 'None', 'None', 'None', 'None'
    if lesion != 0:
        types = re.findall('^([1-9]|11|00)([1-4])([0-2])([1-9]|10|0)',str(lesion))
        if types:
            site, type_l, subtype, code = types[0]
    return pd.Series({'lesion_site': site, 'lesion_type': type_l, 'lesion_subtype': subtype, 'lesion_code': code}, dtype='category')

df.loc[:, ['lesion_site', 'lesion_type', 'lesion_subtype', 'lesion_code']] = df.lesion.apply(extract_lesion_column).astype('category')
df.drop(columns=['lesion', 'hospital_number'], inplace=True)

In [81]:
categorical_columns = []
for col in df:
    if str(df[col].dtype) != 'category' and df[col].unique().size < 10:
        df[col].fillna('[NAN]', inplace=True)
        categorical_columns.append(col)
        df[col] = df[col].astype('category')

In [82]:
X = []
X_labels = []
Y = None
for col in df:
    if col == 'outcome Class':
        Y = df[col].cat.codes.values
    elif str(df[col].dtype) != 'category':
        X.append(df[col].fillna(df[col].mean()).values,)
        X_labels.append(col)
    else:
        X.append(df[col].cat.codes.values)
        X_labels.append(col)
X = np.array(X).T
x_label2idx = {l:i for i,l in enumerate(X_labels)}

In [83]:
import sklearn.feature_selection as fs

In [100]:
selector = fs.SelectPercentile(fs.chi2,percentile=10).fit(X, Y)
X_new = selector.transform(X)
keep_cols = selector.scores_.argsort()[::-1][:X_new.shape[1]]
keep_cols_label = [X_labels[i] for i in keep_cols]
keep_cols_label

['total_protein', 'pulse', 'lesion_subtype']

In [101]:
col2types = {l:df[l].dtype.name for l in X_labels}

In [102]:
from sklearn.model_selection import train_test_split
x_train_, x_test_, y_train_, y_test_ = train_test_split(X[:,keep_cols], Y, test_size=0.1)

In [103]:
np.unique(y_test_, return_counts=1),np.unique(y_train_, return_counts=1)

((array([0, 1, 2], dtype=int8), array([ 6,  2, 22])),
 (array([0, 1, 2], dtype=int8), array([ 71,  42, 156])))

In [104]:
num_classes = np.unique(y_test_).size
num_classes

3

In [105]:
x_train={l:v for l, v in zip(keep_cols_label, x_train_.T)}
x_test={l:v for l, v in zip(keep_cols_label, x_test_.T)}

y_train = tf.keras.utils.to_categorical(y_train_)
y_test = tf.keras.utils.to_categorical(y_test_)

In [156]:
def build_model():
    def _build_input_layer(feature_name):
        if col2types[feature_name] == 'category':
            inputs = tf.keras.layers.Input(shape=(1,), name=feature_name)
            n = np.unique(X[:, x_label2idx[feature_name]]).size
            x = tf.keras.layers.Embedding(n, 128, input_length=1)(inputs)
            x = tf.keras.layers.Flatten()(x)
            x = tf.keras.layers.Dense(128)(inputs)
            return inputs, x
        else:
            inputs = tf.keras.layers.Input(shape=(1,), name=feature_name)
            x = tf.keras.layers.Dense(128)(inputs)
            return inputs, x

    in_layers = []
    x_layers = []
    for feature in keep_cols_label:
        il, xl = _build_input_layer(feature)
        in_layers.append(il)
        x_layers.append(xl)

    x = tf.keras.layers.concatenate(x_layers)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.average(x_layers + [x])
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
    return tf.keras.Model(inputs=in_layers, outputs=x, name='model')

model=build_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 total_protein (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 pulse (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 lesion_subtype (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 dense_134 (Dense)              (None, 128)          256         ['total_protein[0][0]']          
                                                                                              

In [157]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy', tf.keras.metrics.Recall()]
)

In [158]:
fit_history=model.fit(x_train, y_train,  epochs=100,)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100


Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [159]:
model.evaluate(x_test, y_test)



[0.5376690030097961, 0.7666666507720947, 0.699999988079071]