In [1]:
import numpy as np
import pandas as pd

# display all the columns when output the dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

np.set_printoptions(threshold=np.inf)

## Load the data

In [2]:
# read the file
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [4]:
data.describe()

Unnamed: 0,id,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0
mean,617.0,954500.4,38.202186,79.574089,30.054251,4.382591,49.602429,21.388016,3.290931,3832.496356,14.612146,3.577328
std,356.6581,1356403.0,0.788668,29.108638,16.452066,1.937357,10.5358,26.676453,1.589195,5436.733774,193.705735,88.858953
min,0.0,521399.0,35.4,30.0,8.0,1.0,23.0,3.5,0.1,0.0,0.0,0.0
25%,308.5,528800.0,37.8,53.0,18.0,2.0,43.0,6.6,2.0,2205.0,0.0,0.0
50%,617.0,529777.0,38.2,76.0,28.0,4.5,48.0,7.5,3.0,2209.0,0.0,0.0
75%,925.5,534145.0,38.6,100.0,36.0,6.0,57.0,9.1,4.3,3205.0,0.0,0.0
max,1234.0,5305129.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,3112.0,2209.0


## Select useful information for the model

In [5]:
# select all the rows and select from only the col1 to the last col
# id is usefuless in the model, so I do not need to include it
interested_data = data.iloc[:, 1:]
interested_data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


## Map the categroical data with numerical data (one hot-encoding)

In [6]:
# The function map_categorical data is to map the categorical data with the numerical values
    # colNames: the columns in the dataframe needs to be mapped
def map_categorical_data(colNames, dataframe):
    # create a dictionary to store column names as keys and values are numberical data
    dictionary={}
    
    # create a for loop to iterate the column name list
    for i in colNames:
        # get the unique values in a column
        values = dataframe[i].unique()
        
        # create another for loop to iterate the values by index
        for j in range(len(values)):
            
            # insert the value in the dictionary as key and assgin a value to it
            # j starts from 0, so add 1
            if str(values[j]) != 'nan':
                dictionary[values[j]] = int(j)
        
        # after convert the categrocial in the dataframe with map the dictionary
            # like {'v1': 1, 'v2': 2}
        dataframe[i] = dataframe[i].map(dictionary)
        
        # clear the dictionary for next mapping
        dictionary.clear()
        
    return dataframe

In [7]:
# The function categorical_data() is to find the 
def categorical_data(dataframe):
    # create an array to store the categorical data name
    arr=[]
    
    # create a for loop iterate the columns
    for i in dataframe.columns:
        # if the values in that column is string, append the name to the array
        if isinstance(dataframe[i].values[0], str):
            arr.append(i)
    return arr;

In [8]:
# call the categroical_data function
cols = categorical_data(interested_data)
print(cols)

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data', 'outcome']


In [9]:
# call the map_categorical_data function
mapped_data = map_categorical_data(cols, interested_data.copy())

## Dealing with the missing values

In [10]:
# the built in function isnull().sum() return a series of numpy and like a dictionary
nan_data = mapped_data.isnull().sum()

# just a dictionary
nan_data

surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [11]:
# create a variable to store the columns which has nan values
nan_data_cols = []

# iterate the keys in nan_data
for i in nan_data.keys():
    if nan_data[i] > 0:
        nan_data_cols.append(i)

In [12]:
# the fillNan function is used to fill out the nan values with mean value in the dataframe
def fillNan(nan_data_cols, dataframe):
    
    # iterate the cols with nan values
    for i in nan_data_cols:
        
        # calculate the mean values
        mean = int(np.mean(dataframe[i]))
        
        # fill the nan values with the mean
        dataframe[i] = dataframe[i].fillna(mean)
        
    return dataframe

In [13]:
# call the fillNan function
noNan_data = fillNan(nan_data_cols, mapped_data)

In [14]:
noNan_data.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
outcome                  0
dtype: int64

In [15]:
data_preprocessed = noNan_data.copy()
data_preprocessed

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,0,530001,38.1,132.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.5,0.0,0.0,57.0,8.5,0.0,3.4,0,2209,0,0,0,0
1,0,0,533836,37.5,88.0,12.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,33.0,64.0,0.0,2.0,0,2208,0,0,0,1
2,0,0,529812,38.3,120.0,28.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,0.0,2.0,3.5,1.0,1.0,37.0,6.4,0.0,3.4,0,5124,0,0,0,2
3,0,0,5262541,37.1,72.0,30.0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,53.0,7.0,1.0,3.9,0,2208,0,0,1,2
4,1,0,5299629,38.0,52.0,48.0,2.0,1.0,3.0,1.0,3.0,1.0,2.0,0.0,0.0,7.0,3.0,2.0,47.0,7.3,1.0,2.6,1,0,0,0,1,2
5,1,0,529642,38.1,56.0,32.0,2.0,1.0,4.0,1.0,0.0,1.0,2.0,1.0,1.0,5.0,0.0,3.0,49.0,8.0,1.0,2.8,1,0,0,0,1,2
6,0,0,534787,38.3,36.0,16.0,0.0,0.0,3.0,1.0,4.0,2.0,0.0,1.0,1.0,2.0,0.0,0.0,43.0,75.0,1.0,1.0,1,3111,0,0,1,1
7,1,0,529461,39.2,114.0,24.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,4.5,1.0,0.0,57.0,7.6,0.0,4.5,0,2207,0,0,1,0
8,1,0,528742,37.4,48.0,12.0,0.0,0.0,3.0,1.0,3.0,3.0,0.0,0.0,1.0,7.0,3.0,3.0,40.0,7.8,1.0,2.6,1,0,0,0,1,2
9,0,0,529640,38.3,129.0,48.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,2.0,4.4,1.0,0.0,57.0,4.9,1.0,2.9,0,3209,0,0,1,0


## Get the input and the target

In [16]:
input_data = noNan_data.iloc[:, :len(data_preprocessed.columns)-1]
target_data = data_preprocessed['outcome']

## Feature Selection

#### Determine the significance of each individual variable to the model
#### The lower the p value that a variable has, the greater impact it has to the model

In [17]:
# import f_regression to calculate the p_values
from sklearn.feature_selection import f_regression

In [18]:
# calculate and return two arrays
p_values = f_regression(input_data, target_data)
# the array in index 1 is the p values and round it to the third digit
p_values = p_values[1].round(3)

In [19]:
# create a dataframe to display the p_values
p_value_table = pd.DataFrame({'Features': input_data.columns, 'P_values': p_values})
p_value_table

Unnamed: 0,Features,P_values
0,surgery,0.0
1,age,0.0
2,hospital_number,0.0
3,rectal_temp,0.846
4,pulse,0.0
5,respiratory_rate,0.0
6,temp_of_extremities,0.0
7,peripheral_pulse,0.0
8,mucous_membrane,0.0
9,capillary_refill_time,0.0


## Balance the outcome

#### The outcomes are died, euthanized, and lived, and the number of their occurence in the dataset should be euqal.  

In [20]:
# import the collections to count the frequencies of each variable in the list
from collections import Counter

# The function is to calculate the frequencies of each variable in the list
def frequencies_outcome(data):
    # get the frequency
    counter = Counter(data)
    #print(counter)
    # find the smallest value in the dictionary and return its key
    smallest = min(counter, key=counter.get)
    return smallest, counter[smallest]

In [21]:
def balance_outcome(data):
    # call the frequencies_outcome to get the smallest frequncies of an output with its key
    smallest_frequency_key, smallest_frequency = frequencies_outcome(data)
    
    # create dict_total to represent the total number of freuqueny that match the smallest frequency
    dict_total = {'0': smallest_frequency,
                 '2': smallest_frequency}
    
    # create a dictionary to count the number of three different outputs
    dict_count = {}
    
    # store the indices of row needs to be removed from the list
    removed_indices = []
    
    # create a for loop to iterate the data by indcies
    for i in range(len(data)):
        # get a value from the output
        outcome = str(data[i])
        
        # if outcome is smallest_frequency_key, execute continue
        # else-if: the outcome in the keys of dict_count, enter inside and check the inner if-statement, 
        # else: assgin value 1 to the value  
                # if the counts in dict_count is smaller than the total counts, add 1 
                    # otherwise, it exceeds the total count of a particular output
        if outcome == str(smallest_frequency_key):
            continue
        elif outcome in dict_count.keys():
            if dict_count[outcome] < dict_total[outcome]:
                dict_count[outcome] += 1
            else:
                removed_indices.append(i)
        else:
            dict_count[outcome] = 1
    
    return removed_indices

In [22]:
# call the balance_outcome function to get the removed incdices
removed_indices = balance_outcome(target_data)

In [23]:
# drop excessive data and reset the index 
balanced_input_data = input_data.drop(removed_indices, axis=0).reset_index(drop=True)
balanced_target_data = target_data.drop(removed_indices, axis=0).reset_index(drop=True)

In [24]:
frequencies_outcome(balanced_target_data)

(0, 251)

## Shuffle

In [25]:
import random

In [26]:
shuffled_indices = np.arange(balanced_input_data.shape[0])
random.shuffle(shuffled_indices)

In [27]:
shuffled_input_data = balanced_input_data.iloc[shuffled_indices]
shuffled_target_data = balanced_target_data[shuffled_indices]

In [28]:
shuffled_input_data = shuffled_input_data.reset_index(drop=True)
shuffled_target_data = shuffled_target_data.reset_index(drop=True)
print(interested_data['outcome'].unique())

['died' 'euthanized' 'lived']


#### 1: died 
#### 2: euthanized 
#### 3: lived

## Scale the input data

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
scaler = StandardScaler()
scaler.fit(shuffled_input_data)
scaled_shuffled_input_data = scaler.transform(shuffled_input_data)

## Split the data into train and validation

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
train_input, validation_input, train_target, validation_target = train_test_split(scaled_shuffled_input_data, 
                                                                                  shuffled_target_data, 
                                                                                  test_size=0.1)

## Modeling

In [33]:
import tensorflow as tf
input_size = scaled_shuffled_input_data.shape[1]
output_size = 3
hidden_layer = 50

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
])

In [34]:
customized_optimizer = tf.keras.optimizers.Adam()

model.compile(optimizer=customized_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [35]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)
max_epochs = 100
batch_size = 100

model.fit(train_input,
         train_target,
         epochs=max_epochs,
         batch_size=batch_size,
         validation_data=(validation_input, validation_target),
         verbose=2)

Epoch 1/100
7/7 - 1s - loss: 1.1179 - accuracy: 0.3855 - val_loss: 1.0854 - val_accuracy: 0.4211 - 883ms/epoch - 126ms/step
Epoch 2/100
7/7 - 0s - loss: 1.0305 - accuracy: 0.5037 - val_loss: 1.0127 - val_accuracy: 0.5526 - 28ms/epoch - 4ms/step
Epoch 3/100
7/7 - 0s - loss: 0.9679 - accuracy: 0.5835 - val_loss: 0.9556 - val_accuracy: 0.6447 - 35ms/epoch - 5ms/step
Epoch 4/100
7/7 - 0s - loss: 0.9192 - accuracy: 0.6071 - val_loss: 0.9023 - val_accuracy: 0.6711 - 29ms/epoch - 4ms/step
Epoch 5/100
7/7 - 0s - loss: 0.8778 - accuracy: 0.6292 - val_loss: 0.8563 - val_accuracy: 0.6842 - 32ms/epoch - 5ms/step
Epoch 6/100
7/7 - 0s - loss: 0.8414 - accuracy: 0.6499 - val_loss: 0.8202 - val_accuracy: 0.6842 - 32ms/epoch - 5ms/step
Epoch 7/100
7/7 - 0s - loss: 0.8129 - accuracy: 0.6706 - val_loss: 0.7886 - val_accuracy: 0.6842 - 27ms/epoch - 4ms/step
Epoch 8/100
7/7 - 0s - loss: 0.7891 - accuracy: 0.6795 - val_loss: 0.7631 - val_accuracy: 0.7105 - 29ms/epoch - 4ms/step
Epoch 9/100
7/7 - 0s - loss: 

Epoch 69/100
7/7 - 0s - loss: 0.2374 - accuracy: 0.9439 - val_loss: 0.8303 - val_accuracy: 0.6579 - 35ms/epoch - 5ms/step
Epoch 70/100
7/7 - 0s - loss: 0.2321 - accuracy: 0.9439 - val_loss: 0.8353 - val_accuracy: 0.6447 - 32ms/epoch - 5ms/step
Epoch 71/100
7/7 - 0s - loss: 0.2274 - accuracy: 0.9453 - val_loss: 0.8388 - val_accuracy: 0.6842 - 30ms/epoch - 4ms/step
Epoch 72/100
7/7 - 0s - loss: 0.2205 - accuracy: 0.9572 - val_loss: 0.8516 - val_accuracy: 0.6447 - 35ms/epoch - 5ms/step
Epoch 73/100
7/7 - 0s - loss: 0.2151 - accuracy: 0.9601 - val_loss: 0.8563 - val_accuracy: 0.6579 - 35ms/epoch - 5ms/step
Epoch 74/100
7/7 - 0s - loss: 0.2097 - accuracy: 0.9616 - val_loss: 0.8685 - val_accuracy: 0.6316 - 35ms/epoch - 5ms/step
Epoch 75/100
7/7 - 0s - loss: 0.2046 - accuracy: 0.9631 - val_loss: 0.8726 - val_accuracy: 0.6711 - 36ms/epoch - 5ms/step
Epoch 76/100
7/7 - 0s - loss: 0.2004 - accuracy: 0.9586 - val_loss: 0.8826 - val_accuracy: 0.6447 - 35ms/epoch - 5ms/step
Epoch 77/100
7/7 - 0s - 

<keras.callbacks.History at 0x1f53f1ebd90>