In [1]:
import numpy as np
import pandas as pd
import numbers
from collections import Counter

# display all the columns when output the dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

np.set_printoptions(threshold=np.inf)

## Load the data

In [2]:
# read the file
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [4]:
data.describe(include='all')

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
count,1235.0,1235,1235,1235.0,1235.0,1235.0,1235.0,1196,1175,1214,1229,1191,1215,1212,1155,1214,1235.0,1045,1022,1235.0,1235.0,1187,1235.0,1235,1235.0,1235.0,1235.0,1235,1235
unique,,2,2,,,,,4,4,6,3,6,5,4,3,4,,5,5,,,3,,2,,,,2,3
top,,yes,adult,,,,,cool,reduced,pale_pink,less_3_sec,depressed,hypomotile,moderate,slight,more_1_liter,,absent,distend_small,,,serosanguious,,yes,,,,yes,lived
freq,,887,1160,,,,,700,724,284,834,429,664,543,758,604,,493,482,,,570,,929,,,,668,574
mean,617.0,,,954500.4,38.202186,79.574089,30.054251,,,,,,,,,,4.382591,,,49.602429,21.388016,,3.290931,,3832.496356,14.612146,3.577328,,
std,356.6581,,,1356403.0,0.788668,29.108638,16.452066,,,,,,,,,,1.937357,,,10.5358,26.676453,,1.589195,,5436.733774,193.705735,88.858953,,
min,0.0,,,521399.0,35.4,30.0,8.0,,,,,,,,,,1.0,,,23.0,3.5,,0.1,,0.0,0.0,0.0,,
25%,308.5,,,528800.0,37.8,53.0,18.0,,,,,,,,,,2.0,,,43.0,6.6,,2.0,,2205.0,0.0,0.0,,
50%,617.0,,,529777.0,38.2,76.0,28.0,,,,,,,,,,4.5,,,48.0,7.5,,3.0,,2209.0,0.0,0.0,,
75%,925.5,,,534145.0,38.6,100.0,36.0,,,,,,,,,,6.0,,,57.0,9.1,,4.3,,3205.0,0.0,0.0,,


## Select useful information for the model

In [5]:
# select all the rows and select from only the col1 to the last col
# id is usefuless in the model, so I do not need to include it
interested_data = data.iloc[:, 1:]

# concat the origin dataset into the train dataset as well
interested_data = pd.concat([interested_data, pd.read_csv('origin.csv')], axis=0)
interested_data = interested_data.reset_index(drop=True)

interested_data = interested_data.drop(['hospital_number'], axis=1)

## Dealing with the missing values

In [6]:
table = interested_data.describe(include='all')
table

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
count,1534,1534,1474.0,1510.0,1476.0,1439,1405,1466,1496,1435,1470,1455,1350,1407,1288.0,1242,1203,1505.0,1501.0,1321,1336.0,1534,1534.0,1534.0,1534.0,1534,1534
unique,2,2,,,,4,4,6,3,6,5,4,3,4,,5,5,,,3,,2,,,,2,3
top,yes,adult,,,,cool,reduced,pale_pink,less_3_sec,depressed,hypomotile,moderate,slight,more_1_liter,,absent,distend_small,,,serosanguious,,yes,,,,no,lived
freq,1067,1435,,,,808,827,341,1021,488,791,608,859,643,,572,525,,,616,,1119,,,,767,752
mean,,,38.196744,78.194702,30.120596,,,,,,,,,,4.395963,,,49.011296,21.899534,,3.271931,,3798.817471,29.409387,4.320078,,
std,,,0.779883,29.162518,16.650801,,,,,,,,,,1.939516,,,10.590426,26.813063,,1.621415,,5429.907956,336.748319,97.624805,,
min,,,35.4,30.0,8.0,,,,,,,,,,1.0,,,23.0,3.3,,0.1,,0.0,0.0,0.0,,
25%,,,37.8,52.0,18.0,,,,,,,,,,2.0,,,42.0,6.6,,2.0,,2124.0,0.0,0.0,,
50%,,,38.2,72.0,28.0,,,,,,,,,,4.5,,,48.0,7.5,,3.0,,2209.0,0.0,0.0,,
75%,,,38.6,96.0,36.0,,,,,,,,,,6.2,,,55.0,13.0,,4.3,,3205.0,0.0,0.0,,


In [7]:
# the built in function isnull().sum() return a series of numpy and like a dictionary
nan_data = interested_data.isnull().sum()

# just a dictionary
nan_data

surgery                    0
age                        0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       95
peripheral_pulse         129
mucous_membrane           68
capillary_refill_time     38
pain                      99
peristalsis               64
abdominal_distention      79
nasogastric_tube         184
nasogastric_reflux       127
nasogastric_reflux_ph    246
rectal_exam_feces        292
abdomen                  331
packed_cell_volume        29
total_protein             33
abdomo_appearance        213
abdomo_protein           198
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [8]:
def getNan_cols(nan_data): 
    # create a variable to store the columns which has nan values
    numerical_nan_data_cols=[]
    categorical_nan_data_cols=[]
    
    # iterate the keys in nan_data
    for i in nan_data.keys():
        if nan_data[i] > 0:
            if isinstance(interested_data[i].values[0], numbers.Number):
                numerical_nan_data_cols.append(i)
            else:
                categorical_nan_data_cols.append(i)
                
    if (len(numerical_nan_data_cols) > 0) & (len(categorical_nan_data_cols) > 0):
        return numerical_nan_data_cols, categorical_nan_data_cols
    elif len(numerical_nan_data_cols) > 0:
        return numerical_nan_data_cols
    else:
        return categorical_nan_data_cols

In [9]:
#numerical_nan_data_cols, 
numerical_nan_data_cols, categorical_nan_data_cols = getNan_cols(nan_data)

In [10]:
# the fillNan function is used to fill out the nan values with mean value in the dataframe
def categorical_fillNan(nan_data_cols, dataframe, table):
    
    # iterate the cols with nan values
    for i in nan_data_cols:
        
        # get the top value
        top = table[i].top
        
        # fill the nan values with the mean
        dataframe[i] = dataframe[i].fillna(top)
        
    return dataframe

In [11]:
def numberical_fillNan(nan_data_cols, dataframe):
     # iterate the cols with nan values
    for i in nan_data_cols:
        
        # get the top value
        mean = int(table[i].mean())
        
        # fill the nan values with the mean
        dataframe[i] = dataframe[i].fillna(mean)
        
    return dataframe

In [12]:
# call the categorical_fillNan function
noNan_data = categorical_fillNan(categorical_nan_data_cols, interested_data, table)

# call the numerical_fillNan
noNan_data = numberical_fillNan(numerical_nan_data_cols, noNan_data)

In [13]:
# Preprocessing finished
data_preprocessed = noNan_data.copy()

## Get the input and the target

In [14]:
input_data = data_preprocessed.iloc[:, :len(data_preprocessed.columns)-1]
target_data = data_preprocessed['outcome']

## Get dummies

In [15]:
dummies_input_data = pd.get_dummies(input_data, dtype=int, drop_first=True)
dummies_target_data = target_data.map({'died': 0, 'euthanized': 1, 'lived': 2})

## Feature Selection

#### Determine the significance of each individual variable to the model
#### The lower the p value that a variable has, the greater impact it has to the model

In [16]:
# import f_regression to calculate the p_values
from sklearn.feature_selection import f_regression

In [17]:
# calculate and return two arrays
p_values = f_regression(dummies_input_data, dummies_target_data)
# the array in index 1 is the p values and round it to the third digit
p_values = p_values[1].round(3)

In [18]:
# create a dataframe to display the p_values
p_value_table = pd.DataFrame({'Features': dummies_input_data.columns, 'P_values': p_values})
p_value_table

Unnamed: 0,Features,P_values
0,rectal_temp,0.214
1,pulse,0.0
2,respiratory_rate,0.014
3,nasogastric_reflux_ph,0.0
4,packed_cell_volume,0.0
5,total_protein,0.0
6,abdomo_protein,0.001
7,lesion_1,0.48
8,lesion_2,0.019
9,lesion_3,0.332


## Balance the outcome

#### The outcomes are died, euthanized, and lived, and the number of their occurence in the dataset should be euqal.  

In [19]:
# The function is to calculate the frequencies of each variable in the list
def frequencies_outcome(data):
    # get the frequency
    counter = Counter(data)
    print(counter)
    # find the smallest value in the dictionary and return its key
    smallest = min(counter, key=counter.get)
    return smallest, counter[smallest]

In [20]:
def balance_outcome(data):
    # call the frequencies_outcome to get the smallest frequncies of an output with its key
    smallest_frequency_key, smallest_frequency = frequencies_outcome(data)
    
    # create dict_total to represent the total number of freuqueny that match the smallest frequency
    dict_total = {'0': smallest_frequency,
                 '2': smallest_frequency}
    
    # create a dictionary to count the number of three different outputs
    dict_count = {}
    
    # store the indices of row needs to be removed from the list
    removed_indices = []
    
    # create a for loop to iterate the data by indcies
    for i in range(len(data)):
        # get a value from the output
        outcome = str(data[i])
        
        # if outcome is smallest_frequency_key, execute continue
        # else-if: the outcome in the keys of dict_count, enter inside and check the inner if-statement, 
        # else: assgin value 1 to the value  
                # if the counts in dict_count is smaller than the total counts, add 1 
                    # otherwise, it exceeds the total count of a particular output
        if outcome == str(smallest_frequency_key):
            continue
        elif outcome in dict_count.keys():
            if dict_count[outcome] < dict_total[outcome]:
                dict_count[outcome] += 1
            else:
                removed_indices.append(i)
        else:
            dict_count[outcome] = 1
    
    return removed_indices

In [21]:
# call the balance_outcome function to get the removed incdices
removed_indices = balance_outcome(dummies_target_data)

Counter({2: 752, 0: 487, 1: 295})


In [22]:
# drop excessive data and reset the index 
balanced_input_data = dummies_input_data.drop(removed_indices, axis=0).reset_index(drop=True)
balanced_target_data = dummies_target_data.drop(removed_indices, axis=0).reset_index(drop=True)

#balanced_input_data = dummies_input_data.copy()
#balanced_target_data = dummies_target_data.copy()

In [23]:
counter  = Counter(balanced_target_data)
print(counter)

Counter({0: 295, 1: 295, 2: 295})


## Shuffle

In [24]:
import random

In [25]:
shuffled_indices = np.arange(balanced_input_data.shape[0])
random.shuffle(shuffled_indices)

In [26]:
shuffled_input_data = balanced_input_data.iloc[shuffled_indices].reset_index(drop=True)
shuffled_target_data = balanced_target_data[shuffled_indices].reset_index(drop=True)

#shuffled_input_data = balanced_input_data
#shuffled_target_data = balanced_target_data

In [27]:
print(interested_data['outcome'].unique())

['died' 'euthanized' 'lived']


#### 0: died 
#### 1: euthanized 
#### 2: lived

## Scale the input data

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
scaler = StandardScaler()
scaler.fit(shuffled_input_data)
scaled_input_data = scaler.transform(shuffled_input_data)

## Split

In [30]:
# split the data into three parts
train_counts = int(0.8*scaled_input_data.shape[0])
validation_counts = int(0.1*scaled_input_data.shape[0])

train_input = scaled_input_data[:train_counts]
train_target = shuffled_target_data[:train_counts]

validation_input = scaled_input_data[train_counts: train_counts+validation_counts]
validation_target = shuffled_target_data[train_counts: train_counts+validation_counts]

test_input = scaled_input_data[train_counts+validation_counts :]
test_target = shuffled_target_data[train_counts+validation_counts :]

#test_input = scaled_input_data[train_counts:]
#test_target = shuffled_target_data[train_counts:]

## Deep Learning Modeling

In [31]:
import tensorflow as tf
input_size = train_input.shape[1]
output_size = 3
hidden_layer = 100

# create the model with two hidden layers and eahc hidden layers has 50 units
    # use relu for hidden layers and softmax for the output layer
model = tf.keras.Sequential([
    #tf.keras.layers.Input(shape=(input_size,)),
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(hidden_layer, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax')
])

In [32]:
customized_optimizer = tf.keras.optimizers.Adam()

model.compile(optimizer=customized_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [33]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=1)
max_epochs = 100
batch_size = 100

model.fit(train_input,
         train_target,
         epochs=max_epochs,
         batch_size=batch_size,
         validation_data=(validation_input, validation_target),
         verbose=2)

Epoch 1/100
8/8 - 1s - loss: 1.0839 - accuracy: 0.4138 - val_loss: 0.9742 - val_accuracy: 0.5341 - 964ms/epoch - 121ms/step
Epoch 2/100
8/8 - 0s - loss: 0.9052 - accuracy: 0.5989 - val_loss: 0.9052 - val_accuracy: 0.6023 - 55ms/epoch - 7ms/step
Epoch 3/100
8/8 - 0s - loss: 0.8250 - accuracy: 0.6299 - val_loss: 0.8908 - val_accuracy: 0.6364 - 51ms/epoch - 6ms/step
Epoch 4/100
8/8 - 0s - loss: 0.7776 - accuracy: 0.6483 - val_loss: 0.8603 - val_accuracy: 0.6364 - 50ms/epoch - 6ms/step
Epoch 5/100
8/8 - 0s - loss: 0.7363 - accuracy: 0.6737 - val_loss: 0.8202 - val_accuracy: 0.6591 - 46ms/epoch - 6ms/step
Epoch 6/100
8/8 - 0s - loss: 0.7045 - accuracy: 0.7048 - val_loss: 0.7905 - val_accuracy: 0.6818 - 46ms/epoch - 6ms/step
Epoch 7/100
8/8 - 0s - loss: 0.6753 - accuracy: 0.7119 - val_loss: 0.7736 - val_accuracy: 0.6818 - 50ms/epoch - 6ms/step
Epoch 8/100
8/8 - 0s - loss: 0.6496 - accuracy: 0.7288 - val_loss: 0.7661 - val_accuracy: 0.6818 - 45ms/epoch - 6ms/step
Epoch 9/100
8/8 - 0s - loss: 

Epoch 68/100
8/8 - 0s - loss: 0.0712 - accuracy: 0.9958 - val_loss: 1.3868 - val_accuracy: 0.6705 - 94ms/epoch - 12ms/step
Epoch 69/100
8/8 - 0s - loss: 0.0697 - accuracy: 0.9944 - val_loss: 1.4098 - val_accuracy: 0.6477 - 100ms/epoch - 12ms/step
Epoch 70/100
8/8 - 0s - loss: 0.0676 - accuracy: 0.9972 - val_loss: 1.4414 - val_accuracy: 0.6818 - 105ms/epoch - 13ms/step
Epoch 71/100
8/8 - 0s - loss: 0.0623 - accuracy: 0.9986 - val_loss: 1.4519 - val_accuracy: 0.6818 - 100ms/epoch - 13ms/step
Epoch 72/100
8/8 - 0s - loss: 0.0601 - accuracy: 0.9986 - val_loss: 1.4385 - val_accuracy: 0.6705 - 97ms/epoch - 12ms/step
Epoch 73/100
8/8 - 0s - loss: 0.0562 - accuracy: 1.0000 - val_loss: 1.4488 - val_accuracy: 0.6705 - 97ms/epoch - 12ms/step
Epoch 74/100
8/8 - 0s - loss: 0.0523 - accuracy: 1.0000 - val_loss: 1.4647 - val_accuracy: 0.6591 - 105ms/epoch - 13ms/step
Epoch 75/100
8/8 - 0s - loss: 0.0505 - accuracy: 1.0000 - val_loss: 1.4739 - val_accuracy: 0.6818 - 100ms/epoch - 13ms/step
Epoch 76/10

<keras.callbacks.History at 0x1f2b030f3d0>

## NN-Testing

In [34]:
test_loss, test_accuracy = model.evaluate(test_input, test_target)
print("The test loss: {0:.2f}, and the test accuracy: {1:.2f}".format(test_loss, test_accuracy))

The test loss: 1.75, and the test accuracy: 0.58


#### The average NN-testig is 62%
#### The model created by netural network is overfitting,and one of the possible reason is the dataset is small

## Scikit-Learn LogisticRegression

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
classifier = LogisticRegression()
classifier.fit(scaled_input_data, shuffled_target_data)

In [37]:
test_output = classifier.predict(test_input)

### Confusion matrix

In [38]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_target,test_output)

In [39]:
cm_table = pd.DataFrame(cm)
cm_table.columns = ['Predict died', 'Predict euthanized', 'Predict lived']
cm_table = cm_table.rename({0: 'predict died', 1: 'Predict euthanized', 2: 'Predict lived'})
cm_table

Unnamed: 0,Predict died,Predict euthanized,Predict lived
predict died,26,3,4
Predict euthanized,2,17,5
Predict lived,3,7,22


### Sklearn test accuracy

In [40]:
from sklearn import metrics
test_accuracy = metrics.accuracy_score(test_output, test_target)
print('The test accuracy is {0:.2f}%'.format(test_accuracy*100))

The test accuracy is 73.03%


#### The average sklearn test accuracy is 70%

## Loading Test dataset

In [41]:
test_data = pd.read_csv('test.csv')

In [42]:
horse_id = test_data['id']
test_input_predict = test_data.iloc[:, 1:]
test_input_predict = test_input_predict.drop('hospital_number', axis=1)

In [45]:
test_nan_cols = getNan_cols(test_input_predict.isnull().sum())
test_describe = test_input_predict.describe(include='all')

In [47]:
test_input_predict = categorical_fillNan(test_nan_cols, test_input_predict, test_describe)

In [48]:
test_dummies_input = pd.get_dummies(test_input_predict, dtype=int, drop_first=True)

In [49]:
def insertMissing_cols(dummies_input_data, test_input_predict):
    col_train = list(dummies_input_data.columns)
    col_test = list(test_input_predict.columns)
    missed_cols=[]
    
    for i in col_train:
        counts = col_test.count(i)
        if counts == 0:
            missed_cols.append(i)
    
    for i in missed_cols:
        test_input_predict[i] = 0
    
    return test_input_predict

In [50]:
def findExtra_cols(dummies_input_data, test_input_predict):
    col_train = list(dummies_input_data.columns)
    col_test = list(test_input_predict.columns)
    extra_cols=[]
    
    for i in col_test:
        counts = col_train.count(i)
        if counts == 0:
            extra_cols.append(i)
    
    for i in extra_cols:
        test_input_predict = test_input_predict.drop(i, axis=1)
        
    return test_input_predict

In [51]:
test_dummies_input = insertMissing_cols(dummies_input_data.copy(), test_dummies_input.copy())
test_dummies_input = findExtra_cols(dummies_input_data.copy(), test_dummies_input.copy())

In [52]:
print(test_dummies_input.shape)

(824, 54)


In [53]:
print(dummies_input_data.shape)

(1534, 54)


In [54]:
scaler.fit(test_dummies_input)
scaled_test_input = scaler.transform(test_dummies_input)

## Sklearn model predict test dataset

In [55]:
sklearn_test_output = classifier.predict(scaled_test_input)

In [56]:
sklearn_df = pd.DataFrame({'id': horse_id, 'outcome': sklearn_test_output})
sklearn_df['outcome'] = sklearn_df['outcome'].map({0: 'died', 1: 'euthanized', 2: 'lived'})

In [57]:
sklearn_df

Unnamed: 0,id,outcome
0,1235,lived
1,1236,lived
2,1237,lived
3,1238,euthanized
4,1239,lived
5,1240,died
6,1241,lived
7,1242,euthanized
8,1243,lived
9,1244,died


## Deep learning model predict test dataset

In [58]:
NN_test_accuracy_output = model.predict(scaled_test_input)



In [59]:
NN_test_accuracy_output = NN_test_accuracy_output.round(2)

In [60]:
def convertAccuracy_toStr(NN_test_output):
    test_output=[]
    
    for i in NN_test_output:
        max_value = max(i)
        index = np.where(i == max_value)
        index = index[0][0]
        
        if index == 0:
            test_output.append('died')
        elif index == 1:
            test_output.append('euthanized')
        else:
            test_output.append('lived')
            
    return test_output

In [61]:
NN_test_str_output = convertAccuracy_toStr(NN_test_accuracy_output)

In [62]:
NN_df = pd.DataFrame({'id': horse_id, 'outcome': NN_test_str_output})

In [63]:
NN_df

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
5,1240,died
6,1241,lived
7,1242,died
8,1243,lived
9,1244,lived


## Save the results to csv

In [64]:
sklearn_df.to_csv('sklearn_test_outcome.csv', index=False)