# Predicting the survival of horses based on their medical conditions 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [2]:
horses = pd.read_csv(r"Datasets\horse.csv")
horses.head(10)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
5,no,adult,528355,,,,warm,normal,pale_pink,less_3_sec,...,,,,,lived,no,0,0,0,no
6,yes,adult,526802,37.9,48.0,16.0,normal,normal,normal_pink,less_3_sec,...,37.0,7.0,,,lived,yes,3124,0,0,no
7,yes,adult,529607,,60.0,,cool,,,less_3_sec,...,44.0,8.3,,,died,yes,2208,0,0,no
8,no,adult,530051,,80.0,36.0,cool,absent,pale_pink,less_3_sec,...,38.0,6.2,,,euthanized,yes,3205,0,0,no
9,no,young,5299629,38.3,90.0,,normal,,normal_pink,less_3_sec,...,40.0,6.2,clear,2.2,lived,no,0,0,0,yes


In [3]:
horses.shape

(299, 28)

In [4]:
#separate the target variable (Qualitative) from the predictor variables from the dataset.
target = horses['outcome']
print(target.unique())
horses = horses.drop(['outcome'], axis = 1)
horses.head(5)

['died' 'euthanized' 'lived']


Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,distend_large,45.0,8.4,,,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,2.0,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,,,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,,48.0,7.2,serosanguious,5.3,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,74.0,7.4,,,no,4300,0,0,no


In [5]:
horses.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

In [6]:
#find categorical predictor variables and turn them into dummy variables.
col_list = []
for col_headers in horses.columns:
    if(horses[col_headers].dtype == np.object):
        col_list.append(col_headers)
print(col_list)
    
for cat in col_list:
    horses[cat] = pd.get_dummies(horses[cat])

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if(horses[col_headers].dtype == np.object):


In [7]:
#separating dataset into training and testing sets

X,y = horses.values, target.values

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [8]:
#Data Imputation 

imp = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

Implementing Decision Tree Classifier Algorithm 

In [13]:
dtcal = DecisionTreeClassifier()
dtcal.fit(X_train,y_train)
y_pred = dtcal.predict(X_test)

In [14]:
#Calculating the accuracy score 
accuracy = accuracy_score(y_pred,y_test)*100
print ("Accuracy: %.2f%%" % accuracy)

Accuracy: 52.22%


Implementing Random Forest Classifier Algorithm

In [15]:
rfcal = RandomForestClassifier()
rfcal.fit(X_train,y_train)
y_predict = rfcal.predict(X_test)

In [16]:
#Calculating the accuracy score 
accuracy = accuracy_score(y_predict,y_test)*100
print ("Accuracy: %.2f%%" % accuracy)

Accuracy: 67.78%
