In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

%matplotlib inline

### 1. Let’s attempt to predict the survival of a horse based on various observed medical conditions. Load the data from ‘horse.csv’ and check whether it contains missing values.

In [2]:
# Read the raw records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='horse.csv')

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
data.head(n=5)

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(299, 28)

In [5]:
# Count the missing entries in every column of the raw frame.
data.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

### 2. This dataset contains many categorical features; replace them with label encoding. (Note: the solution below uses one-hot encoding via `pd.get_dummies` instead.)

In [6]:
# Print each column's dtype and non-null count; the object-dtype columns are
# the categorical features that need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
surgery                  299 non-null object
age                      299 non-null object
hospital_number          299 non-null int64
rectal_temp              239 non-null float64
pulse                    275 non-null float64
respiratory_rate         241 non-null float64
temp_of_extremities      243 non-null object
peripheral_pulse         230 non-null object
mucous_membrane          252 non-null object
capillary_refill_time    267 non-null object
pain                     244 non-null object
peristalsis              255 non-null object
abdominal_distention     243 non-null object
nasogastric_tube         195 non-null object
nasogastric_reflux       193 non-null object
nasogastric_reflux_ph    53 non-null float64
rectal_exam_feces        197 non-null object
abdomen                  181 non-null object
packed_cell_volume       270 non-null float64
total_protein            266 non-null

In [8]:
# Split the frame into predictors (X) and the prediction target (Y).
X = data.drop(columns=["outcome"])
Y = data["outcome"]

In [9]:
# One-hot encode the categorical (object-dtype) predictors; numeric columns
# pass through unchanged.
# NOTE(review): with get_dummies' default dummy_na=False, a missing categorical
# value becomes a row of all-zero indicators instead of NaN, so those gaps are
# no longer visible to a later fillna step -- confirm this is intended.
X = pd.get_dummies(data=X)
X.head(n=5)

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101,38.5,66.0,28.0,,45.0,8.4,,11300,0,...,0,0,0,0,0,0,1,0,1,0
1,534817,39.2,88.0,20.0,,50.0,85.0,2.0,2208,0,...,0,0,1,0,1,0,1,0,1,0
2,530334,38.3,40.0,24.0,,33.0,6.7,,0,0,...,0,1,0,0,0,0,1,0,0,1
3,5290409,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,...,0,0,0,0,0,1,0,1,0,1
4,530255,37.3,104.0,35.0,,74.0,7.4,,4300,0,...,0,0,0,0,0,0,1,0,1,0


### 3. Replace the missing values with the most frequent value in each column.

In [10]:
def _fill_with_most_frequent(column):
    """Return `column` with its NaNs replaced by its most frequent value.

    value_counts() ignores NaN and sorts by descending frequency, so the
    first index entry is the most frequent observed value. A column with no
    observed values has no such entry; it is returned unchanged rather than
    raising IndexError.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


# Impute every column independently with its own most frequent value.
# NOTE(review): X was already one-hot encoded by pd.get_dummies above, so
# only the numeric columns can still contain NaN at this point -- confirm
# that leaving missing categorical values as all-zero dummies is intended.
X = X.apply(_fill_with_most_frequent)

In [12]:
# Preview the first five rows of the imputed feature matrix.
X.head(n=5)

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
0,530101,38.5,66.0,28.0,2.0,45.0,8.4,2.0,11300,0,...,0,0,0,0,0,0,1,0,1,0
1,534817,39.2,88.0,20.0,2.0,50.0,85.0,2.0,2208,0,...,0,0,1,0,1,0,1,0,1,0
2,530334,38.3,40.0,24.0,2.0,33.0,6.7,2.0,0,0,...,0,1,0,0,0,0,1,0,0,1
3,5290409,39.1,164.0,84.0,5.0,48.0,7.2,5.3,2208,0,...,0,0,0,0,0,1,0,1,0,1
4,530255,37.3,104.0,35.0,2.0,74.0,7.4,2.0,4300,0,...,0,0,0,0,0,0,1,0,1,0


In [13]:
# Confirm the imputation worked: every column should now report zero missing values.
X.isna().sum()

hospital_number                    0
rectal_temp                        0
pulse                              0
respiratory_rate                   0
nasogastric_reflux_ph              0
packed_cell_volume                 0
total_protein                      0
abdomo_protein                     0
lesion_1                           0
lesion_2                           0
lesion_3                           0
surgery_no                         0
surgery_yes                        0
age_adult                          0
age_young                          0
temp_of_extremities_cold           0
temp_of_extremities_cool           0
temp_of_extremities_normal         0
temp_of_extremities_warm           0
peripheral_pulse_absent            0
peripheral_pulse_increased         0
peripheral_pulse_normal            0
peripheral_pulse_reduced           0
mucous_membrane_bright_pink        0
mucous_membrane_bright_red         0
mucous_membrane_dark_cyanotic      0
mucous_membrane_normal_pink        0
m

### 4. Fit a decision tree classifier and observe the accuracy.

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [26]:
# Fit a single decision tree on the training split and score it on the
# held-out split.
# An unseeded DecisionTreeClassifier can choose different splits from run to
# run, so random_state is fixed to make the reported accuracy reproducible.
tree = DecisionTreeClassifier(random_state=23)
D_tree = tree.fit(x_train, y_train)  # fit() returns the estimator itself
y_predict = D_tree.predict(x_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
decisiontree_accuracy = accuracy_score(y_test, y_predict)
decisiontree_accuracy

0.7

### 5. Fit a random forest classifier and observe the accuracy. 

In [28]:
# Fit a 300-tree random forest on the training split and score it on the
# held-out split.
# Bootstrapping and feature sub-sampling are random, so random_state is fixed
# to make the reported accuracy reproducible.
random_forest = RandomForestClassifier(n_estimators=300, random_state=23)
random_forest1 = random_forest.fit(x_train, y_train)  # fit() returns the estimator itself
# NOTE: this rebinds y_predict, replacing the decision tree's predictions.
y_predict = random_forest1.predict(x_test)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
random_forest_accuracy = accuracy_score(y_test, y_predict)
random_forest_accuracy

0.7