### Problem Statement: 
Predict the survival of a horse based on various observed medical conditions. Load
the data from “horses.csv” and observe whether it contains missing values. The dataset contains many
categorical features; replace them with label encoding. Replace the missing values by the most frequent value
in each column. Fit a decision tree classifier and random forest classifier, and observe the accuracy.

### Objective: 
Learn to fit a decision tree, and compare its accuracy with random forest classifier. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plot

%matplotlib inline

In [2]:
animals = pd.read_csv('C:/data/horse.csv')

In [3]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [4]:
animals.shape

(299, 28)

In [5]:
animals.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [6]:
animals.describe()

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,299.0,239.0,275.0,241.0,53.0,270.0,266.0,101.0,299.0,299.0,299.0
mean,1087733.0,38.168619,72.0,30.460581,4.707547,46.307407,24.274436,3.039604,3659.70903,90.528428,7.38796
std,1532032.0,0.733744,28.646219,17.666102,1.982311,10.436743,27.364194,1.967947,5408.472421,650.637139,127.749768
min,518476.0,35.4,30.0,8.0,1.0,23.0,3.3,0.1,0.0,0.0,0.0
25%,528904.0,37.8,48.0,18.0,3.0,38.0,6.5,2.0,2111.5,0.0,0.0
50%,530301.0,38.2,64.0,25.0,5.0,45.0,7.5,2.3,2322.0,0.0,0.0
75%,534736.0,38.5,88.0,36.0,6.5,52.0,56.75,3.9,3209.0,0.0,0.0
max,5305629.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,7111.0,2209.0


In [7]:
animals.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [8]:
animals.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object')

In [9]:
target = animals['outcome']

In [10]:
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [11]:
animals = animals.drop(['outcome'],axis=1)

In [12]:
#Select all categorical variables and create dummies
category_variables=['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse',
                    'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
                    'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
                    'rectal_exam_feces', 'abdomen','abdomo_appearance','surgical_lesion', 
                    'cp_data']

for category in category_variables:
    animals[category] = pd.get_dummies(animals[category])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

x,y = animals.values, target.values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size= 0.2,random_state=1)

In [14]:
from sklearn.tree import DecisionTreeClassifier
print(x_train.shape)

(239, 27)


In [20]:
#handling missing value

#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer 
import numpy as np
#imp = Imputer(missing_values = 'NaN', strategy = "most_frequent",axis = 0)
imp = SimpleImputer(missing_values = np.nan, strategy = "most_frequent")
x_train = imp.fit_transform(x_train)
x_test = imp.fit_transform(x_test)

In [21]:
classifier = DecisionTreeClassifier()

In [22]:
classifier.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
y_predict = classifier.predict(x_test)

In [31]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy = accuracy_score(y_predict,y_test)

In [36]:
print(accuracy)

0.55


In [37]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

In [38]:
classifier.fit(x_train,y_train)
y_predict = classifier.predict(x_test)
accuracy = accuracy_score(y_predict,y_test)
print(accuracy)

0.7166666666666667
