In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [2]:
df_raw = pd.read_csv('E:/Edureka_Python-Course/Data Sources/Day_7/horse.csv')

In [3]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [4]:
df_raw.head().transpose()

Unnamed: 0,0,1,2,3,4
surgery,no,yes,no,yes,no
age,adult,adult,adult,young,adult
hospital_number,530101,534817,530334,5290409,530255
rectal_temp,38.5,39.2,38.3,39.1,37.3
pulse,66,88,40,164,104
respiratory_rate,28,20,24,84,35
temp_of_extremities,cool,,normal,cold,
peripheral_pulse,reduced,,normal,normal,
mucous_membrane,,pale_cyanotic,pale_pink,dark_cyanotic,dark_cyanotic
capillary_refill_time,more_3_sec,less_3_sec,less_3_sec,more_3_sec,more_3_sec


In [5]:
any(df_raw.isnull())

True

In [6]:
df_raw.age.dtype == 'O'

True

In [None]:
#target = df_raw.outcome.copy()
#df_raw.drop('outcome', axis=1, inplace=True)

In [7]:
ip = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
ip.fit(df_raw)
transformed = ip.transform(df_raw)
df_num = pd.DataFrame(data=transformed, columns=df_raw.columns)
df_num.head().transpose()

Unnamed: 0,0,1,2,3,4
surgery,no,yes,no,yes,no
age,adult,adult,adult,young,adult
hospital_number,530101,534817,530334,5290409,530255
rectal_temp,38.5,39.2,38.3,39.1,37.3
pulse,66,88,40,164,104
respiratory_rate,28,20,24,84,35
temp_of_extremities,cool,cool,normal,cold,cool
peripheral_pulse,reduced,normal,normal,normal,normal
mucous_membrane,normal_pink,pale_cyanotic,pale_pink,dark_cyanotic,dark_cyanotic
capillary_refill_time,more_3_sec,less_3_sec,less_3_sec,more_3_sec,more_3_sec


In [8]:
for col in df_num.columns:
    print(f'{col}\t\t\t{df_num.isnull()[col].unique()}')

surgery			[False]
age			[False]
hospital_number			[False]
rectal_temp			[False]
pulse			[False]
respiratory_rate			[False]
temp_of_extremities			[False]
peripheral_pulse			[False]
mucous_membrane			[False]
capillary_refill_time			[False]
pain			[False]
peristalsis			[False]
abdominal_distention			[False]
nasogastric_tube			[False]
nasogastric_reflux			[False]
nasogastric_reflux_ph			[False]
rectal_exam_feces			[False]
abdomen			[False]
packed_cell_volume			[False]
total_protein			[False]
abdomo_appearance			[False]
abdomo_protein			[False]
outcome			[False]
surgical_lesion			[False]
lesion_1			[False]
lesion_2			[False]
lesion_3			[False]
cp_data			[False]


In [9]:
categorical_cols = [col for col in df_num.columns if df_raw[col].dtype == 'O']

labelEncoders = {
    col : LabelEncoder() for col in categorical_cols
}

fitted_labelEncoders = {
    col : labelEncoders[col].fit(df_num[col]) for col in categorical_cols
}

for col in categorical_cols:
    df_num[col] = fitted_labelEncoders[col].transform(df_num[col])

df_num

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,0,530101,38.5,66,28,1,3,3,2,...,45,8.4,1,2,0,0,11300,0,0,0
1,1,0,534817,39.2,88,20,1,2,4,1,...,50,85,1,2,1,0,2208,0,0,0
2,0,0,530334,38.3,40,24,2,2,5,1,...,33,6.7,1,2,2,0,0,0,0,1
3,1,1,5290409,39.1,164,84,0,2,2,2,...,48,7.2,2,5.3,0,1,2208,0,0,1
4,0,0,530255,37.3,104,35,1,2,2,2,...,74,7.4,1,2,0,0,4300,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,1,0,533886,38,120,70,0,2,4,2,...,55,65,1,2,1,0,3205,0,0,0
295,0,0,527702,37.2,72,24,1,1,4,2,...,44,6.5,2,3.3,1,1,2208,0,0,1
296,1,0,529386,37.5,72,30,0,3,4,1,...,60,6.8,1,2,0,1,3205,0,0,0
297,1,0,530612,36.5,100,24,1,3,5,1,...,50,6,2,3.4,2,1,2208,0,0,1


In [10]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   surgery                299 non-null    int32 
 1   age                    299 non-null    int32 
 2   hospital_number        299 non-null    object
 3   rectal_temp            299 non-null    object
 4   pulse                  299 non-null    object
 5   respiratory_rate       299 non-null    object
 6   temp_of_extremities    299 non-null    int32 
 7   peripheral_pulse       299 non-null    int32 
 8   mucous_membrane        299 non-null    int32 
 9   capillary_refill_time  299 non-null    int32 
 10  pain                   299 non-null    int32 
 11  peristalsis            299 non-null    int32 
 12  abdominal_distention   299 non-null    int32 
 13  nasogastric_tube       299 non-null    int32 
 14  nasogastric_reflux     299 non-null    int32 
 15  nasogastric_reflux_ph  

In [12]:
for col in df_num.columns:
    df_num[col] = pd.to_numeric(df_num[col])
df_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    int32  
 1   age                    299 non-null    int32  
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            299 non-null    float64
 4   pulse                  299 non-null    float64
 5   respiratory_rate       299 non-null    float64
 6   temp_of_extremities    299 non-null    int32  
 7   peripheral_pulse       299 non-null    int32  
 8   mucous_membrane        299 non-null    int32  
 9   capillary_refill_time  299 non-null    int32  
 10  pain                   299 non-null    int32  
 11  peristalsis            299 non-null    int32  
 12  abdominal_distention   299 non-null    int32  
 13  nasogastric_tube       299 non-null    int32  
 14  nasogastric_reflux     299 non-null    int32  
 15  nasoga

# All Numeric Data

In [14]:
features, target = df_num.drop('outcome', axis=1), df_num.outcome

In [15]:
trainX, testX, trainY, testY = train_test_split(features, target, random_state=58, test_size=0.2)

In [26]:
model_decisionTree = DecisionTreeClassifier()
model_randForest = RandomForestClassifier()
models = [model_decisionTree, model_randForest]

scores = []
for model in models:
    model.fit(trainX, trainY)
    y_predd = fitted_labelEncoders['outcome'].inverse_transform(model.predict(testX))
    y_truee = fitted_labelEncoders['outcome'].inverse_transform(testY)
    #print(y_pred)
    #print(y_true)
    score = accuracy_score(y_truee, y_predd)
    print(f'Accuracy of {model} : {score}')


Accuracy of DecisionTreeClassifier() : 0.6
Accuracy of RandomForestClassifier() : 0.5666666666666667


In [None]:
df_num

In [None]:
target