## Import requirements

In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from pandas.errors import MergeError
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector

## Data Preparation

### Dataset understanding

In [17]:
data = pd.read_csv('/content/horse.csv')

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

### Column remover function:

In [19]:
data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [20]:
labels = data['outcome']
samples = data.drop(columns=['outcome', 'cp_data', 'hospital_number'])

In [21]:
x_train, x_test, y_train, y_test = train_test_split(samples, labels, test_size=0.1, random_state=1)

In [22]:
x_train.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
85,no,adult,38.0,54.0,30.0,warm,reduced,pale_pink,3,mild_pain,...,,distend_small,45.0,6.2,,,no,400,0,0
296,yes,adult,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,severe_pain,...,decreased,distend_large,60.0,6.8,,,yes,3205,0,0
132,no,adult,37.9,40.0,24.0,normal,normal,normal_pink,less_3_sec,depressed,...,,firm,40.0,5.7,,,yes,400,0,0
285,no,adult,37.8,82.0,12.0,cool,normal,normal_pink,more_3_sec,severe_pain,...,,,50.0,7.0,,,yes,2205,0,0
131,no,adult,37.6,88.0,36.0,cool,normal,normal_pink,less_3_sec,mild_pain,...,,,44.0,6.0,,,yes,1400,0,0


In [23]:
x_train['rectal_temp']

85     38.0
296    37.5
132    37.9
285    37.8
131    37.6
       ... 
203    39.2
255    37.5
72     37.7
235    38.2
37     37.8
Name: rectal_temp, Length: 269, dtype: float64

In [24]:
x_train['abdomo_protein'].isnull().sum()

178

In [25]:
x_test.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3
173,no,adult,,,,cool,absent,dark_cyanotic,,severe_pain,...,,,,,,,yes,3400,0,0
287,yes,adult,,,,,,,,,...,,,,,,,yes,2124,0,0
51,yes,adult,37.4,84.0,36.0,normal,,pale_pink,more_3_sec,mild_pain,...,absent,distend_large,,,serosanguious,,yes,7209,0,0
146,no,adult,38.0,44.0,12.0,warm,normal,pale_pink,less_3_sec,mild_pain,...,normal,distend_small,33.0,6.5,,,yes,2209,0,0
214,yes,adult,37.8,40.0,12.0,normal,normal,normal_pink,less_3_sec,alert,...,normal,other,38.0,7.0,,,yes,3111,0,0


#### Percentage of the missing values for each feature:

In [26]:
(x_train.isnull().sum() / len(x_train)) * 100

surgery                   0.000000
age                       0.000000
rectal_temp              19.702602
pulse                     8.178439
respiratory_rate         20.074349
temp_of_extremities      19.330855
peripheral_pulse         22.676580
mucous_membrane          15.985130
capillary_refill_time    10.408922
pain                     18.959108
peristalsis              13.754647
abdominal_distention     18.959108
nasogastric_tube         35.315985
nasogastric_reflux       36.059480
nasogastric_reflux_ph    82.156134
rectal_exam_feces        34.944238
abdomen                  40.520446
packed_cell_volume        8.921933
total_protein            10.037175
abdomo_appearance        55.390335
abdomo_protein           66.171004
surgical_lesion           0.000000
lesion_1                  0.000000
lesion_2                  0.000000
lesion_3                  0.000000
dtype: float64

In [27]:
categ_imputer = SimpleImputer(strategy='most_frequent')
x_train = pd.DataFrame(
    categ_imputer.fit_transform(x_train), columns=x_train.columns
    ).astype(x_train.dtypes.to_dict())

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
import numpy as np

num_feat = x_train.select_dtypes(include=['float64', 'int64'])


ordinal_categories = {
    'peripheral_pulse': ['normal', 'increased', 'reduced', 'absent'],
    'capillary_refill_time': ['more_3_sec', '3', 'less_3_sec'],
    'peristalsis': ['hypomotile', 'normal', 'hypermotile', 'absent'],
    'abdominal_distention': ['none', 'slight', 'moderate', 'severe'],
    'nasogastric_tube': ['none', 'slight', 'significant'],
    'nasogastric_reflux': ['none', 'less_1_liter', 'more_1_liter'],
    'rectal_exam_feces': ['normal', 'increased', 'decreased', 'absent'],
    'abdomen': ['normal', 'other', 'firm', 'distend_small', 'distend_large'],
    'abdomo_appearance': ['clear', 'cloudy', 'serosanguious']
}


nominal_cats = ['temp_of_extremities', 'mucous_membrane', 'pain']


class NumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, X):
        return self.scaler.transform(X)


class OrdinalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_categories):
        self.ordinal_categories = ordinal_categories
        self.ordinal_encoder = OrdinalEncoder(categories=[self.ordinal_categories[f] for f in self.ordinal_categories])

    def fit(self, X, y=None):
        return self.ordinal_encoder.fit(X)

    def transform(self, X):
        return self.ordinal_encoder.transform(X)


class NominalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.onehot_encoder = OneHotEncoder(drop='first')

    def fit(self, X, y=None):
        return self.onehot_encoder.fit(X)

    def transform(self, X):
        return self.onehot_encoder.transform(X)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', NumericalTransformer(), num_feat.columns),  # StandardScaler for numerical features
        ('ord', OrdinalTransformer(ordinal_categories), list(ordinal_categories.keys())),  # ordinal transformer
        ('nom', NominalTransformer(), nominal_cats)  # nominal transformer
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


In [29]:
X_transformed = pipeline.fit_transform(x_train)
num_ord_feature_names = (list(make_column_selector(dtype_include=['float64', 'int64'])(x_train)) +
                         list(ordinal_categories.keys()))


nom_feature_names = []
nominal_encoder = pipeline.named_steps['preprocessor'].transformers_[2][1].onehot_encoder
for i, col in enumerate(nominal_cats):
    categories = nominal_encoder.categories_[i][1:]
    nom_feature_names.extend([f'{col}_{cat}' for cat in categories])

feature_names = num_ord_feature_names + nom_feature_names

X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)

In [30]:
X_transformed_df

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3,...,temp_of_extremities_warm,mucous_membrane_bright_red,mucous_membrane_dark_cyanotic,mucous_membrane_normal_pink,mucous_membrane_pale_cyanotic,mucous_membrane_pale_pink,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain
0,-0.234521,-0.592235,0.027347,-0.359030,-0.065322,-0.621231,-0.287231,-0.644911,-0.147097,-0.061085,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.994594,0.035524,0.027347,-0.359030,1.374983,-0.598783,-0.287231,-0.035403,-0.147097,-0.061085,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.386535,-1.080491,-0.340473,-0.359030,-0.545424,-0.639937,-0.287231,-0.644911,-0.147097,-0.061085,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.538550,0.384278,-1.076115,-0.359030,0.414779,-0.591301,-0.287231,-0.252697,-0.147097,-0.061085,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.842579,0.593531,0.395168,-0.735703,-0.161343,-0.628713,-0.287231,-0.427618,-0.147097,-0.061085,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,1.589655,-0.801487,-0.401777,-0.359030,-0.929505,-0.606266,0.495597,-0.054960,-0.147097,-0.061085,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
265,-0.994594,0.035524,-0.340473,-0.359030,-1.025526,1.578633,-0.287231,0.834639,-0.147097,-0.061085,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
266,-0.690565,-0.522484,-0.340473,-0.359030,-0.833485,-0.610007,-0.287231,-0.272688,-0.147097,-0.061085,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
267,0.069509,-1.010740,-0.217867,-0.359030,-0.929505,-0.595042,-0.287231,-0.055829,-0.147097,-0.061085,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
