In [21]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [22]:
df = pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300.0,0.0,0,no
1,yes,adult,534817.0,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208.0,0.0,0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0.0,0.0,0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208.0,0.0,0,yes
4,no,adult,530255.0,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300.0,0.0,0,no


In [23]:
missing_values = df.isnull().sum()
per_missing = (df.isnull().sum()/len(df)) * 100
total_missing = pd.concat([missing_values, per_missing], axis = 1)
total_missing.columns= ['Missing', 'Percentage']
total_missing.sort_values('Percentage', ascending=False, inplace=True)
total_missing.head(20)

Unnamed: 0,Missing,Percentage
nasogastric_reflux_ph,246,82.274247
abdomo_protein,201,67.22408
abdomo_appearance,168,56.187291
abdomen,124,41.471572
nasogastric_reflux,109,36.454849
nasogastric_tube,107,35.785953
rectal_exam_feces,107,35.785953
peripheral_pulse,73,24.414716
rectal_temp,65,21.73913
respiratory_rate,64,21.404682


In [24]:
df.drop(['nasogastric_reflux_ph', 'abdomo_protein'], axis=1,inplace=True)
numerical = df.select_dtypes(include=['int','float'])
categorical = df.select_dtypes(include=['object','category'])

for x in numerical:
    df[x].fillna(np.mean(df[x]), inplace=True)

for x in categorical:
    df[x].fillna(df[x].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace=True)


surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [25]:
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,normal_pink,more_3_sec,...,distend_large,45.0,8.4,cloudy,died,no,11300.0,0.0,0,no
1,yes,adult,534817.0,39.2,88.0,20.0,cool,normal,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,euthanized,no,2208.0,0.0,0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,cloudy,lived,no,0.0,0.0,0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,distend_large,48.0,7.2,serosanguious,died,yes,2208.0,0.0,0,yes
4,no,adult,530255.0,37.3,104.0,35.0,cool,normal,dark_cyanotic,more_3_sec,...,distend_large,74.0,7.4,cloudy,died,no,4300.0,0.0,0,no


In [26]:
# to check if the target variable is balanced
df['outcome'].value_counts()

outcome
lived         182
died           76
euthanized     41
Name: count, dtype: int64

In [27]:
# using random sampling to balance the calss
from sklearn.utils import resample
df_lived = df[df['outcome'] == 'lived']
df_died = df[df['outcome'] == 'died']
df_euth = df[df['outcome'] == 'euthanized']
df_died_samp = resample(df_died, n_samples=182)
df_euth_samp = resample(df_euth, n_samples=182)
df = pd.concat([df_lived, df_died_samp, df_euth_samp])
df = df.sample(frac=1)
df['outcome'].value_counts()

outcome
lived         182
euthanized    182
died          182
Name: count, dtype: int64

overfitting is when the training percentage is high
unserfitting is when the training percentage i low

In [28]:
encoder = LabelEncoder()
categorical = df.select_dtypes(include=['object','category'])

for x in categorical:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
208,0,0,534478.0,37.5,44.0,10.0,1,2,3,1,...,2,43.0,51.0,0,2,0,0.0,0.0,0,0
215,0,0,528977.0,38.18547,71.95539,30.455319,1,2,3,1,...,0,46.291667,23.947126,1,2,0,0.0,0.0,0,0
162,1,0,533942.0,38.0,66.0,20.0,2,3,5,1,...,0,46.0,46.0,2,1,1,6209.0,0.0,0,0
10,1,0,528548.0,38.1,66.0,12.0,1,3,1,1,...,0,44.0,6.0,1,2,1,2124.0,0.0,0,1
249,1,0,535043.0,39.3,100.0,51.0,0,0,2,1,...,1,66.0,13.0,2,1,1,2113.0,0.0,0,0


In [31]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [32]:
x = df.drop(['outcome','hospital_number'], axis=1)
y = df['outcome']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=4)

In [33]:
model = LogisticRegression()
model.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
pred1= model.predict(xtest)
print(classification_report(ytest,pred1))
cm = confusion_matrix(ytest,pred1)
cm

              precision    recall  f1-score   support

           0       0.57      0.75      0.65        36
           1       0.77      0.63      0.70        38
           2       0.66      0.58      0.62        36

    accuracy                           0.65       110
   macro avg       0.67      0.65      0.65       110
weighted avg       0.67      0.65      0.66       110



array([[27,  2,  7],
       [10, 24,  4],
       [10,  5, 21]])

In [37]:
model_2 = DecisionTreeClassifier()
model_2.fit(xtrain, ytrain)

In [41]:
pred2 = model_2.predict(xtest)
print(classification_report(ytest,pred2))
cm = confusion_matrix(ytest, pred2)
cm

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        36
           1       0.93      0.97      0.95        38
           2       0.82      0.78      0.80        36

    accuracy                           0.87       110
   macro avg       0.87      0.87      0.87       110
weighted avg       0.87      0.87      0.87       110



array([[31,  0,  5],
       [ 0, 37,  1],
       [ 5,  3, 28]])

In [42]:
model_3 = RandomForestClassifier()
model_3.fit(xtrain, ytrain)

In [43]:
pred3 = model_3.predict(xtest)
print(classification_report(ytest,pred3))
cm = confusion_matrix(ytest, pred3)
cm

              precision    recall  f1-score   support

           0       0.94      0.86      0.90        36
           1       0.95      1.00      0.97        38
           2       0.86      0.89      0.88        36

    accuracy                           0.92       110
   macro avg       0.92      0.92      0.92       110
weighted avg       0.92      0.92      0.92       110



array([[31,  0,  5],
       [ 0, 38,  0],
       [ 2,  2, 32]])

In [44]:
model_4 = KNeighborsClassifier()
model_4.fit(xtrain, ytrain)

In [None]:
# SUPPORT VECTOR CLASSIFIER
# SUPPORT VECTOR MACHINE
pred4 = model_4.predict(xtest)
print(classification_report(ytest,pred4))
cm = confusion_matrix(ytest, pred4)
cm

              precision    recall  f1-score   support

           0       0.84      0.75      0.79        36
           1       0.77      0.97      0.86        38
           2       0.83      0.69      0.76        36

    accuracy                           0.81       110
   macro avg       0.82      0.81      0.80       110
weighted avg       0.82      0.81      0.81       110



array([[27,  5,  4],
       [ 0, 37,  1],
       [ 5,  6, 25]])

In [46]:
model_5 = SVC()
model_5.fit(xtrain, ytrain)

In [47]:
pred5 = model_5.predict(xtest)
print(classification_report(ytest,pred5))
cm = confusion_matrix(ytest, pred5)
cm

              precision    recall  f1-score   support

           0       0.50      0.33      0.40        36
           1       0.46      0.82      0.59        38
           2       0.84      0.44      0.58        36

    accuracy                           0.54       110
   macro avg       0.60      0.53      0.52       110
weighted avg       0.60      0.54      0.53       110



array([[12, 22,  2],
       [ 6, 31,  1],
       [ 6, 14, 16]])

<!-- ANY RECALL AND PRECISION THAT IS BELOW 85% IS NOT BAD -->