## Imports

# Health Outcomes of Horses S3E22

https://www.kaggle.com/competitions/playground-series-s3e22

Profile: https://www.kaggle.com/meesh11

In [112]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Render floats in displayed frames with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

## Var Declaration

In [113]:
# Directory the competition CSVs (train.csv / test.csv) are read from.
# NOTE(review): absolute, machine-specific Windows path - the notebook cannot be
# re-run on another machine without editing this line.
data_path = r'C:\Users\chase\Desktop\Coding\Kaggle\S3E22\data'

## Loading Data and Munging

In [114]:
#Modified version of SJAKOO7's summary function in the notebook: https://www.kaggle.com/code/sjagkoo7/predict-health-outcomes-of-horses-s3-ep22
def summary(df : pd.DataFrame) -> pd.DataFrame:

    """Build a per-column overview of ``df``.

    Prints the shape of ``df`` and returns a frame with one row per column of
    ``df`` holding: its dtype, the number and percentage of missing values, the
    number of unique values, the ``DataFrame.describe()`` statistics (without
    ``count``) and the values found in the first three rows.

    Args:
        df : frame to describe.

    Returns:
        pd.Dataframe : A dataframe containing descriptive metrics for imputation and feature engineering
    """

    print(f'Data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtype'])
    n_missing = df.isnull().sum().values
    summ['#missing'] = n_missing
    summ['%missing'] = n_missing / len(df) * 100
    summ['#unique'] = df.nunique().values
    summ = pd.concat([summ, df.describe().T.drop('count', axis=1)], axis=1)

    # Sample rows are picked by position rather than by label, so frames whose
    # index does not contain 0/1/2 (filtered, re-indexed, concatenated) work too.
    # Frames shorter than three rows get NaN for the rows they do not have.
    for position, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[position].values if position < len(df) else np.nan

    return summ

In [115]:
# Read the training split from data_path and display its per-column overview,
# colour-graded with the 'YlGnBu' colormap.
train = pd.read_csv(data_path + '/train.csv')
summary(train).style.background_gradient(cmap='YlGnBu')

Data shape: (1235, 29)


Unnamed: 0,dtype,#missing,%missing,#unique,mean,std,min,25%,50%,75%,max,first value,second value,third value
id,int64,0,0.0,1235,617.0,356.6581,0.0,308.5,617.0,925.5,1234.0,0,1,2
surgery,object,0,0.0,2,,,,,,,,yes,yes,yes
age,object,0,0.0,2,,,,,,,,adult,adult,adult
hospital_number,int64,0,0.0,255,954500.401619,1356403.13894,521399.0,528800.0,529777.0,534145.0,5305129.0,530001,533836,529812
rectal_temp,float64,0,0.0,43,38.202186,0.788668,35.4,37.8,38.2,38.6,40.8,38.100000,37.500000,38.300000
pulse,float64,0,0.0,50,79.574089,29.108638,30.0,53.0,76.0,100.0,184.0,132.000000,88.000000,120.000000
respiratory_rate,float64,0,0.0,37,30.054251,16.452066,8.0,18.0,28.0,36.0,96.0,24.000000,12.000000,28.000000
temp_of_extremities,object,39,3.157895,4,,,,,,,,cool,cool,cool
peripheral_pulse,object,60,4.8583,4,,,,,,,,reduced,normal,reduced
mucous_membrane,object,21,1.700405,6,,,,,,,,dark_cyanotic,pale_cyanotic,pale_pink


In [116]:
# Read the test split from data_path and display its per-column overview,
# colour-graded with the 'YlOrRd' colormap.
test = pd.read_csv(data_path + '/test.csv')
summary(test).style.background_gradient(cmap='YlOrRd')

Data shape: (824, 28)


Unnamed: 0,dtype,#missing,%missing,#unique,mean,std,min,25%,50%,75%,max,first value,second value,third value
id,int64,0,0.0,824,1646.5,238.012605,1235.0,1440.75,1646.5,1852.25,2058.0,1235,1236,1237
surgery,object,0,0.0,2,,,,,,,,no,yes,yes
age,object,0,0.0,2,,,,,,,,adult,adult,adult
hospital_number,int64,0,0.0,210,1108357.197816,1555626.919032,521399.0,528743.0,529808.5,534644.0,5305129.0,534053,528469,528178
rectal_temp,float64,0,0.0,34,38.244539,0.785234,36.0,37.8,38.2,38.6,40.8,38.600000,38.200000,37.700000
pulse,float64,0,0.0,49,80.229369,29.164711,36.0,54.0,76.0,100.0,184.0,40.000000,112.000000,66.000000
respiratory_rate,float64,0,0.0,38,30.71966,17.43191,9.0,18.0,28.0,36.0,96.0,20.000000,48.000000,12.000000
temp_of_extremities,object,35,4.247573,4,,,,,,,,normal,cool,cool
peripheral_pulse,object,47,5.703883,4,,,,,,,,normal,reduced,normal
mucous_membrane,object,13,1.57767,6,,,,,,,,normal_pink,bright_pink,bright_red


## Prepare Data

The data provided has many missing values and contains both quantitative and categorical columns, so imputation and dropping of some columns will be required. Pathology data is not available, so that column will be dropped.

In [117]:
# Extra rows from original_horse.csv are appended to the competition training data.
# The path uses a forward slash: in a normal string literal '\o' is an invalid
# escape sequence, and a backslash separator only resolves on Windows.
original = pd.read_csv('data/original_horse.csv')

train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

# Provenance flag: 1 for rows from the competition files, 0 for rows from original_horse.csv.
train['is_generated'], test['is_generated'] = 1,1
original['is_generated'] = 0

train_total = pd.concat([train, original], ignore_index=True)
train_total.drop_duplicates(inplace=True)

# Stack labelled and unlabelled rows so they are preprocessed together.
# test.csv has one column fewer than train.csv (presumably the target), so its
# rows end up with NaN in the target column after the concat.
total = pd.concat([train_total, test])
target = 'outcome'

# Encode the outcome labels as integer class codes; missing labels stay NaN.
total[target] = total[target].map({'died':0,'euthanized':1,'lived':2})

In [118]:
def preprocessing(df, le_cols, ohe_cols):
    """Encode categorical features and impute missing feature values.

    Steps, in order:
      1. label-encode every column in ``le_cols`` with ``LabelEncoder``;
      2. one-hot encode every column in ``ohe_cols`` with ``pd.get_dummies``;
      3. fold a few category labels into another label of the same column;
      4. fill missing values of the ordinal columns with a fixed label and map
         the labels to integer codes (labels absent from a mapping become NaN);
      5. fill any NaN left in a feature column with that column's mode.

    The target column, whose name is read from the module-level ``target``
    variable, is excluded from step 5.

    Args:
        df: frame containing the columns named in ``le_cols`` and ``ohe_cols``,
            the ordinal columns handled below and the ``target`` column.
        le_cols: names of the columns to label-encode.
        ohe_cols: names of the columns to one-hot encode.

    Returns:
        A new, transformed frame. The frame passed in is left untouched.
    """
    # Work on a copy so the label-encoding assignments below do not leak into
    # the caller's frame.
    df = df.copy()

    # Label Encoding for binary cols
    le = LabelEncoder()
    for col in le_cols:
        df[col] = le.fit_transform(df[col])

    # OneHot Encoding for category cols
    df = pd.get_dummies(df, columns = ohe_cols)

    # column -> (label to fold away, label it is folded into)
    merged_labels = {
        "pain": ('slight', 'moderate'),
        "peristalsis": ('distend_small', 'normal'),
        "rectal_exam_feces": ('serosanguious', 'absent'),
        "nasogastric_reflux": ('slight', 'none'),
    }
    for col, (old_label, new_label) in merged_labels.items():
        df[col] = df[col].replace(old_label, new_label)

    # column -> (label used for missing values, label -> ordinal code)
    ordinal_specs = {
        "temp_of_extremities": ("normal", {'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3}),
        "peripheral_pulse": ("normal", {'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3}),
        "capillary_refill_time": ("3", {'less_3_sec': 0, '3': 1, 'more_3_sec': 2}),
        "pain": ("depressed", {'alert': 0, 'depressed': 1, 'moderate': 2, 'mild_pain': 3, 'severe_pain': 4, 'extreme_pain': 5}),
        "peristalsis": ("hypomotile", {'hypermotile': 0, 'normal': 1, 'hypomotile': 2, 'absent': 3}),
        "abdominal_distention": ("none", {'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3}),
        "nasogastric_tube": ("none", {'none': 0, 'slight': 1, 'significant': 2}),
        "nasogastric_reflux": ("none", {'less_1_liter': 0, 'none': 1, 'more_1_liter': 2}),
        "rectal_exam_feces": ("absent", {'absent': 0, 'decreased': 1, 'normal': 2, 'increased': 3}),
        "abdomen": ("distend_small", {'normal': 0, 'other': 1, 'firm': 2,'distend_small': 3, 'distend_large': 4}),
        "abdomo_appearance": ("serosanguious", {'clear': 0, 'cloudy': 1, 'serosanguious': 2}),
    }
    for col, (fill_label, codes) in ordinal_specs.items():
        df[col] = df[col].fillna(fill_label).map(codes)

    # Imputer
    features = df.drop(target, axis=1)
    cols_with_nan = features.columns[features.isna().any()].tolist()
    for feature in cols_with_nan:
        modes = df[feature].mode()
        if modes.empty:
            # Column holds nothing but NaN: there is no mode to impute with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form is deprecated and does not
        # update the frame when pandas Copy-on-Write is enabled.
        df[feature] = df[feature].fillna(modes[0])

    return df

In [119]:
# Encode and impute the combined frame, then remove the columns that are not
# passed on to the model.
total = preprocessing(
    total,
    le_cols=["surgery", "age", "surgical_lesion"],
    ohe_cols=["mucous_membrane"],
)
total.drop(
    columns=['lesion_2', 'lesion_3', 'mucous_membrane_dark_cyanotic', 'cp_data'],
    inplace=True,
)

# Create Pipeline

In [134]:
# Rows with a missing target are treated as the unlabelled test set
# (presumably exactly the competition test rows - TODO confirm the extra
# training file contains no rows with a missing outcome).
test = total[total[target].isna()].drop(columns=[target]) #Final test df
# Keep the labelled rows. Only the target decides membership, so a NaN in a
# feature column cannot silently remove training rows.
train = total.dropna(subset=[target])

X = train.drop(columns=[target])  # Remove the target column to get features
y = train[target]

# Hold out 20% of the labelled rows for a final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
# Two-step estimator: standardise the features, then fit an XGBoost classifier.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier()),
])

# Candidate hyper-parameters; the 'xgb__' prefix routes each one to the 'xgb' step.
param_grid = dict(
    xgb__learning_rate=[0.01, 0.1],
    xgb__n_estimators=[100, 200],
    xgb__max_depth=[3, 5],
    xgb__subsample=[0.8, 1],
    xgb__colsample_bytree=[0.8, 1],
)

# Train Model

In [122]:
# Search every combination in param_grid with 5-fold cross-validation, scored
# by accuracy, with n_jobs=-1.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [123]:
# Best cross-validated score found by the grid search and the parameters that produced it.
print(f"Best score: {model.best_score_}")
print(f"Best parameters: {model.best_params_}")

# Score of the fitted search object on the held-out split (X_test / y_test).
test_score = model.score(X_test, y_test)
print(f"Test set accuracy: {test_score}")

Best score: 0.7342724406835905
Best parameters: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
Test set accuracy: 0.758957654723127


In [183]:
# Predict the unlabelled rows and translate the class codes back to outcome names.
predictions = (
    pd.Series(model.predict(test))
    .map({0: 'died', 1: 'euthanized', 2: 'lived'})
    .reset_index(drop=True)
)

     id     outcome
0  1235       lived
1  1236        died
2  1237       lived
3  1238  euthanized
4  1239       lived


In [188]:
# Pair the ids from test.csv with the predicted outcomes (matched on index)
# and write the submission file.
submission = pd.read_csv(data_path + '/test.csv')[['id']].assign(outcome=predictions)

submission.to_csv('submission.csv', index=False)