In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score 

---
### Preprocessing

In [2]:
def read_data():
    """Load the competition CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` read from the relative ``../Data`` directory.
    """
    # NOTE: the "hourse" spelling is kept verbatim; it has to match the
    # directory name on disk.
    train_path = "../Data/health_of_hourse/train.csv"
    test_path = "../Data/health_of_hourse/test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)

def preprocessing_train_data(train):
    """Prepare the raw training frame for modelling and fit the reusable transformers.

    The steps are, in order: map the yes/no style columns to integers, fill
    missing values per column with that column's mode, integer-code ``pain``
    and ``abdominal_distention``, one-hot encode the remaining categorical
    columns, standardise three numeric columns, label-encode ``outcome`` and
    drop the identifier columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training data. The caller's frame is not modified.

    Returns
    -------
    tuple
        ``(train, ohe, scaler, label_enc)``: the processed frame plus the
        fitted ``OneHotEncoder``, ``StandardScaler`` and ``LabelEncoder`` so
        the same transformations can be applied to other data.
    """
    # Work on a copy so the caller's raw frame stays intact and the function
    # can be called again on the same input.
    train = train.copy()

    # Convert categorical to binary/trinary codes.
    simple_mappings = {
        'surgery': {'yes': 1, 'no': 0},
        'age': {'adult': 1, 'young': 0},
        'surgical_lesion': {"yes": 1, "no": 0},
        'capillary_refill_time': {'less_3_sec': 0, 'more_3_sec': 1, '3': 2},
    }
    for col, mapping in simple_mappings.items():
        train[col] = train[col].map(mapping)

    def _fill_with_mode(col):
        # Most frequent value of the column; median only if no mode exists
        # (i.e. the column has no non-missing values).
        modes = col.mode()
        return col.fillna(modes.iloc[0] if not modes.empty else col.median())

    # Fill NA with mode
    train = train.apply(_fill_with_mode)

    # Integer codes for the two remaining text columns.
    # NOTE(review): the codes do not obviously follow a severity order -
    # confirm this is intended before relying on them as ordinal.
    ordinal_mappings = {
        'pain': {'depressed': 1, 'mild_pain': 2, 'extreme_pain': 3, 'alert': 4, 'severe_pain': 5, 'slight': 6},
        'abdominal_distention': {'slight': 1, 'moderate': 2, 'none': 3, 'severe': 4},
    }
    for col, mapping in ordinal_mappings.items():
        train[col] = train[col].map(mapping)

    # One-Hot Encoding
    cols_to_encode = ['peristalsis', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'cp_data', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'nasogastric_tube']
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
        ohe = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older scikit-learn only knows the original `sparse` keyword.
        ohe = OneHotEncoder(drop='first', sparse=False)
    encoded_data = ohe.fit_transform(train[cols_to_encode])
    # Reuse the frame's index so the column-wise concat lines rows up even
    # when the input does not carry a default 0..n-1 index.
    df_encoded = pd.DataFrame(
        encoded_data,
        columns=ohe.get_feature_names_out(cols_to_encode),
        index=train.index,
    )
    train = pd.concat([train, df_encoded], axis=1).drop(columns=cols_to_encode)

    # Numerical
    scaler = StandardScaler()
    numerical_cols = ['rectal_temp', 'pulse', 'respiratory_rate']
    train[numerical_cols] = scaler.fit_transform(train[numerical_cols])

    # Target
    label_enc = LabelEncoder()
    train['outcome'] = label_enc.fit_transform(train['outcome'])

    # Drop identifier columns; they are not used as features.
    train = train.drop(columns=['id', 'hospital_number'])

    return train, ohe, scaler, label_enc


In [3]:
def preprocessing_test_data(test, ohe, scaler, label_enc):
    """Apply the training-time preprocessing to the test frame.

    Mirrors ``preprocessing_train_data`` but only *applies* the transformers
    that were fitted on the training data; nothing is re-fitted here. Missing
    values are filled with each test column's own mode. The ``id`` and
    ``hospital_number`` columns are kept so the caller can build a submission.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw test data. The caller's frame is not modified.
    ohe : fitted one-hot encoder
        Must provide ``transform`` and ``get_feature_names_out``.
    scaler : fitted scaler
        Must provide ``transform``; applied to the three numeric columns.
    label_enc : fitted label encoder
        Unused here (the test data has no target to encode); accepted so the
        call signature stays the same.

    Returns
    -------
    pandas.DataFrame
        The processed test frame.
    """
    # Work on a copy so the caller's raw frame stays intact.
    test = test.copy()

    # Convert categorical to binary/trinary codes.
    simple_mappings = {
        'surgery': {'yes': 1, 'no': 0},
        'age': {'adult': 1, 'young': 0},
        'surgical_lesion': {"yes": 1, "no": 0},
        'capillary_refill_time': {'less_3_sec': 0, 'more_3_sec': 1, '3': 2},
    }
    for col, mapping in simple_mappings.items():
        test[col] = test[col].map(mapping)

    def _fill_with_mode(col):
        # Most frequent value of the column; median only if no mode exists.
        modes = col.mode()
        return col.fillna(modes.iloc[0] if not modes.empty else col.median())

    # Fill NA with mode
    test = test.apply(_fill_with_mode)

    # Same integer codes as used for the training data.
    ordinal_mappings = {
        'pain': {'depressed': 1, 'mild_pain': 2, 'extreme_pain': 3, 'alert': 4, 'severe_pain': 5, 'slight': 6},
        'abdominal_distention': {'slight': 1, 'moderate': 2, 'none': 3, 'severe': 4},
    }
    for col, mapping in ordinal_mappings.items():
        test[col] = test[col].map(mapping)

    # One-Hot Encoding with the encoder fitted on the training data.
    cols_to_encode = ['peristalsis', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'cp_data', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'nasogastric_tube']
    encoded_data = ohe.transform(test[cols_to_encode])
    # Reuse the frame's index so the column-wise concat lines rows up even
    # when the input does not carry a default 0..n-1 index.
    df_encoded = pd.DataFrame(
        encoded_data,
        columns=ohe.get_feature_names_out(cols_to_encode),
        index=test.index,
    )
    test = pd.concat([test, df_encoded], axis=1).drop(columns=cols_to_encode)

    # Numerical: reuse the scaler fitted on the training data. Re-fitting on
    # the test set would scale these features with different statistics than
    # the ones the model was trained on.
    numerical_cols = ['rectal_temp', 'pulse', 'respiratory_rate']
    test[numerical_cols] = scaler.transform(test[numerical_cols])

    # Any column that is still text becomes a pandas categorical.
    object_cols = test.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        test[object_cols] = test[object_cols].astype('category')

    return test

---
### Read in the data

In [4]:
# Load the raw train and test frames; the later cells preprocess these.
train, test = read_data()

---
### XGBoost model

In [5]:
# NOTE: `train` is rebound to the processed frame here, so the cell that calls
# read_data() has to be re-run before this cell can be run again.
train, ohe, scaler, label_enc = preprocessing_train_data(train)

# Features are every processed column except the encoded target.
X = train.drop('outcome', axis=1)
y = train['outcome']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Take the class count from the fitted label encoder instead of hard-coding 3,
# so the model stays correct if the set of outcomes in the data changes.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_enc.classes_))
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_val)

# Check performance
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))



              precision    recall  f1-score   support

           0       0.60      0.60      0.60        72
           1       0.71      0.63      0.67        54
           2       0.75      0.79      0.77       121

    accuracy                           0.70       247
   macro avg       0.68      0.67      0.68       247
weighted avg       0.70      0.70      0.70       247

Accuracy: 0.6963562753036437


---
### Apply to the test set & write the submission

In [6]:
# NOTE: `test` is rebound to the processed frame here, so the cell that calls
# read_data() has to be re-run before this cell can be run again.
test = preprocessing_test_data(test, ohe, scaler, label_enc)

cols_to_drop = ['hospital_number', 'id']

# Select exactly the columns the model was trained on, in the same order.
# This fails fast with a KeyError if a training feature is missing from the
# processed test frame.
test_data = test.drop(columns=cols_to_drop)[X.columns]

test_predictions = model.predict(test_data)

# Turn the integer class codes back into the original outcome labels.
original_labels = label_enc.inverse_transform(test_predictions)

# Create a submission file if needed
submission = pd.DataFrame({'id': test['id'], 'outcome': original_labels})
submission.to_csv('../Submissions/submission_Predict Health Outcomes of Horses.csv', index=False)