In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file available under the Kaggle input mount.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

My EDA: https://www.kaggle.com/code/kotukw/health-outcomes-of-horses-eda

In [None]:
# Locations of the competition files: training set, test set, sample submission.
data_train_path, data_test_path, submision_path = (
    "/kaggle/input/playground-series-s3e22/train.csv",
    "/kaggle/input/playground-series-s3e22/test.csv",
    "/kaggle/input/playground-series-s3e22/sample_submission.csv",
)

In [None]:
# Read the three CSV files, each indexed by its "id" column.
train_df, test_df, submision_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (data_train_path, data_test_path, submision_path)
)

In [None]:
# Columns routed to the numeric pipeline (KNN imputation + scaling).
num_cols = [
    'rectal_temp',
    'pulse',
    'respiratory_rate',
    'packed_cell_volume',
    'total_protein',
    'abdomo_protein',
    'lesion_1',
]

# Columns routed to the categorical pipeline (mode imputation + one-hot).
cat_cols = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
]

# Columns routed to the binary pipeline (one-hot only).
bin_cols = ['surgery', 'age', 'surgical_lesion', 'cp_data']

# Columns dropped from the training features before modelling.
del_cols = ['lesion_3', 'lesion_2', 'nasogastric_reflux_ph', 'hospital_number']

In [None]:
# Build the feature frame from the training data without the discarded columns
# (drop returns a new frame, so train_df itself is left untouched), then move
# the "outcome" column out of X to serve as the target.
X = train_df.drop(columns=del_cols)
y = X.pop("outcome")

In [None]:
# IQR fences for each numeric column. They are computed here for reference
# only: the filter below is z-score based and does not read them.
Q1 = X[num_cols].quantile(0.25)
Q3 = X[num_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

from scipy.stats import zscore

# Column-wise z-scores. nan_policy="omit" matters: with the default
# ("propagate") a single missing value makes every z-score in that column NaN,
# the comparisons below are then all False, and the entire column would be
# blanked out instead of just its outliers.
data_standardized = X[num_cols].apply(zscore, nan_policy="omit")

# Replace values more than 3 standard deviations from the column mean with NaN.
# Rows are kept; the NaNs are left for the numeric imputer in the
# preprocessing pipeline to fill.
within_three_sigma = (data_standardized >= -3) & (data_standardized <= 3)
data_no_outliers = X[num_cols].where(within_three_sigma)

X[num_cols] = data_no_outliers.copy()
# Show a preview rather than the whole frame.
X.head()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler 

# Seed shared by every stochastic estimator so that re-running the notebook
# reproduces the same model comparison.
RANDOM_SEED = 42

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features: fill gaps (including masked outliers) from the 3 nearest
# neighbours, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),
    ('scaler', StandardScaler())
])

# Binary features: one-hot encode. handle_unknown='ignore' keeps this encoder
# consistent with the categorical one, so a value that was not present at fit
# time does not raise when predicting on new data.
byn_pipeline = Pipeline(steps=[
    ('binary_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its pipeline; columns not listed are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cate', cat_pipeline, cat_cols),
        ('byn', byn_pipeline, bin_cols)
    ]
)

# Candidate classifiers, all with default hyperparameters apart from the seed.
# KNeighborsClassifier is deterministic and takes no random_state.
models = {
    'XGBClassifier': XGBClassifier(random_state=RANDOM_SEED),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_SEED),
    'LGBMClassifier' : LGBMClassifier(random_state=RANDOM_SEED),
    "SVC" : SVC(random_state=RANDOM_SEED),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "AdaBoostClassifier" : AdaBoostClassifier(random_state=RANDOM_SEED),
    "GradientBoostingClassifier" : GradientBoostingClassifier(random_state=RANDOM_SEED),
}

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map the string outcome labels to integer class ids; the fitted encoder is
# kept so predictions can be mapped back to labels later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Try 100 different train/validation splits. For each split, fit every
# candidate model and record the lowest validation MAE any of them reaches;
# remember the split seed with the lowest such value.
# NOTE(review): MAE is computed on label-encoded class ids, so it treats the
# classes as ordered; and choosing the split seed by validation score gives an
# optimistic estimate. Both are kept as in the original experiment.
best_random_state = None
best_mae = float('inf')
for random_state in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=random_state
    )
    scores = []
    for model_name, model in models.items():
        clf = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("model", model)
            ]
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        scores.append((model_name, mae))
    # Build the per-split table once instead of growing it row by row.
    results_df = pd.DataFrame(scores, columns=['Model', 'MAE'])
    # Take the true minimum. Indexing the sorted Series with [0] is a *label*
    # lookup and returns the row labelled 0 (the first model in `models`),
    # not the smallest MAE.
    best_mae_models = results_df['MAE'].min()
    if best_mae_models < best_mae:
        best_mae = best_mae_models
        best_random_state = random_state
print(f"Лучшее значение random_state: {best_random_state}")
print(f"Лучший MAE: {best_mae}")

In [None]:
# Compare all candidate models on one fixed 80/20 split (seed 4) and list them
# from lowest to highest validation MAE.
results_df = pd.DataFrame(columns=['Model', 'MAE'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=4,
)
for model_name, model in models.items():
    clf = Pipeline([("preprocessor", preprocessor), ("model", model)])
    # fit() returns the pipeline itself, so prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    results_df.loc[results_df.shape[0]] = [model_name, mae]
results_df.sort_values(by='MAE').head(9)

**Tried stacking for the first time. Public score was 0.77**

In [None]:
from sklearn.base import clone
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Two-level stacking: three boosted base models feed their predicted class ids
# to a logistic-regression meta-model.
models_first_level = []
preds_first_level = []

for model in [XGBClassifier(), LGBMClassifier(), AdaBoostClassifier()]:
    clf = Pipeline(
        steps=[
            # Each base pipeline gets its own preprocessor copy so the fitted
            # pipelines do not share (and re-fit) a single transformer object.
            ("preprocessor", clone(preprocessor)),
            ("model", model)
        ]
    )
    # Out-of-fold predictions: every training row is predicted by a model that
    # did not see it. Training the meta-model on in-sample predictions instead
    # would leak the labels, because the base models fit their own training
    # data almost perfectly.
    y_preds = cross_val_predict(clf, X_train, y_train, cv=5)
    preds_first_level.append(y_preds)
    # cross_val_predict works on clones, so fit the pipeline itself on the
    # full training split for use at prediction time.
    clf.fit(X_train, y_train)
    models_first_level.append(clf)

stacked_predictions = pd.DataFrame({'Model1': preds_first_level[0], 'Model2': preds_first_level[1], 'Model3': preds_first_level[2]})
meta_model = LogisticRegression()
meta_model.fit(stacked_predictions, y_train)

# Base-model predictions on the competition test set, in the same column
# layout the meta-model was trained on.
stacked_test_predictions = pd.DataFrame({
    'Model1': models_first_level[0].predict(test_df),
    'Model2': models_first_level[1].predict(test_df),
    'Model3': models_first_level[2].predict(test_df)
})
ensemble_predictions = meta_model.predict(stacked_test_predictions)

**Decided to try GridSearchCV on the best model: RandomForestClassifier**

In [None]:
# Hyperparameter grid for the random forest; the "model__" prefix addresses
# the pipeline step named "model".
param_grid = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt'],
    'model__bootstrap': [True, False],
}

# Preprocessing + random forest, tuned as a single estimator.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier()),
])

# Exhaustive search over the grid, scored by accuracy with 3-fold CV.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
)

In [None]:
# Run the search on the training split, then keep the winning pipeline and
# its hyperparameters.
grid_search.fit(X_train, y_train)
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

In [None]:
# Display the hyperparameter combination selected by the grid search.
best_params

In [None]:
# Score the tuned pipeline on the held-out split.
accuracy = best_model.score(X=X_test, y=y_test)
print(accuracy)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the data the model was fitted on (an upper bound, useful for
# spotting overfitting when compared with the test figure).
y_train_pred = best_model.predict(X_train)
print(f"Accuracy on train: {accuracy_score(y_train, y_train_pred):.2f}")

# Accuracy on the held-out split. The predictions must be compared with the
# true labels; comparing y_test with itself always reports 1.00.
y_pred = best_model.predict(X_test)
print(f"Accuracy on test: {accuracy_score(y_test, y_pred):.2f}")

In [None]:
# Predict encoded class ids for the competition test set with the tuned pipeline.
# NOTE(review): `submition` is not read by the submission cell below, which
# writes `ensemble_predictions` instead; confirm which model is intended.
submition = best_model.predict(test_df)

Public score: 0.76

In [None]:
# Build the submission from the stacked ensemble's predictions. The class ids
# are mapped back to outcome labels with the fitted LabelEncoder rather than a
# hand-written dict, so the mapping cannot drift from the encoding used for
# training.
result_df = pd.DataFrame({
    'id': test_df.index,
    'outcome': label_encoder.inverse_transform(ensemble_predictions),
})
result_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final check.
result_df