In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
import numpy as np
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# Load the competition train/test tables from CSV (paths are relative).
train_csv_path = "../dataset/train.csv"
test_csv_path = "../dataset/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [4]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,1235,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1,1236,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
2,1237,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
3,1238,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
4,1239,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


In [6]:
# Count missing values per column in the training frame.
train.isna().sum(axis=0)

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test.isna().sum(axis=0)

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       35
peripheral_pulse          47
mucous_membrane           13
capillary_refill_time      6
pain                      29
peristalsis               19
abdominal_distention      22
nasogastric_tube          64
nasogastric_reflux        14
nasogastric_reflux_ph      0
rectal_exam_feces        125
abdomen                  154
packed_cell_volume         0
total_protein              0
abdomo_appearance         31
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [8]:
# Names of the columns holding at least one missing value, as plain Python lists.
train_has_missing = train.isnull().any()
test_has_missing = test.isnull().any()

train_missing_cols = train.columns[train_has_missing].tolist()
test_missing_cols = test.columns[test_has_missing].tolist()

In [9]:
def replace_missing_with_most_common(data, columns):
    """Fill missing values in ``columns`` with each column's most frequent value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is not modified; a copy is returned.
    columns : iterable
        Labels of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every missing entry of the listed columns is
        replaced by that column's most frequent non-missing value. A column
        with no non-missing values has nothing to impute from and is left
        unchanged.
    """
    # Work on a copy so re-running the calling cell never sees a half-mutated input.
    filled = data.copy()
    for column in columns:
        # value_counts() ignores missing entries and sorts by frequency, descending.
        counts = filled[column].value_counts()
        if counts.empty:
            # Entirely-missing column: indexing counts.index[0] would raise IndexError.
            continue
        filled[column] = filled[column].fillna(counts.index[0])
    return filled

# Impute each frame from its own most-frequent values (test is filled from test, not from train).
train = replace_missing_with_most_common(data=train, columns=train_missing_cols)
test = replace_missing_with_most_common(data=test, columns=test_missing_cols)

In [10]:
# Split each frame into its numeric and non-numeric column subsets.
numeric_kinds = [np.number]

numeric_train, categorical_train = (
    train.select_dtypes(include=numeric_kinds),
    train.select_dtypes(exclude=numeric_kinds),
)
numeric_test, categorical_test = (
    test.select_dtypes(include=numeric_kinds),
    test.select_dtypes(exclude=numeric_kinds),
)

In [11]:
# Text-to-integer codes for the two-level columns and the three-level target.
train_value_codes = {
    'surgery': {'yes': 1, 'no': 0},
    'age': {'adult': 1, 'young': 0},
    'surgical_lesion': {'yes': 1, 'no': 0},
    'cp_data': {'yes': 1, 'no': 0},
    'outcome': {'died': 0, 'euthanized': 1, 'lived': 2},
}
for col_name, codes in train_value_codes.items():
    train[col_name] = train[col_name].replace(codes)

# The remaining categorical columns are one-hot encoded.
columns = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
]
train = pd.get_dummies(data=train, columns=columns)

In [12]:
# Same text-to-integer codes as used for the training frame (test has no 'outcome' column).
test_value_codes = {
    'surgery': {'yes': 1, 'no': 0},
    'age': {'adult': 1, 'young': 0},
    'surgical_lesion': {'yes': 1, 'no': 0},
    'cp_data': {'yes': 1, 'no': 0},
}
for col_name, codes in test_value_codes.items():
    test[col_name] = test[col_name].replace(codes)

# One-hot encode the same categorical columns as in the training frame.
test_dummy_columns = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
]
test = pd.get_dummies(data=test, columns=test_dummy_columns)

In [13]:
# Separate the target and the id column from the model features.
X_train_id, y_train = train['id'], train['outcome']
X_train = train.drop(['outcome', 'id'], axis=1)

# The test frame has no target; only the id is set aside.
X_test_id = test['id']
X_test = test.drop('id', axis=1)

In [14]:
# Train and test were one-hot encoded separately, so a category level seen in
# only one of them produces a dummy column the other frame lacks. Derive the
# differences from the frames instead of hard-coding column names: a hard-coded
# `X_train[name] = 0` would silently zero a real feature if that level ever
# appears in train, and a forgotten name would surface as NaN after reindexing.
test_only_columns = [col for col in X_test.columns if col not in X_train.columns]
X_train = X_train.assign(**{col: 0 for col in test_only_columns})

# Give X_test exactly X_train's columns, in X_train's order; dummy columns that
# exist only in train are created as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [15]:
# 'balanced' class weights for the encoded outcome labels, and a 5-fold stratified splitter.
outcome_labels = train['outcome']
train_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(outcome_labels),
    y=outcome_labels,
)
skf = StratifiedKFold(n_splits=5)

In [16]:
# Fit the scaler on the training features only, then apply it to both sets.
rs = RobustScaler()
rs.fit(X_train)
X_train_rs, X_test_rs = rs.transform(X_train), rs.transform(X_test)

In [17]:
# LightGBM base model: 50 trees with 4 leaves each, class-balanced weighting.
lgbm_params = dict(
    n_estimators=50,
    learning_rate=0.1,
    num_leaves=4,
    class_weight='balanced',
)
best_random_lgbm = LGBMClassifier(**lgbm_params)
best_random_lgbm.fit(X_train_rs, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 613
[LightGBM] [Info] Number of data points in the train set: 1235, number of used features: 58
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [18]:
# XGBoost base model fitted on the scaled training features.
xgb_params = dict(
    max_depth=11,
    max_leaves=4,
    n_estimators=150,
)
best_grid_xgb = XGBClassifier(**xgb_params)
best_grid_xgb.fit(X_train_rs, y_train)

In [19]:
# CatBoost base model with ordered boosting, a fixed seed and silenced training logs.
catboost_params = dict(
    random_state=21,
    boosting_type='Ordered',
    verbose=0,
)
orig_catboost = CatBoostClassifier(**catboost_params)
orig_catboost.fit(X_train_rs, y_train)

<catboost.core.CatBoostClassifier at 0x14d3cc93370>

In [20]:
def stack_predict_submit(models, X_train, y_train, X_test, X_test_id,
                         cv='prefit', output_path='submission.csv'):
    """Stack the given classifiers, predict on ``X_test`` and write a submission CSV.

    Parameters
    ----------
    models : list of (name, estimator) tuples
        Base estimators handed to ``StackingClassifier``. With the default
        ``cv='prefit'`` they must already be fitted.
    X_train, y_train
        Features and integer-encoded labels used to fit the stacking model.
    X_test
        Features to predict on.
    X_test_id
        One id per row of ``X_test``; written to the ``id`` column.
    cv : default 'prefit'
        Forwarded to ``StackingClassifier(cv=...)``. The default keeps the
        original behaviour. NOTE(review): scikit-learn documents that with
        'prefit' the final estimator is trained on non-cross-validated
        predictions, which risks overfitting when the base models were fitted
        on this same ``X_train``; pass a CV splitter to get out-of-fold
        predictions instead.
    output_path : default 'submission.csv'
        Destination of the CSV file.

    Returns
    -------
    None
        Kept for backward compatibility: ``DataFrame.to_csv`` returns None when
        it is given a path, so the original always returned None as well.

    Raises
    ------
    ValueError
        If a predicted label is not one of 0, 1, 2.
    """
    stacked_model = StackingClassifier(
        estimators=models,
        stack_method='predict_proba',
        cv=cv,
        n_jobs=-1,
    )
    stacked_model.fit(X_train, y_train)
    stacked_predictions = stacked_model.predict(X_test)

    # Translate encoded labels back to the outcome names used in the data.
    value_to_replace = {0: 'died', 1: 'euthanized', 2: 'lived'}
    outcome_words = pd.Series(stacked_predictions).map(value_to_replace)
    if outcome_words.isna().any():
        # An unmapped label would otherwise end up as a bogus entry in the file.
        unknown = sorted(set(stacked_predictions) - set(value_to_replace))
        raise ValueError(f"predicted labels without an outcome name: {unknown}")

    # Build the frame column-wise so the ids keep their own dtype instead of
    # being coerced to strings by stacking them next to the outcome names.
    submission_df = pd.DataFrame({
        'id': np.asarray(X_test_id),
        'outcome': outcome_words.to_numpy(),
    })
    submission_df.to_csv(output_path, index=False)
    return None

# Stack the three already-fitted base models and write the submission file.
best_random_models_list = [
    ('lgbm', best_random_lgbm),
    ('xgb', best_grid_xgb),
    ('orig_catboost', orig_catboost),
]
stack_predict_submit(
    best_random_models_list,
    X_train_rs,
    y_train,
    X_test_rs,
    X_test_id,
)