In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [41]:
train_df = pd.read_csv('../dataset/train.csv')
train_labels = train_df.pop('outcome').map({'died': 0, 'euthanized': 1, 'lived': 2})

In [42]:
train_df.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes


In [43]:
train_labels.unique()

array([0, 1, 2], dtype=int64)

In [46]:
train_df['pain'].unique()

array(['depressed', 'mild_pain', 'extreme_pain', 'alert', 'severe_pain',
       nan, 'slight'], dtype=object)

In [9]:
train_df.drop(['id'], axis=1, inplace=True)

In [10]:
train_df.drop(['hospital_number'], axis=1, inplace=True)

In [11]:
train_df.dtypes

surgery                   object
age                       object
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

In [12]:
categorical_cols = train_df.select_dtypes(["bool_", "object_"]).columns
numeric_cols = train_df.select_dtypes(exclude=["bool_", "object_"]).columns

In [13]:
encoder = OrdinalEncoder()
train_df[categorical_cols] = encoder.fit_transform(train_df[categorical_cols])

In [14]:
train_df.isna().sum().sum()

765

In [15]:
iterative_imputer = IterativeImputer()
train_df[numeric_cols] = pd.DataFrame(iterative_imputer.fit_transform(train_df[numeric_cols]), columns= numeric_cols)

In [16]:
categorical_imputer = SimpleImputer(strategy= "most_frequent")
train_df[categorical_cols] = pd.DataFrame(categorical_imputer.fit_transform(train_df[categorical_cols]), columns= categorical_cols)

In [17]:
train_df.isna().sum().sum()

0

In [39]:
train_df['pain'].unique()

array([1., 3., 2., 0., 4., 5.])

In [18]:
scaler = StandardScaler()
new_col_names = [col + "_scaled" for col in numeric_cols]

train_df[new_col_names] = scaler.fit_transform(train_df[numeric_cols]) 

In [19]:
mi_scores = mutual_info_classif(train_df, train_labels)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=train_df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

lesion_1_scaled                 0.182216
lesion_1                        0.181177
abdomo_protein_scaled           0.177001
pulse_scaled                    0.175267
total_protein_scaled            0.164674
pulse                           0.162154
abdomo_protein                  0.157172
total_protein                   0.146731
packed_cell_volume_scaled       0.140898
pain                            0.136776
nasogastric_reflux_ph           0.122620
nasogastric_reflux_ph_scaled    0.119788
packed_cell_volume              0.118449
temp_of_extremities             0.082766
respiratory_rate_scaled         0.081934
abdomo_appearance               0.081521
respiratory_rate                0.063860
mucous_membrane                 0.061630
rectal_temp_scaled              0.052523
peripheral_pulse                0.051615
capillary_refill_time           0.051319
peristalsis                     0.041787
nasogastric_reflux              0.038504
cp_data                         0.034314
rectal_exam_fece

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df, train_labels, train_size = 0.8)

In [21]:
rf = RandomForestClassifier(n_estimators = 1000)
rf.fit(X_train, y_train)

print(accuracy_score(y_valid, rf.predict(X_valid)))

0.659919028340081


In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=3000, random_state=42)
lr.fit(X_train, y_train)

print(accuracy_score(y_valid, lr.predict(X_valid)))

0.6072874493927125


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=200)
gbc.fit(X_train, y_train)

print(accuracy_score(y_valid, gbc.predict(X_valid)))

0.680161943319838


In [24]:
from xgboost import XGBClassifier

class CustomXGBClassifier(XGBClassifier):
    
    def __init__(self, **params):
        
        super().__init__(**params)
        self.eval_set = params['eval_set']
    
    def fit(self, X, y):
        super().fit(X, y, eval_set=self.eval_set, verbose=100) 

In [25]:
xgb = CustomXGBClassifier(n_estimators=1000, learning_rate=0.01, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40, objective='binary:logistic')
xgb.fit(X_train, y_train)

print(accuracy_score(y_valid, xgb.predict(X_valid)))

[0]	validation_0-mlogloss:1.09358


Parameters: { "eval_set" } are not used.



[100]	validation_0-mlogloss:0.83522
[200]	validation_0-mlogloss:0.77729
[300]	validation_0-mlogloss:0.76599
[377]	validation_0-mlogloss:0.76399
0.659919028340081


In [26]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(
    estimators = [
        ('xgb', xgb),
        ('rf', rf),
        ('lr', lr),
        ('gbc', gbc)
    ],
    voting = 'soft'
)

model.fit(X_train, y_train)

print(accuracy_score(y_valid, model.predict(X_valid)))

[0]	validation_0-mlogloss:1.09358


Parameters: { "eval_set" } are not used.



[100]	validation_0-mlogloss:0.83522
[200]	validation_0-mlogloss:0.77729
[300]	validation_0-mlogloss:0.76599
[378]	validation_0-mlogloss:0.76399


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6882591093117408


In [29]:
test_df = pd.read_csv("../dataset/test.csv")
test_df.drop(['hospital_number'], axis=1, inplace=True)
test_df.drop(['id'], axis=1, inplace=True)
test_df.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,38.6,40.0,20.0,normal,normal,normal_pink,less_3_sec,mild_pain,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1,yes,adult,38.2,112.0,48.0,cool,reduced,bright_pink,more_3_sec,depressed,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
2,yes,adult,37.7,66.0,12.0,cool,normal,bright_red,less_3_sec,mild_pain,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
3,no,adult,37.1,88.0,20.0,cool,reduced,pale_cyanotic,less_3_sec,depressed,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
4,yes,adult,38.3,50.0,12.0,,normal,bright_pink,less_3_sec,mild_pain,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


In [45]:
test_df['pain'].unique()

array(['mild_pain', 'depressed', 'severe_pain', 'extreme_pain', nan,
       'moderate', 'alert'], dtype=object)

array(['depressed', 'mild_pain', 'extreme_pain', 'alert', 'severe_pain',
       nan, 'slight'], dtype=object)

In [36]:
categorical_cols

Index(['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion',
       'cp_data'],
      dtype='object')

In [35]:
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols])
## use target encoder for categorical columns

ValueError: Found unknown categories ['moderate'] in column 6 during transform

In [None]:
test_df[numeric_cols] = pd.DataFrame(iterative_imputer.transform(test_df[numeric_cols]), columns= numeric_cols)
test_df[categorical_cols] = pd.DataFrame(categorical_imputer.transform(test_df[categorical_cols]), columns= categorical_cols)

In [None]:
test_df[new_col_names] = scaler.transform(test_df[numeric_cols]) 

In [None]:
preds = [x == 1 for x in model.predict(test_df[train_df.columns])]
preds

In [None]:
# submission_df = pd.DataFrame({
#     "id" : test_df["id"],
#     "outcome" : preds
# })