In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.shape, test.shape

((1235, 29), (824, 28))

In [4]:
train.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [5]:
train.describe()

Unnamed: 0,id,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0,1235.0
mean,617.0,954500.4,38.202186,79.574089,30.054251,4.382591,49.602429,21.388016,3.290931,3832.496356,14.612146,3.577328
std,356.6581,1356403.0,0.788668,29.108638,16.452066,1.937357,10.5358,26.676453,1.589195,5436.733774,193.705735,88.858953
min,0.0,521399.0,35.4,30.0,8.0,1.0,23.0,3.5,0.1,0.0,0.0,0.0
25%,308.5,528800.0,37.8,53.0,18.0,2.0,43.0,6.6,2.0,2205.0,0.0,0.0
50%,617.0,529777.0,38.2,76.0,28.0,4.5,48.0,7.5,3.0,2209.0,0.0,0.0
75%,925.5,534145.0,38.6,100.0,36.0,6.0,57.0,9.1,4.3,3205.0,0.0,0.0
max,1234.0,5305129.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,3112.0,2209.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

In [7]:
train['outcome'].value_counts()

outcome
lived         574
died          410
euthanized    251
Name: count, dtype: int64

Les classes ne sont pas homogènes

In [8]:
# Insérer visualisation

In [9]:
# Préprocessing

Il y a beaucoup de features catégorielles

In [10]:
train.isnull().sum()

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [11]:
# Drop column id
train.drop('id',axis=1,inplace=True)
test.drop('id',axis=1,inplace=True)

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [13]:
target = train['outcome']
features = train.drop('outcome', axis=1)


In [14]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(features, target, test_size=0.2, random_state=42)

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [31]:
object_columns = train.select_dtypes(include='object').columns

In [32]:
categorical_features = []
for column in object_columns:
    categorical_features.append(train.columns.get_loc(column))

In [33]:
numeric_features = []

In [34]:
for column in train.columns:
    if train.columns.get_loc(column) not in categorical_features:
        numeric_features.append(train.columns.get_loc(column))

In [44]:
numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

In [45]:
Preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', numeric_transformer, numeric_features),
        ('categorical_transformer', categorical_transformer, categorical_features)
    ]
)

In [46]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

In [47]:
hgb_pipe = make_pipeline(Preprocessor, HistGradientBoostingClassifier())

In [48]:
seed = 42

In [49]:
categorical_features.remove(27)

ValueError: list.remove(x): x not in list

In [50]:
#parameters to tune
parameters = {
 'histgradientboostingclassifier__max_iter': [1000,1200,1500],
 'histgradientboostingclassifier__learning_rate': [0.1],
 'histgradientboostingclassifier__max_depth' : [25, 50, 75],
 'histgradientboostingclassifier__l2_regularization': [1.5],
 'histgradientboostingclassifier__scoring': ['f1_micro'],
 'histgradientboostingclassifier__random_state' : [seed],
 }
#instantiate the gridsearch
hgb_grid = GridSearchCV(hgb_pipe, parameters, n_jobs=5,
 cv=5, scoring='f1_micro',
 verbose=2, refit=True)


In [51]:
#fit on the grid
hgb_grid.fit(X_train2, y_train2)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END histgradientboostingclassifier__l2_regularization=1.5, histgradientboostingclassifier__learning_rate=0.1, histgradientboostingclassifier__max_depth=25, histgradientboostingclassifier__max_iter=1000, histgradientboostingclassifier__random_state=42, histgradientboostingclassifier__scoring=f1_micro; total time=  41.2s
[CV] END histgradientboostingclassifier__l2_regularization=1.5, histgradientboostingclassifier__learning_rate=0.1, histgradientboostingclassifier__max_depth=25, histgradientboostingclassifier__max_iter=1000, histgradientboostingclassifier__random_state=42, histgradientboostingclassifier__scoring=f1_micro; total time=  41.3s
[CV] END histgradientboostingclassifier__l2_regularization=1.5, histgradientboostingclassifier__learning_rate=0.1, histgradientboostingclassifier__max_depth=25, histgradientboostingclassifier__max_iter=1000, histgradientboostingclassifier__random_state=42, histgradientboostingclassifier_

In [52]:
# Print the best parameters found
print(hgb_grid.best_params_)
# Print the best scores found
print()
print(hgb_grid.best_score_)

{'histgradientboostingclassifier__l2_regularization': 1.5, 'histgradientboostingclassifier__learning_rate': 0.1, 'histgradientboostingclassifier__max_depth': 25, 'histgradientboostingclassifier__max_iter': 1500, 'histgradientboostingclassifier__random_state': 42, 'histgradientboostingclassifier__scoring': 'f1_micro'}

0.702456032405271


In [53]:
from sklearn.metrics import confusion_matrix, f1_score

In [54]:
#Predict on X_test and print the confusion matrix
print()
y_pred_hgb = hgb_grid.predict(X_test2)
print(confusion_matrix(y_test2, y_pred_hgb))
#Print the F1_score
print()
print(f1_score(y_test2, y_pred_hgb, average='micro'))


[[51  4 17]
 [ 9 37  8]
 [23  5 93]]

0.7327935222672065


In [55]:
import pprint


In [56]:
pprint.pp(hgb_grid.best_params_)

{'histgradientboostingclassifier__l2_regularization': 1.5,
 'histgradientboostingclassifier__learning_rate': 0.1,
 'histgradientboostingclassifier__max_depth': 25,
 'histgradientboostingclassifier__max_iter': 1500,
 'histgradientboostingclassifier__random_state': 42,
 'histgradientboostingclassifier__scoring': 'f1_micro'}


In [57]:
submission_pred = hgb_grid.predict(test)

In [59]:
import pandas as pd

In [60]:
sub_df = pd.DataFrame(submission_pred)

In [61]:
sub_df

Unnamed: 0,0
0,lived
1,lived
2,lived
3,euthanized
4,lived
...,...
819,died
820,euthanized
821,died
822,lived


In [62]:
import csv

In [64]:
sub_df.to_csv('submission.csv')


In [65]:
test

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,less_3_sec,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,more_3_sec,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
2,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,less_3_sec,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
3,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,less_3_sec,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
4,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,less_3_sec,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,no,adult,529461,40.3,114.0,36.0,cool,reduced,normal_pink,more_3_sec,...,distend_large,57.0,8.1,serosanguious,4.5,yes,3205,0,0,yes
820,yes,adult,535338,37.2,100.0,20.0,cool,reduced,pale_cyanotic,more_3_sec,...,distend_small,50.0,66.0,serosanguious,2.0,yes,2209,0,0,no
821,yes,adult,529640,39.2,132.0,12.0,cool,reduced,dark_cyanotic,more_3_sec,...,,53.0,7.6,serosanguious,4.5,yes,2205,0,0,no
822,no,adult,5287179,38.3,54.0,66.0,normal,normal,normal_pink,less_3_sec,...,,49.0,8.6,clear,5.0,no,3111,0,0,yes
