1. concat train & horse (remove duplicates)
2. train-test split (from now on assume never seen test), see https://stackoverflow.com/questions/56308116/should-feature-selection-be-done-before-train-test-split-or-after
3. preprocessing pipeline
    * feature engineering
    * missing values in obj cols
    * missing values in num cols
    * feature engineering - one-hot-encode obj cols
    * feature engineering - standardize num cols
    * feature engineering - label-encode outcome
4. train model
5. fit model
6. deploy model

In [1]:
import numpy as np
import pandas as pd
import statistics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [2]:
train = pd.read_csv("train.csv").drop(["id"],axis=1)
#sorted(train.columns)
horse = pd.read_csv("horse.csv")
#sorted(horse.columns)
sorted(train.columns) == sorted(horse.columns)
# test.csv is actually the eval df, need to split test from train + horse

True

In [3]:
# 1. concat train & horse (remove duplicates)
df = pd.concat([train,horse],ignore_index=True).drop_duplicates()
df.info()
# column names is a dictionary, pd.concat accounts for different orders
# no missing data in train, missing data in horse

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1534 entries, 0 to 1533
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                1534 non-null   object 
 1   age                    1534 non-null   object 
 2   hospital_number        1534 non-null   int64  
 3   rectal_temp            1474 non-null   float64
 4   pulse                  1510 non-null   float64
 5   respiratory_rate       1476 non-null   float64
 6   temp_of_extremities    1478 non-null   object 
 7   peripheral_pulse       1465 non-null   object 
 8   mucous_membrane        1487 non-null   object 
 9   capillary_refill_time  1502 non-null   object 
 10  pain                   1479 non-null   object 
 11  peristalsis            1490 non-null   object 
 12  abdominal_distention   1478 non-null   object 
 13  nasogastric_tube       1430 non-null   object 
 14  nasogastric_reflux     1428 non-null   object 
 15  naso

In [4]:
# 2. train-test split
X_train,X_test,y_train,y_test=train_test_split(df.drop(["outcome"],axis=1),
                                               df["outcome"],
                                               test_size=0.2,
                                               random_state=42)

In [5]:
# obj cols
obj_cols = list(X_train.select_dtypes(include='object'))
obj_cols

['surgery',
 'age',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain',
 'peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'surgical_lesion',
 'cp_data']

In [6]:
# inspect obj cols
for i in obj_cols:
    print(X_train[i].unique())
# "none" - valid entry, "None" - missing data, nan - missing data

['yes' 'no']
['adult' 'young']
['cool' 'cold' 'normal' nan 'warm' 'None']
['reduced' 'normal' nan 'increased' 'absent' 'None']
['pale_cyanotic' 'pale_pink' 'bright_red' 'normal_pink' 'bright_pink' nan
 'dark_cyanotic' 'None']
['less_3_sec' 'more_3_sec' nan 'None' '3']
['severe_pain' 'extreme_pain' 'mild_pain' 'depressed' 'alert' nan 'None'
 'slight']
['absent' 'hypomotile' nan 'normal' 'hypermotile' 'None' 'distend_small']
['severe' 'slight' 'moderate' nan 'none' 'None']
['significant' 'none' 'slight' nan 'None']
['more_1_liter' nan 'less_1_liter' 'none' 'None']
['absent' nan 'normal' 'decreased' 'increased' 'None' 'serosanguious']
['distend_large' 'distend_small' nan 'other' 'firm' 'normal' 'None']
['serosanguious' 'cloudy' 'clear' nan 'None']
['yes' 'no']
['no' 'yes']


In [7]:
# num cols
num_cols = list(X_train.select_dtypes(include=["int64","float64"]))
num_cols

['hospital_number',
 'rectal_temp',
 'pulse',
 'respiratory_rate',
 'nasogastric_reflux_ph',
 'packed_cell_volume',
 'total_protein',
 'abdomo_protein',
 'lesion_1',
 'lesion_2',
 'lesion_3']

In [8]:
# inspect num cols
X_train[num_cols].describe().transpose()
# inspect if min & max far from 25% & 75%
# problematic ones - hospital_number, lesion_1, lesion_2, lesion_3
# convert hospital_number to string
# remove lesion_1, lesion_2, lesion_3

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hospital_number,1227.0,980478.109209,1392836.0,518476.0,528890.0,529849.0,534238.5,5305129.0
rectal_temp,1176.0,38.206207,0.7746652,35.4,37.8,38.2,38.6,40.8
pulse,1205.0,78.483817,29.07081,30.0,52.0,72.0,96.0,184.0
respiratory_rate,1177.0,29.957519,16.38711,8.0,18.0,24.0,36.0,96.0
nasogastric_reflux_ph,1026.0,4.417154,1.938035,1.0,2.0,4.5,6.2,7.5
packed_cell_volume,1204.0,49.078073,10.57052,23.0,42.0,48.0,56.0,75.0
total_protein,1200.0,21.970417,26.84566,3.5,6.6,7.5,13.0,89.0
abdomo_protein,1066.0,3.281332,1.63213,0.1,2.0,3.0,4.3,10.1
lesion_1,1227.0,3677.442543,5147.581,0.0,2124.0,2209.0,3205.0,41110.0
lesion_2,1227.0,30.555012,352.918,0.0,0.0,0.0,0.0,7111.0


In [9]:
# 3.1. feature engineering
X_train = X_train.drop(["lesion_1","lesion_2","lesion_3"],axis=1)
#X_train.columns.values
X_train["hospital_number"] = X_train["hospital_number"].astype(str)
#X_train["hospital_number"].info()

In [10]:
# overwrite obj cols
obj_cols = list(X_train.select_dtypes(include='object'))
obj_cols

['surgery',
 'age',
 'hospital_number',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain',
 'peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'surgical_lesion',
 'cp_data']

In [11]:
# overwrite num cols
num_cols = list(X_train.select_dtypes(include=["int64","float64"]))
num_cols

['rectal_temp',
 'pulse',
 'respiratory_rate',
 'nasogastric_reflux_ph',
 'packed_cell_volume',
 'total_protein',
 'abdomo_protein']

###### pipeline below

In [12]:
# test_df
# 3.1. feature engineering
X_test = X_test.drop(["lesion_1","lesion_2","lesion_3"],axis=1)
#X_test.columns.values
X_test["hospital_number"] = X_test["hospital_number"].astype(str)
#X_test["hospital_number"].info()

In [13]:
# train_df & test_df
# 3.2. & 3.3. replace missing values in obj & num cols
def replace_na(train_df,test_df):
    """
    obj cols - replace "None" and nan with mode from train_df
    num cols - replace nan with mean from train_df
    """
    for i in obj_cols:
        train_df[i][(train_df[i]=="None")|(train_df[i].isna()==True)] = statistics.mode(train_df[i])
        test_df[i][(test_df[i]=="None")|(test_df[i].isna()==True)] = statistics.mode(train_df[i])
    for i in num_cols:
        train_df[i][train_df[i].isna()==True] = train_df[i].mean()
        test_df[i][test_df[i].isna()==True] = test_df[i].mean()
    return train_df,test_df

In [14]:
X_train,X_test = replace_na(X_train,X_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i][(train_df[i]=="None")|(train_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[i][(test_df[i]=="None")|(test_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i][(train_df[i]=="None")|(train_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

In [15]:
# test_df
# inspect obj cols
#for i in obj_cols:
    #print(X_test[i].unique())
# inspect num cols
#X_test[num_cols].describe().transpose()

In [16]:
# 3.4. feature engineering - one-hot-encode obj cols
ohe = OneHotEncoder(handle_unknown="ignore") # handle_unknown bc large num of hospital_number
ohe.fit(X_train[obj_cols])

In [17]:
X_train_ohe = pd.DataFrame(ohe.transform(X_train[obj_cols]).toarray()) # sparse matrix, array, df
X_train_ohe.head(10) # re-indexed but order did not change, 335 cols

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,325,326,327,328,329,330,331,332,333,334
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
6,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
7,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
9,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [18]:
# 3.5. feature engineering - standardize num cols
sc = StandardScaler()
sc.fit(X_train[num_cols])

In [19]:
X_train_sc = pd.DataFrame(sc.transform(X_train[num_cols])) # array, df
X_train_sc.columns = X_train[num_cols].columns
X_train_sc.head(10) # re-indexed but order did not change, 7 cols

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein
0,-0.008189,-1.058574,-0.870011,-1.364593,2.190002,1.583776,-0.842664
1,0.519466,0.052651,0.127313,1.175861,-1.536128,-0.582963,0.8014531
2,0.519466,2.830712,-0.62068,0.0,-1.058419,0.0,2.920539e-16
3,-0.272016,2.344551,0.376644,-1.364593,0.756875,1.659141,-0.842664
4,-0.008189,-0.780768,0.625975,1.740406,-0.198543,-0.556585,-0.5138406
5,1.04712,-0.502962,0.376644,1.458134,0.947959,-0.507598,-0.842664
6,0.519466,-1.128026,0.0,0.0,-0.007459,-0.484988,-1.105723
7,-0.272016,0.330457,-0.371349,0.724225,-1.249502,-0.579194,0.2753356
8,-0.667756,-0.780768,0.0,0.0,0.0,0.0,2.920539e-16
9,-0.008189,-1.266929,-0.994676,1.458134,-0.962877,-0.500061,0.2095709


In [20]:
X_train_fe = pd.concat([X_train_sc,X_train_ohe],axis=1)
X_train_fe.head(10) # 342 cols

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,0,1,2,...,325,326,327,328,329,330,331,332,333,334
0,-0.008189,-1.058574,-0.870011,-1.364593,2.190002,1.583776,-0.842664,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.519466,0.052651,0.127313,1.175861,-1.536128,-0.582963,0.8014531,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.519466,2.830712,-0.62068,0.0,-1.058419,0.0,2.920539e-16,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,-0.272016,2.344551,0.376644,-1.364593,0.756875,1.659141,-0.842664,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,-0.008189,-0.780768,0.625975,1.740406,-0.198543,-0.556585,-0.5138406,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
5,1.04712,-0.502962,0.376644,1.458134,0.947959,-0.507598,-0.842664,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
6,0.519466,-1.128026,0.0,0.0,-0.007459,-0.484988,-1.105723,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
7,-0.272016,0.330457,-0.371349,0.724225,-1.249502,-0.579194,0.2753356,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
8,-0.667756,-0.780768,0.0,0.0,0.0,0.0,2.920539e-16,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
9,-0.008189,-1.266929,-0.994676,1.458134,-0.962877,-0.500061,0.2095709,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [21]:
# 3.6. feature engineering - label-encode outcome
le = LabelEncoder()
le.fit(y_train) 

In [22]:
y_train_le = le.transform(y_train) # array
y_train_le[:10] # re-indexed but order did not change

array([1, 2, 0, 1, 2, 2, 2, 0, 0, 0])

###### pipeline below

In [23]:
# test_df
# 3.4. feature engineering - one-hot-encod obj cols
X_test_ohe = pd.DataFrame(ohe.transform(X_test[obj_cols]).toarray())
# 3.5. feature engineering - standardize num cols
X_test_sc = pd.DataFrame(sc.transform(X_test[num_cols]))
X_test_sc.columns = X_test[num_cols].columns
X_test_fe = pd.concat([X_test_sc,X_test_ohe],axis=1)
# 3.6. feature engineering - label-encode outcome
y_test_le = le.transform(y_test)

In [24]:
#X_test_fe.head(10)
#y_test_le.head[:10]

###### lgbm

In [25]:
# 4. train model
lgbm = LGBMClassifier()
lgbm.fit(X_train_fe.values,y_train_le) # input are arrays

In [26]:
# 5. fit model
y_pred = lgbm.predict(X_test_fe.values)
y_pred

array([0, 0, 2, 2, 1, 0, 0, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2, 0, 0, 2,
       0, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2,
       1, 0, 2, 0, 1, 2, 1, 0, 2, 0, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0,
       0, 0, 2, 2, 0, 2, 0, 0, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1,
       0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 2, 0, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2,
       2, 2, 1, 2, 1, 1, 2, 2, 2, 0, 0, 1, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2,
       1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2,
       2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 0,
       1, 0, 0, 2, 2, 2, 1, 0, 1, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 0, 1, 2, 2, 1, 0, 0, 2, 0, 0, 0, 1,
       2, 2, 0, 0, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2,
       0, 2, 2, 1, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,

In [27]:
print(classification_report(y_test_le,y_pred))

              precision    recall  f1-score   support

           0       0.67      0.66      0.66        97
           1       0.78      0.58      0.67        55
           2       0.76      0.83      0.79       155

    accuracy                           0.73       307
   macro avg       0.74      0.69      0.71       307
weighted avg       0.73      0.73      0.73       307



In [31]:
# 6. deploy model
deploy = pd.read_csv("test.csv").drop(["id"],axis=1)
deploy_id = pd.read_csv("test.csv")["id"]
# both indexed from 0

In [39]:
# 3.1. feature engineering
deploy = deploy.drop(["lesion_1","lesion_2","lesion_3"],axis=1)
#deploy.columns.values
deploy["hospital_number"] = deploy["hospital_number"].astype(str)
#deploy["hospital_number"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 824 entries, 0 to 823
Series name: hospital_number
Non-Null Count  Dtype 
--------------  ----- 
824 non-null    object
dtypes: object(1)
memory usage: 6.6+ KB


In [40]:
# 3.2. & 3.3. replace missing values in obj & num cols
X_train,deploy = replace_na(X_train,deploy)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i][(train_df[i]=="None")|(train_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[i][(test_df[i]=="None")|(test_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[i][(train_df[i]=="None")|(train_df[i].isna()==True)] = statistics.mode(train_df[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentat

In [43]:
# inspect obj cols
#for i in obj_cols:
    #print(deploy[i].unique())
# inspect num cols
#deploy[num_cols].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rectal_temp,824.0,38.244539,0.785234,36.0,37.8,38.2,38.6,40.8
pulse,824.0,80.229369,29.164711,36.0,54.0,76.0,100.0,184.0
respiratory_rate,824.0,30.71966,17.43191,9.0,18.0,28.0,36.0,96.0
nasogastric_reflux_ph,824.0,4.508495,1.883464,1.0,3.0,4.5,6.5,7.5
packed_cell_volume,824.0,49.06335,10.450138,23.0,43.0,48.0,55.0,75.0
total_protein,824.0,20.796238,26.413588,3.9,6.6,7.5,8.9,89.0
abdomo_protein,824.0,3.33642,1.539235,0.1,2.0,3.3,4.3,10.1


In [44]:
# 3.4. feature engineering - one-hot-encod obj cols
deploy_ohe = pd.DataFrame(ohe.transform(deploy[obj_cols]).toarray())
# 3.5. feature engineering - standardize num cols
deploy_sc = pd.DataFrame(sc.transform(deploy[num_cols]))
deploy_sc.columns = deploy[num_cols].columns
deploy_fe = pd.concat([deploy_sc,deploy_ohe],axis=1)

In [46]:
#deploy_fe.head(10)

In [49]:
# 5. fit model
y_deploy_le = lgbm.predict(deploy_fe.values)
#y_deploy_le
y_deploy = le.inverse_transform(y_deploy_le)
#y_deploy

array(['lived', 'lived', 'lived', 'euthanized', 'lived', 'died', 'lived',
       'died', 'lived', 'lived', 'died', 'lived', 'lived', 'euthanized',
       'died', 'lived', 'lived', 'lived', 'died', 'died', 'died', 'died',
       'died', 'died', 'died', 'lived', 'lived', 'euthanized', 'lived',
       'died', 'lived', 'died', 'died', 'died', 'lived', 'euthanized',
       'died', 'lived', 'died', 'lived', 'died', 'euthanized', 'died',
       'euthanized', 'lived', 'lived', 'died', 'lived', 'lived', 'lived',
       'euthanized', 'lived', 'died', 'died', 'died', 'lived', 'died',
       'euthanized', 'euthanized', 'died', 'died', 'lived', 'lived',
       'died', 'died', 'lived', 'euthanized', 'died', 'lived', 'lived',
       'died', 'died', 'lived', 'lived', 'euthanized', 'died',
       'euthanized', 'euthanized', 'died', 'died', 'died', 'lived',
       'lived', 'euthanized', 'lived', 'lived', 'lived', 'euthanized',
       'euthanized', 'lived', 'lived', 'lived', 'died', 'euthanized',
       

In [57]:
y_test_pred = pd.concat([deploy_id.to_frame(),
                         pd.DataFrame(y_deploy,columns=["outcome"])],
                        axis=1)
y_test_pred

Unnamed: 0,id,outcome
0,1235,lived
1,1236,lived
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,euthanized
820,2055,euthanized
821,2056,lived
822,2057,lived


In [59]:
y_test_pred.to_csv("y_test_pred.csv",index=False)

In [None]:
# test gitignore