In [6]:
import pandas as pd

from catboost import CatBoostClassifier
import numpy as np



In [18]:
target = 'Heart Disease'

# The training data gets its own name: `df` is rebound to the test features
# later in this cell, so sharing the name between both frames invites mix-ups.
train_df = pd.read_csv('df_att.csv')
X = train_df.drop(columns=target)
y = train_df[target]

# Split the feature names by dtype: every non-numeric column is treated as
# categorical, every numeric column as numerical.
cat_features = X.select_dtypes(exclude='number').columns.tolist()
num_features = X.select_dtypes(include='number').columns.tolist()

# Raw test set; it still carries the `id` column and the integer-coded
# categorical columns at this point.
test = pd.read_csv('test.csv')

def _recode(codes, labels, default):
    """Translate the codes of one column into string labels (vectorised).

    Parameters
    ----------
    codes : pd.Series
        Column holding the raw codes.
    labels : dict
        Maps a code to the label that replaces it.
    default : str
        Label given to every value that is not a key of ``labels``
        (missing values included, since they compare unequal to every code).

    Returns
    -------
    list of str
        One label per element of ``codes``, in the original row order.
    """
    conditions = [codes.eq(code).to_numpy() for code in labels]
    recoded = np.select(conditions, list(labels.values()), default=default)
    # A plain list of Python strings keeps the column assignment identical to
    # assigning a hand-built list of labels.
    return recoded.tolist()


df = test.drop(columns='id').copy()

# The integer codes below are labels, not magnitudes: the distance between two
# codes carries no meaning, so each column is turned into a string category.

df['Sex'] = _recode(df['Sex'], {1: 'M'}, default='F')

df['Chest pain type'] = _recode(
    df['Chest pain type'],
    {1: 'Light', 2: 'Common', 3: 'Hard'},
    default='Dangerous',
)

df['FBS over 120'] = _recode(df['FBS over 120'], {1: 'Y'}, default='N')

df['Exercise angina'] = _recode(df['Exercise angina'], {1: 'Y'}, default='N')

df['Slope of ST'] = _recode(
    df['Slope of ST'],
    {1: 'Upsloping', 2: 'Flat'},
    default='Downsloping',
)

# Codes 0 and 1 are merged into 'N'. Per the original author's note they
# relate to Heart Disease in the same way; every other code becomes 'P'.
# As above, the codes have no distance interpretation.
df['EKG results'] = _recode(df['EKG results'], {0: 'N', 1: 'N'}, default='P')



In [20]:
# Preview the recoded test features; as the last expression of the cell the
# frame is rendered as the cell output.
df

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,58,M,Hard,120,288,N,P,145,Y,0.8,Flat,3,3
1,55,F,Common,120,209,N,N,172,N,0.0,Upsloping,0,3
2,54,M,Dangerous,120,268,N,N,150,Y,0.0,Flat,3,7
3,44,F,Hard,112,177,N,N,168,N,0.9,Upsloping,0,3
4,43,M,Light,138,267,N,N,163,N,1.8,Flat,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
269995,58,M,Common,120,222,N,N,172,N,1.0,Upsloping,0,7
269996,58,M,Dangerous,132,289,N,N,172,N,2.8,Flat,0,3
269997,63,M,Hard,108,201,Y,N,158,N,0.8,Upsloping,0,3
269998,59,M,Dangerous,120,274,N,P,163,N,0.5,Upsloping,0,3


In [None]:
# Hyper-parameters of the classifier, gathered in one mapping so they can be
# read and changed in a single place.
# NOTE(review): the non-round values look like the result of a tuning run;
# confirm their provenance.
catboost_params = {
    'iterations': 1233,
    'depth': 5,
    'learning_rate': 0.05563228358640898,
    'l2_leaf_reg': 1.2388924496891942,
    'border_count': 157,
    'cat_features': cat_features,
}

model = CatBoostClassifier(**catboost_params)



In [15]:
# Train on the full training set. `verbose=100` reports the learn loss every
# 100 iterations. The call stays the last expression of the cell, so the
# fitted model is what the cell displays.
fit_options = {'cat_features': cat_features, 'verbose': 100}
model.fit(X, y, **fit_options)

0:	learn: 0.6384375	total: 241ms	remaining: 4m 57s
100:	learn: 0.2726614	total: 21.4s	remaining: 4m
200:	learn: 0.2699810	total: 43.2s	remaining: 3m 41s
300:	learn: 0.2687610	total: 1m 5s	remaining: 3m 23s
400:	learn: 0.2678155	total: 1m 28s	remaining: 3m 3s
500:	learn: 0.2671836	total: 1m 51s	remaining: 2m 43s
600:	learn: 0.2667626	total: 2m 14s	remaining: 2m 21s
700:	learn: 0.2663862	total: 2m 38s	remaining: 1m 59s
800:	learn: 0.2660691	total: 3m 1s	remaining: 1m 38s
900:	learn: 0.2657771	total: 3m 25s	remaining: 1m 15s
1000:	learn: 0.2655011	total: 3m 49s	remaining: 53.2s
1100:	learn: 0.2652563	total: 4m 14s	remaining: 30.5s
1200:	learn: 0.2650337	total: 4m 39s	remaining: 7.45s
1232:	learn: 0.2649604	total: 4m 47s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x224647a0e10>

In [22]:
# Score the recoded test features and keep the column at index 1 of the
# probability matrix.
# NOTE(review): index 1 is presumably the positive class; confirm against
# `model.classes_`.
class_probabilities = model.predict_proba(df)
CBC_prob = class_probabilities[:, 1]
CBC_prob

array([0.95194955, 0.00706706, 0.98871946, ..., 0.04503868, 0.18120991,
       0.02596897], shape=(270000,))

In [23]:
# Persist the predicted probabilities to disk; np.save appends ".npy" to a
# file name that does not already end with it.
np.save(file='CBC_prob', arr=CBC_prob)