In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, ConfusionMatrixDisplay
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the prepared training table, then separate the target from the features.
df = pd.read_csv('df_att.csv')
y = df['Heart Disease']
# 'Unnamed: 0' is dropped together with the target so it is not used as a feature.
X = df.drop(columns=['Heart Disease', 'Unnamed: 0'])

# 80/20 hold-out split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Hyperparameters for the first CatBoost model, gathered in one place so they
# are easy to read and tweak.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'verbose': 100,  # log progress every 100 iterations
}
model = CatBoostClassifier(**catboost_params)


In [4]:
# Every non-numeric column is handed to CatBoost as a categorical feature.
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()
print(cat_features)

# NOTE(review): the held-out split doubles as the eval_set here, so any
# best-iteration selection sees the same rows that are later used for the
# accuracy score — consider a separate validation split.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
)

['Sex', 'Chest pain type', 'FBS over 120', 'EKG results', 'Exercise angina', 'Slope of ST']
0:	test: 0.9377821	best: 0.9377821 (0)	total: 662ms	remaining: 5m 30s


KeyboardInterrupt: 

In [None]:
# Best value of each tracked metric as recorded by CatBoost during training.
model.get_best_score()

In [None]:
# Predicted class labels for the held-out split, scored by plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Confusion matrix on the held-out split; normalize='true' scales each row
# (true class) to sum to 1.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
)
# The file name has no extension, so matplotlib picks its default save format.
disp.figure_.savefig("Confusion Matrix - First Model")

In [None]:
# Importance score of each feature in the fitted model.
# NOTE(review): presumably in training-column order — confirm against
# model.feature_names_ before reading the array positionally.
model.feature_importances_


In [8]:
# Feature column names of the held-out split.
X_test.columns

Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object')

In [None]:
# How often each distinct 'Exercise angina' value occurs in the held-out split.
X_test['Exercise angina'].value_counts()

In [9]:
# Preview the first rows of the held-out features.
X_test.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
364426,57,M,Common,120,177,N,N,132,N,1.2,Flat,0,3
224752,41,M,Dangerous,132,274,N,P,122,N,0.0,Upsloping,0,3
110423,63,M,Dangerous,140,239,N,P,145,Y,2.2,Flat,1,7
272555,47,M,Dangerous,140,197,N,P,147,Y,1.6,Flat,1,7
199651,50,F,Hard,140,298,N,N,152,N,0.0,Upsloping,0,3


In [None]:
# Persist the fitted model so it can be reloaded later without retraining.
model.save_model('first_model.cbm')

In [None]:
# Best metric value(s) stored on the fitted model.
model.best_score_

In [5]:
# Rebind `model` to a fresh classifier and restore the state saved in
# 'first_model.cbm', so the prediction cells below do not require re-running
# the training cell.
model = CatBoostClassifier()
model.load_model('first_model.cbm')
model

<catboost.core.CatBoostClassifier at 0x282c7d41f90>

In [12]:
# Load the competition test set.
test_raw = pd.read_csv('test.csv')

# Keep the submission ids BEFORE dropping the column: the cell that builds
# `predict1_df` needs `index`, and on a fresh top-to-bottom run nothing has
# defined it by that point.
index = test_raw['id']

# Feature-only frame that is passed to the model.
test = test_raw.drop('id', axis=1)

# NOTE(review): the training features shown earlier hold string categories
# (e.g. Sex 'M'/'F', Chest pain type 'Common'/'Dangerous'), while the rows
# displayed for this file show integer codes for the same columns. Confirm the
# same encoding is applied to `test` before trusting the predictions.
test

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,58,1,3,120,288,0,2,145,1,0.8,2,3,3
1,55,0,2,120,209,0,0,172,0,0.0,1,0,3
2,54,1,4,120,268,0,0,150,1,0.0,2,3,7
3,44,0,3,112,177,0,0,168,0,0.9,1,0,3
4,43,1,1,138,267,0,0,163,0,1.8,2,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
269995,58,1,2,120,222,0,0,172,0,1.0,1,0,7
269996,58,1,4,132,289,0,0,172,0,2.8,2,0,3
269997,63,1,3,108,201,1,0,158,0,0.8,1,0,3
269998,59,1,4,120,274,0,2,163,0,0.5,1,0,3


In [14]:
# Column 1 of predict_proba is taken as the positive-class probability
# (second entry of model.classes_) — TODO confirm which label that is.
predict = model.predict_proba(test)[:,1]
predict

array([0.58041315, 0.01703616, 0.67824862, ..., 0.03871774, 0.02221842,
       0.02357754], shape=(270000,))

In [20]:
# Assemble the submission table. `index` must hold the `id` column of
# test.csv, aligned row-for-row with `predict`.
predict1_df = pd.DataFrame(
    {
        'id': index,
        'predict': predict,
    }
)

predict1_df.head()

Unnamed: 0,id,predict
0,630000,0.580413
1,630001,0.017036
2,630002,0.678249
3,630003,0.010178
4,630004,0.165085


In [18]:
# Re-read the raw test file to recover the `id` column for the submission.
# Note that this rebinds `test` to the raw frame, which still contains `id`.
test = pd.read_csv('test.csv')
index = test['id']

In [22]:
# Write the submission file; index=False leaves out the DataFrame's row index.
predict1_df.to_csv('first_submission.csv', index=False)