# Warm Up: Machine Learning with a Heart
## A competition hosted on DrivenData
### Aim: Given a set of features describing a patient, estimate the likelihood that they have heart disease.

#### Uploading data

In [0]:
from google.colab import files

# Open Colab's file-upload dialog; the result is indexed by uploaded file name.
uploaded = files.upload()

# Confirm each upload by echoing its name and the length of its content.
for fn, payload in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(payload)} bytes')

In [0]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
from sklearn.exceptions import DataConversionWarning

### 1. Creating a combined feature dataframe for preprocessing

In [19]:
# Read the training features, the test features and the training labels.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y = pd.read_csv('train_labels.csv')

# Stack the test rows underneath the training rows so that every preprocessing
# step is applied to both sets at once. The original row labels are kept, so
# the combined index restarts at 0 where the test rows begin.
allfeat = pd.concat([train, test], axis='index')

# Sanity check: the combined row count should be the sum of the two inputs.
print(*(frame.shape for frame in (train, test, allfeat)))

(180, 14) (90, 14) (270, 14)


### 2. Splitting numerical data into categorical ranges for ease of classification

In [0]:
# Number of equal-frequency (quantile) bins to cut each continuous column into.
quantile_bins = {
    'resting_blood_pressure': 10,
    'serum_cholesterol_mg_per_dl': 10,
    'max_heart_rate_achieved': 5,
    'age': 5,
}

# Overwrite each numeric column with its quantile interval. The bin edges are
# computed on the combined train+test frame.
# NOTE: this rewrites `allfeat` in place, so the cell is meant to run once per
# fresh load of the data.
for column, n_bins in quantile_bins.items():
    allfeat[column] = pd.qcut(allfeat[column], n_bins)

### 3. One-hot encoding all categorical variables in data

In [0]:
# (column to encode, prefix for its indicator columns), listed in the order in
# which the indicator groups are appended to the right of the frame. The
# positional rename in the following cell relies on this order, so do not
# reorder the entries without updating that list as well.
one_hot_spec = [
    ('thal', 'thal'),
    ('resting_ekg_results', 'ekg'),
    ('chest_pain_type', 'pain_type'),
    ('max_heart_rate_achieved', 'maxheartrate'),
    ('age', 'age'),
    ('resting_blood_pressure', 'restbp'),
    ('serum_cholesterol_mg_per_dl', 'serum_cholest'),
    ('sex', 'sex'),
]

# For every categorical column: build its indicator columns, remove the source
# column, and append the indicators at the end of the frame.
for column, prefix in one_hot_spec:
    indicators = pd.get_dummies(allfeat[column], prefix=prefix)
    allfeat = pd.concat([allfeat.drop(columns=column), indicators], axis=1)


#### Renaming columns (column headers cannot contain `[`, `]` or `,`)

In [0]:
# The new headers are assigned by position, so the groups below must stay in
# the same order as the columns produced by the encoding step (48 in total).

# Columns that were never encoded.
passthrough_cols = [
    'patient_id',
    'slope_of_peak_exercise_st_segment',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'oldpeak_eq_st_depression',
    'exercise_induced_angina',
]
thal_cols = ['thal_fixed_defect', 'thal_normal', 'thal_reversible_defect']
ekg_cols = ['ekg_%d' % i for i in range(3)]
pain_cols = ['pain_type_%d' % i for i in range(1, 5)]
# The interval bounds are written out with underscores in place of the
# bracket/comma characters of the generated labels.
maxheartrate_cols = [
    'maxheartrate_70.999_128.8',
    'maxheartrate_128.8_147.0',
    'maxheartrate_147.0_159.0',
    'maxheartrate_159.0_170.0',
    'maxheartrate_170.0_202.0',
]
age_cols = [
    'age_28.999_45.0',
    'age_45.0_52.0',
    'age_52.0_58.0',
    'age_58.0_62.2',
    'age_62.2_77.0',
]
# The ten blood-pressure and ten cholesterol bins are simply numbered 1..10.
restbp_cols = ['restbp%d' % i for i in range(1, 11)]
cholest_cols = ['serum_cholest_%d' % i for i in range(1, 11)]
# NOTE(review): presumably sex_0 -> 'fem' and sex_1 -> 'male'; confirm against
# the dataset's data dictionary.
sex_cols = ['fem', 'male']

allfeat.columns = (
    passthrough_cols
    + thal_cols
    + ekg_cols
    + pain_cols
    + maxheartrate_cols
    + age_cols
    + restbp_cols
    + cholest_cols
    + sex_cols
)

### 4. Splitting our combined dataframe into training and test data according to original data indices

In [23]:
# The training rows were stacked first in `allfeat`, followed by the test
# rows. Derive the boundary from the labels frame instead of hard-coding
# 180/270, so the split stays correct if the input files change.
# Assumes train_labels.csv has exactly one row per training row.
n_train = len(y)

train = allfeat.iloc[:n_train]
test = allfeat.iloc[n_train:]

# patient_id is an identifier, not a feature: keep it out of the model inputs
# and out of the target.
X_train = train.drop(columns='patient_id')
X_test = test.drop(columns='patient_id')
y = y.drop(columns='patient_id')

# DataFrame.info() prints its summary itself and returns None, so call it on
# its own line rather than passing it to print().
allfeat.info()
print(train.shape, test.shape)
print('Training data...', train.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 0 to 89
Data columns (total 48 columns):
patient_id                              270 non-null object
slope_of_peak_exercise_st_segment       270 non-null int64
num_major_vessels                       270 non-null int64
fasting_blood_sugar_gt_120_mg_per_dl    270 non-null int64
oldpeak_eq_st_depression                270 non-null float64
exercise_induced_angina                 270 non-null int64
thal_fixed_defect                       270 non-null uint8
thal_normal                             270 non-null uint8
thal_reversible_defect                  270 non-null uint8
ekg_0                                   270 non-null uint8
ekg_1                                   270 non-null uint8
ekg_2                                   270 non-null uint8
pain_type_1                             270 non-null uint8
pain_type_2                             270 non-null uint8
pain_type_3                             270 non-null uint8
pain_typ

### 5. Apply classifier

#### Applying `GridSearchCV()` using an `SVC(kernel='rbf')`

In [0]:
# Trial values for the SVC regularisation parameter C: 0.1, 0.2, ..., 10.0.
param_grid = [{'C': np.arange(0.1, 10.1, 0.1)}]

# probability=True enables predict_proba, which both the neg_log_loss scorer
# and the final prediction need. The probability calibration shuffles the data
# internally, so random_state is fixed to make the results reproducible.
# The `iid` argument is not passed: it was deprecated and later removed from
# GridSearchCV, and passing it raises a TypeError on current scikit-learn.
clf = GridSearchCV(
    SVC(probability=True, random_state=0),
    param_grid,
    cv=10,
    scoring='neg_log_loss',
)

# scikit-learn expects a 1-D target. Flattening the single-column label frame
# avoids the DataConversionWarning at its source instead of silencing it
# globally.
clf.fit(X_train, np.ravel(y))

# One row per test patient, one column per class.
probs = clf.predict_proba(X_test)

#### Storing the probability of heart disease being present

In [25]:
# Discard the first probability column and flatten what remains to 1-D.
# NOTE(review): presumably column 0 is the "no heart disease" class, leaving
# the "present" probability; confirm against clf.classes_.
present_proba = probs[:, 1:].flatten()
print(present_proba.shape)

(90,)


### 6. Store predictions in dataframe with appropriate column names and order, and save as .csv file

In [0]:
# Column order for the submission file: identifier first, prediction second.
swaptitle = ['patient_id', 'heart_disease_present']

# Pair each test patient's id with the predicted probability; the column order
# is fixed at construction time through `columns=`.
op = pd.DataFrame(
    {
        'patient_id': test['patient_id'],
        'heart_disease_present': present_proba,
    },
    columns=swaptitle,
)

# index=False keeps the row index out of the written file.
op.to_csv('Gridsearch_SVC_submission.csv', index=False)