## 1. Load data
- pandas is a convenient package to represent and process tabular data: https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
import os
!pwd

In [None]:
import numpy as np  # numerical arrays and matrix computation
import pandas as pd  # tabular data formatting and processing

# Load the training data and the feature rows used for the final predictions.
# Paths are relative to the notebook's working directory; the commented-out
# line is the Kaggle location of the same training file.
#train_df = pd.read_csv('/kaggle/input/heart-attack/train.csv')
train_df = pd.read_csv('data/train.csv')
prediction_X_df = pd.read_csv('data/test_X.csv')

In [None]:
train_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Draw one point plot of HeartDisease against each input feature.
# sns.catplot is a figure-level function: it always creates its own figure, so the
# previous plt.figure(i) call only produced an extra empty figure per feature.
plot_columns = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
                'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
for col in plot_columns:
    sns.catplot(x=col, y='HeartDisease', data=train_df, kind='point', aspect=2)

In [None]:
train_df.pivot_table('HeartDisease',index='Sex', columns='Age', aggfunc='count')

## 2. Data processing
- Categorical feature -> numerical feature
- Feature scaling: https://scikit-learn.org/stable/modules/preprocessing.html
- ...

In [None]:
# One-hot encode the categorical columns so that every feature is numerical.
def convert_categorical_to_numerical(df):
    """Return a new frame in which the categorical columns are one-hot encoded.

    :param df: pandas DataFrame containing the columns 'Sex', 'ChestPainType',
        'RestingECG', 'ExerciseAngina' and 'ST_Slope'; it is not modified
    :returns: DataFrame holding the remaining columns of ``df`` followed by the
        float indicator columns of each encoded feature (named ``<prefix>_<value>``)
    """
    # source column -> prefix used for its indicator columns
    prefix_by_column = {
        'Sex': 'sex',
        'ChestPainType': 'chest',
        'RestingECG': 'restingECG',
        'ExerciseAngina': 'exerciseAngina',
        'ST_Slope': 'sT_Slope',
    }

    # get_dummies doc: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
    indicator_frames = [
        pd.get_dummies(df[column], prefix=prefix, dtype=float)
        for column, prefix in prefix_by_column.items()
    ]

    # the original categorical columns are replaced by their indicator columns
    remaining = df.drop(columns=list(prefix_by_column))
    return pd.concat([remaining, *indicator_frames], axis=1)

# convert features for training and testing data
my_train_df = convert_categorical_to_numerical(train_df)
my_prediction_X_df = convert_categorical_to_numerical(prediction_X_df)

my_train_df.head(n=10)

In [None]:
# You may apply the feature processing tricks mentioned in class
# e.g., feature normalization/standardization etc


In [None]:
# Temporary experiment: remove two indicator columns from both frames.
# NOTE(review): presumably these are redundant with sex_F / exerciseAngina_Y — confirm.
dropped_indicators = ["sex_M", "exerciseAngina_N"]

my_train_df = my_train_df.drop(columns=dropped_indicators)
my_prediction_X_df = my_prediction_X_df.drop(columns=dropped_indicators)

In [None]:
my_prediction_X_df.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features.
numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
scaler = StandardScaler()

# Learn the per-column mean/std from the training data only ...
my_train_df[numeric_columns] = scaler.fit_transform(my_train_df[numeric_columns])
# ... and apply those same statistics to the prediction data. Re-fitting the scaler
# here would put the prediction features on a different scale from the one the
# model is trained on.
my_prediction_X_df[numeric_columns] = scaler.transform(my_prediction_X_df[numeric_columns])

In [None]:
#ITERATION 2

from sklearn.feature_selection import SelectKBest, f_classif

def select_kbest_clf(data_frame, target, k=2):
    """
    Score every non-target column with the ANOVA F-test via SelectKBest.

    :param data_frame: A pandas DataFrame with the training data
    :param target: name of the target column in data_frame
    :param k: number of features SelectKBest should mark as selected
    :returns feature_scores: pandas DataFrame with one row per feature and the
    columns "F Score", "P Value", "Support" (selected or not) and "Attribute"
    """
    # every column except the target is treated as a candidate feature
    predictors = data_frame.drop(target, axis=1)
    selector = SelectKBest(f_classif, k=k).fit(predictors, data_frame[target])

    return pd.DataFrame({
        "F Score": selector.scores_,
        "P Value": selector.pvalues_,
        "Support": selector.get_support(),
        "Attribute": predictors.columns,
    })



In [None]:

n_feat = 21
kbest_feat = select_kbest_clf(my_train_df, "HeartDisease", k=n_feat)
kbest_feat_sorted = kbest_feat.sort_values(["F Score", "P Value"], ascending=[False, False], ignore_index=True)

# A bare expression in the middle of a cell is not rendered, so show the ranking explicitly.
display(kbest_feat_sorted)

# Keep the names of the top-ranked attributes. head() simply returns fewer rows when
# the ranking is shorter than requested, whereas the previous positional loop raised
# a KeyError in that case.
# NOTE(review): this keeps n_feat - 1 names (exactly as the loop did), not n_feat — confirm intended.
# NOTE(review): every column except the target is ranked, so an identifier column such
# as PatientID can end up in best_features — confirm intended.
best_features = kbest_feat_sorted['Attribute'].head(n_feat - 1).tolist()

print(best_features)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Iteration 2: model inputs are restricted to the selected best features.
# (Iteration 1 used every column except "HeartDisease" and "PatientID":
#  features = my_train_df.drop(["HeartDisease", "PatientID"], axis=1)
#  prediction_X = my_prediction_X_df.drop(["PatientID"], axis=1))
labels = my_train_df['HeartDisease']
features = my_train_df[best_features]
prediction_X = my_prediction_X_df[best_features]

# hold out 20% of the labelled rows for evaluation; fixed seed for a repeatable split
train_X, test_X, train_y, test_y = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Not used in iteration 2: standalone check of how SelectKBest shrinks the feature matrix.
# An earlier variant used chi2 instead of f_classif:
#   from sklearn.feature_selection import SelectKBest, chi2
#   train_X_new = SelectKBest(chi2, k=20).fit_transform(train_X, train_y)
from sklearn.feature_selection import SelectKBest, f_classif

# NOTE: despite its name, this variable holds the reduced feature array, not a classifier.
select_k_best_classifier = SelectKBest(score_func=f_classif, k=10).fit_transform(train_X, y=train_y)

# shape before and after keeping the 10 best-scoring columns
print(train_X.shape, select_k_best_classifier.shape)

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
# prepare features and labels for training/testing



In [None]:
from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators=240, max_depth=8, random_state=5) # 0.96875 0.972644376899696 - 0.8333333333333334 0.8620689655172413
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9947916666666666 0.9953775038520801 - 0.8333333333333334 0.8620689655172413
#model = KNeighborsClassifier(n_neighbors=11) # 0.8958333333333334 0.9093655589123866 - 0.7986111111111112 0.8304093567251463
#model = LogisticRegression(C=0.1) #0.8802083333333334 0.8949771689497718 .8263888888888888 0.8571428571428572
#model = SVC(kernel='rbf', C=1, random_state=5) #0.9131944444444444 0.9242424242424242 0.8333333333333334 0.8636363636363635
#model = SVC(kernel='linear', C=21, random_state=5) #0.8871527777777778 0.9016641452344931 0.8333333333333334 0.8620689655172413

#ITERATION 2
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.8055555555555556 0.8444444444444446 - 0.8055555555555556 0.8444444444444446
#model = RandomForestClassifier(n_estimators=220, max_depth=3, random_state=5) # 0.8472222222222222 0.8682634730538922 - 0.8055555555555556 0.8444444444444446

#ITERATION 3 10s k
#model = RandomForestClassifier(n_estimators=220, max_depth=3, random_state=5) # 0.859375 0.8785607196401799 - 0.7986111111111112 0.8379888268156424
#model = RandomForestClassifier(n_estimators=220, max_depth=5, random_state=5) # 0.8958333333333334 0.9096385542168673 - 0.8125 0.8439306358381502
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.8958333333333334 0.9096385542168673 - 0.8263888888888888 0.8554913294797688
#model = LogisticRegression(C=0.1) #0.8611111111111112 0.8776758409785933 - 0.8125 0.8457142857142856

#ITERATION 4 7s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9097222222222222 0.9204892966360857 - 0.8194444444444444 0.853932584269663
#model = RandomForestClassifier(n_estimators=220, max_depth=5, random_state=5) # 0.8784722222222222 0.8955223880597015 - 0.8194444444444444 0.853932584269663

#ITERATION 4 15s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9930555555555556 0.9938461538461539 - 0.8333333333333334 0.8620689655172413
#model = RandomForestClassifier(n_estimators=220, max_depth=5, random_state=5) # 0.9079861111111112 0.9200603318250377 - 0.8472222222222222 0.875


#ITERATION 5 13s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9913194444444444 0.9922720247295208 - 0.8333333333333334 0.8604651162790696
#model = RandomForestClassifier(n_estimators=220, max_depth=5, random_state=5) # 0.9079861111111112 0.9200603318250377 - 0.8472222222222222 0.875

#ITERATION 6 17s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9965277777777778 0.9969135802469136 - 0.8402777777777778 0.8700564971751412
#model = RandomForestClassifier(n_estimators=220, max_depth=7, random_state=5) # 0.9513888888888888 0.9569230769230769 - 0.8333333333333334 0.8636363636363635


#ITERATION 6 12s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9913194444444444 0.9922958397534669 - 0.8125 0.8439306358381502
#model = RandomForestClassifier(n_estimators=220, max_depth=6, random_state=5) # 0.9184027777777778 0.928462709284627 - 0.8125 0.8421052631578947

#ITERATION 7 9s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9913194444444444 0.9922958397534669 - 0.8125 0.8439306358381502
#model = RandomForestClassifier(n_estimators=250, max_depth=5, random_state=5) # 0.890625 0.9044006069802731 - 0.7986111111111112 0.8361581920903955

#ITERATION 11 9s k
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9861111111111112 0.9876923076923076 - 0.8402777777777778 0.8654970760233917
#model = RandomForestClassifier(n_estimators=220, max_depth=5, random_state=5) # 0.8958333333333334 0.9096385542168673 - 0.8263888888888888 0.8571428571428572
#model = RandomForestClassifier(n_estimators=210, max_depth=5, random_state=5) # 0.8975694444444444 0.9112781954887218 - 0.8194444444444444 0.8505747126436781


#ITERATION 11 10s k No Male No N
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9913194444444444 0.9922958397534669 - 0.8125 0.8439306358381502
#model = RandomForestClassifier(n_estimators=210, max_depth=5, random_state=5) # 0.9913194444444444 0.9922958397534669 - 0.9045138888888888 0.9167927382753404


#ITERATION 12 19s k No Male No N
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9861111111111112 0.9876923076923076 - 0.8263888888888888 0.8571428571428572
#model = RandomForestClassifier(n_estimators=210, max_depth=5, random_state=5) # 0.8993055555555556 0.9129129129129129 - 0.8055555555555556 0.8409090909090909

#ITERATION 13 all  No Male No N
#model = RandomForestClassifier(n_estimators=220, max_depth=10, random_state=5) # 0.9982638888888888 0.9984591679506933 - 0.8402777777777778 0.8715083798882682
#model = RandomForestClassifier(n_estimators=210, max_depth=7, random_state=5) # 0.9565972222222222 0.9618320610687022 - 0.8472222222222222 0.8764044943820224 **

#ITERATION 14 13  No Male No N
#model = RandomForestClassifier(n_estimators=190, max_depth=5, random_state=5) # 0.9097222222222222 0.9214501510574018 - 0.8263888888888888 0.8587570621468928
#model = RandomForestClassifier(n_estimators=160, max_depth=6, random_state=5) # 0.9461805555555556 0.9528158295281582 - 0.8263888888888888 0.8571428571428572

#ITERATION 15 11  No Male No N
#model = RandomForestClassifier(n_estimators=190, max_depth=6, random_state=5) # 0.9322916666666666 0.9404580152671755 - 0.8402777777777778 0.8685714285714287
#model = RandomForestClassifier(n_estimators=210, max_depth=5, random_state=5) # 0.9131944444444444 0.9244712990936556 - 0.7916666666666666 0.8295454545454546

#  8-- 10 -- 11 --12 -15

#FINAL 20
# configuration kept after the experiments logged above (scores noted at the time follow)
model = RandomForestClassifier(max_depth=7, n_estimators=210, random_state=5) # 0.953125 0.9586523736600305 - 0.8472222222222222 0.8764044943820224 **
model.fit(train_X, train_y)

# accuracy and F1 measured on the training split itself
train_y_pred = model.predict(train_X)
train_accuracy = accuracy_score(train_y, train_y_pred)
train_f1 = f1_score(train_y, train_y_pred)
print(train_accuracy, train_f1)

In [None]:
# accuracy and F1 on the held-out split (20% of the labelled data)
test_y_pred = model.predict(test_X)
test_accuracy = accuracy_score(test_y, test_y_pred)
test_f1 = f1_score(test_y, test_y_pred)
print(test_accuracy, test_f1)

In [None]:
# 5-fold cross-validated grid search over forest size and tree depth (seeded forest)
parameters = dict(
    n_estimators=[150, 160, 180, 190, 210, 220],
    max_depth=[5, 6, 7, 10],
)
base_model_rf = RandomForestClassifier(random_state=5)

cv_rf = GridSearchCV(estimator=base_model_rf, param_grid=parameters, cv=5)
cv_rf.fit(train_X, train_y)

print(cv_rf.cv_results_.keys())  # names of everything recorded during cross validation
print(cv_rf.cv_results_['mean_test_score'])  # mean validation score of each grid point

In [None]:
cv_rf.best_estimator_

In [None]:
# 5-fold cross-validated grid search for the SVM, scored by F1.
svc = SVC(random_state=5)
parameters = {
    # 'gausian' was removed: it is not a kernel name SVC accepts. The Gaussian
    # kernel is called 'rbf' in scikit-learn and is already part of the grid.
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 0.9, 1, 2, 10, 11, 13, 21, 25]
}

model_svc = GridSearchCV(svc, parameters, cv=5, scoring='f1')

model_svc.fit(train_X, train_y)
print(model_svc.cv_results_.keys()) # all results for 5-fold cross validation
print(model_svc.cv_results_['mean_test_score']) # mean F1 of each kernel/C combination

In [None]:
model_svc.best_estimator_

## 4. Make predictions and format them into the required submission file

In [None]:
# Predict the label of every row in the prediction set with the fitted model.
# (prediction_X was built from my_prediction_X_df when the features were selected.)
prediction_y_pred = model.predict(prediction_X)

# Assemble the Kaggle submission: one row per patient with its predicted label.
submission_columns = {
    'PatientID': my_prediction_X_df['PatientID'],
    'HeartDisease': prediction_y_pred,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("y_predict_rf-all-feat-v23.csv", index=False)
submission_df.head(20)

In [None]:


from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs of the search give the same scores; this also
# matches the seeded RandomForestClassifier(random_state=5) used by the other search.
base_model_rf = RandomForestClassifier(random_state=5)
parameters = {
    'n_estimators': [220, 240, 250, 255, 270],
    'max_depth': [3,5,6, 7, 8, 10]
}

#RandomForestClassifier(max_depth=7, n_estimators=250)

#'n_estimators': [5, 50, 250],
#'max_depth': [2, 4, 8, 16, 32, None]

cv_rf = GridSearchCV(base_model_rf, parameters, cv=5)
cv_rf.fit(train_X, train_y)

print(cv_rf.cv_results_.keys()) # all results for 5-fold cross validation
print(cv_rf.cv_results_['mean_test_score']) # average validation performance for different hyperparameter values

In [None]:
# Fraction of the labelled rows in the training and hold-out splits.
# train_test_split stored the label splits as train_y / test_y; the names
# y_train / y_test used here before are never defined in this notebook.
for dataset in [train_y, test_y]:
    print(round(len(dataset)/len(labels), 3))

In [None]:
# Iteration 1
from sklearn.preprocessing import StandardScaler

numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
scaler = StandardScaler()
my_train_df[numeric_columns] = scaler.fit_transform(my_train_df[numeric_columns])
# test_X_df is never defined in this notebook; the encoded prediction frame is
# my_prediction_X_df. It is scaled with the statistics learned from the training
# data (transform) rather than re-fitting the scaler on it.
my_prediction_X_df[numeric_columns] = scaler.transform(my_prediction_X_df[numeric_columns])

In [None]:
# Iteration 2: replace Cholesterol values of 0 with the column mean, then standardise.
# The result of replace() is assigned back to the column: calling
# replace(..., inplace=True) on a column selection acts on an intermediate object
# and is not guaranteed to update the frame.
# NOTE(review): each mean still includes the 0 entries it is meant to replace — confirm intended.
mean_train = my_train_df['Cholesterol'].mean()
my_train_df['Cholesterol'] = my_train_df['Cholesterol'].replace(0, mean_train)

# my_test_X_df / test_X_df are never defined in this notebook; the encoded
# prediction frame is my_prediction_X_df.
mean_test = my_prediction_X_df['Cholesterol'].mean()
my_prediction_X_df['Cholesterol'] = my_prediction_X_df['Cholesterol'].replace(0, mean_test)

from sklearn.preprocessing import StandardScaler

numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
scaler = StandardScaler()
my_train_df[numeric_columns] = scaler.fit_transform(my_train_df[numeric_columns])
# reuse the training statistics instead of re-fitting on the prediction data
my_prediction_X_df[numeric_columns] = scaler.transform(my_prediction_X_df[numeric_columns])

In [None]:
# Iteration 3
my_train_df['Cholesterol_valid'] = np.where(my_train_df['Cholesterol'].eq(0), 1, 0)
my_test_X_df['Cholesterol_valid'] = np.where(my_test_X_df['Cholesterol'], 1, 0)


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
my_train_df[['Age','RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(my_train_df[['Age','RestingBP', 'Cholesterol', 'MaxHR','Oldpeak']])
test_X_df[['Age','RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(test_X_df[['Age','RestingBP', 'Cholesterol', 'MaxHR','Oldpeak']])

In [None]:
## The next cells are tests

In [None]:
#NO
#my_train_df['Cholesterol_valid'] = my_train_df[['Cholesterol']].eq(0).any(axis=1)

#my_train_df['eq'] = np.where(df['eq'], 'Y', 'N')

#my_train_df['Cholesterol_valid'] = np.where(my_train_df['Cholesterol'].eq(0), 1, 0)
#my_test_X_df['Cholesterol_valid'] = np.where(my_test_X_df['Cholesterol'], 1, 0)


#mean_train = my_train_df['Cholesterol'].mean()
#my_train_df['Cholesterol'] = np.where(my_train_df['Cholesterol'].eq(0), mean_train, my_train_df['Cholesterol'])

#mean_test = my_test_X_df['Cholesterol'].mean()
#my_test_X_df['Cholesterol'] = np.where(my_test_X_df['Cholesterol'].eq(0), mean_test, my_test_X_df['Cholesterol'])


# Replace Cholesterol values of 0 with the column mean. The result is assigned back
# because replace(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update the frame.
# NOTE(review): each mean still includes the 0 entries it is meant to replace — confirm intended.
mean_train = my_train_df['Cholesterol'].mean()
my_train_df['Cholesterol'] = my_train_df['Cholesterol'].replace(0, mean_train)

# my_test_X_df is never defined in this notebook; the encoded prediction frame is my_prediction_X_df
mean_test = my_prediction_X_df['Cholesterol'].mean()
my_prediction_X_df['Cholesterol'] = my_prediction_X_df['Cholesterol'].replace(0, mean_test)

#my_train_df['Oldpeak'] = my_train_df['Oldpeak'].abs()
#my_test_X_df['Oldpeak'] = my_test_X_df['Oldpeak'].abs()

In [None]:
my_train_df.head(n=5)

In [None]:


#Calculating the Rolling Z-Score
#window_size = 20
#train_X['rolling_zscore'] = (train_X['Cholesterol'] - train_X['Cholesterol'].rolling(window_size).mean()) / train_X['Cholesterol'].rolling(window_size).std()

#Identifying Outliers

#train_X['outlier'] = (train_X['rolling_zscore'] > 3) | (train_X['rolling_zscore'] < -3)

#import scipy.stats as stats
#my_train_df['cholesterol_zscore'] = stats.zscore(my_train_df['Cholesterol'])
#my_train_df.head()

##TODO my_test_X_df

In [None]:
# Dropping  Hillmer

# Cholesterol Many zero values

#my_train_df.groupby('Cholesterol')['Cholesterol'].count()
#my_train_df.groupby('HeartDisease')['HeartDisease'].count()
#my_train_df.groupby('Oldpeak')['Oldpeak'].count()
#my_train_df.groupby('RestingBP')['RestingBP'].count() #80 to 200
#my_train_df.groupby('MaxHR')['MaxHR'].count() #
#my_train_df.groupby('Age')['Age'].count()
#my_train_df['MaxHR']> 160

In [None]:
#NO
# Discarded experiment: fit a MinMaxScaler on the whole training frame and print it.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(my_train_df)  # fit() returns the fitted scaler itself
print(scaler)


In [None]:
#NO
# Discarded experiment: min-max scale the whole training frame.
# NOTE(review): this rescales every column, including the target and any identifier column — confirm.
from sklearn.preprocessing import MinMaxScaler
norm = MinMaxScaler().fit(my_train_df)
# transform() returns a bare NumPy array. Wrap it with the original labels so that
# my_train_df stays a DataFrame and later cells that select columns by name keep working.
my_train_df_norm = pd.DataFrame(
    norm.transform(my_train_df),
    columns=my_train_df.columns,
    index=my_train_df.index,
)
my_train_df = my_train_df_norm

In [None]:
#NO
from sklearn.preprocessing import StandardScaler

scale = StandardScaler().fit(my_train_df)
my_train_df_scaled = scale.transform(my_train_df)
my_train_df_scaled



In [None]:
#NO
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
my_train_df[['Age','RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']] = scaler.fit_transform(my_train_df[['Age','RestingBP', 'Cholesterol', 'MaxHR','Oldpeak']])

In [None]:
#NO
# Discarded experiment: rescale the continuous features with Normalizer.
from sklearn.preprocessing import Normalizer

numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
scaler = Normalizer()
my_train_df[numeric_columns] = scaler.fit_transform(my_train_df[numeric_columns])

# test_X_df is never defined in this notebook; the encoded prediction frame is my_prediction_X_df
my_prediction_X_df[numeric_columns] = scaler.fit_transform(my_prediction_X_df[numeric_columns])



In [None]:
#TEMP
#my_train_df.groupby('FastingBS')['FastingBS'].count() #0 & 1
#my_train_df.groupby('HeartDisease')['HeartDisease'].count() #0 & 1
#my_train_df.groupby('Oldpeak')['Oldpeak'].count() #-2 TO 6.2
#my_train_df[['Oldpeak']] = scaler.fit_transform(my_train_df[['Oldpeak']])


#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#my_train_df[['Cholesterol']] = scaler.fit_transform(my_train_df[['Cholesterol']])

#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#my_train_df[['MaxHR']] = scaler.fit_transform(my_train_df[['MaxHR']])


#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#my_train_df['RestingBP_scal'] = scaler.fit_transform(my_train_df[['RestingBP']])

#from sklearn.preprocessing import Normalizer
#norm = Normalizer()
#my_train_df['RestingBP_norm'] = norm.fit_transform(my_train_df[['RestingBP']])

#my_train_df['RestingBP_ori'] = my_train_df['RestingBP']

In [None]:
my_train_df.head(100)

## 3. Create classifier and fit the data
- sklearn is a convenient package for ML: https://scikit-learn.org/stable/
- you are encouraged to try any ML models: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
- you are encouraged to try model selection methods: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

# prepare features and labels for training/testing
# NOTE: this rebinds train_X / train_y / test_X to the full training frame and the
# prediction frame, replacing any earlier hold-out split stored under those names.

train_X = my_train_df.drop(["HeartDisease", "PatientID"], axis=1)
train_y = my_train_df["HeartDisease"]
# my_test_X_df is never defined in this notebook; the encoded prediction frame is my_prediction_X_df
test_X = my_prediction_X_df.drop(["PatientID"], axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Alternatives tried in this cell (trailing numbers are the scores noted at the time):
#model = RandomForestClassifier(max_depth=7, n_estimators=255) # 0.9347222222222222 0.943577430972389
#model = LogisticRegression(C=0.1) # 0.8763888888888889 0.8934131736526946
model = KNeighborsClassifier(n_neighbors=11).fit(train_X, train_y) # 0.8861111111111111 0.9023809523809524

# accuracy and F1 measured on the training data itself
train_y_pred = model.predict(train_X)
knn_train_scores = (accuracy_score(train_y, train_y_pred), f1_score(train_y, train_y_pred))
print(*knn_train_scores)



In [None]:
# THE NEXT CELLS ARE TESTS


In [None]:

# define and fit your model, with manually set hyperparameter
# e.g., here is an example of KNN classifier, and you may tune the hyperparameter "n_neighbors"
#model = KNeighborsClassifier(n_neighbors=11, weights='uniform', algorithm='auto')
#m1
#model = KNeighborsClassifier(n_neighbors=11)

#m2
#from sklearn.linear_model import LogisticRegression
#model = LogisticRegression(C=0.01)

from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators=250, max_depth=32)
#model = RandomForestClassifier(n_estimators=250, max_depth=7) # Sub 0.86666 0.9375 0.9458483754512635

#model = RandomForestClassifier(n_estimators=250, max_depth=8) # Sub 0.8599 0.9611111111111111 0.9663461538461539
#model = RandomForestClassifier(n_estimators=250, max_depth=10) # Sub 0.83582 0.9833333333333333 0.9854368932038836

#model = RandomForestClassifier(n_estimators=250, max_depth=10) # Sub 0.82587 0.9875 0.9890909090909091


model = RandomForestClassifier(n_estimators=240, max_depth=8)# Sub 0.xxxx 0.9375 0.9458483754512635


#NO SUBMITTED model = RandomForestClassifier(max_depth=8, n_estimators=250) #w Oldpeak abs 0.9611111111111111 0.9663461538461539

#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB(var_smoothing= 0.1) # 0.65

#from sklearn.tree import DecisionTreeClassifier
#model = DecisionTreeClassifier(random_state=0, max_depth=3) 0.22

#from sklearn.svm import SVC
#model = SVC(kernel='rbf', C=1)


model.fit(train_X, train_y)

# training-set accuracy, then F1, each printed on its own line
train_y_pred = model.predict(train_X)
for metric in (accuracy_score, f1_score):
    print(metric(train_y, train_y_pred))

#model = KNeighborsClassifier(n_neighbors=11) 0.66
#0.8861111111111111
#0.9021479713603817

#model = LogisticRegression() 0.02
#0.8722222222222222
#0.8896882494004797

#model = RandomForestClassifier(n_estimators=250, max_depth=32) 0.81
# 1.0
# 1.0

#model = SVC(kernel='rbf', C=1)
#0.9069444444444444
#0.919952210274791


#model = GaussianNB() 0.65306
#0.8569444444444444
#0.8751515151515152

#model = GaussianNB(var_smoothing= 0.1873817422860384)
#0.8666666666666667
#0.8829268292682927

#model = GaussianNB(var_smoothing= 0.1)
#0.8708333333333333
#0.8872727272727274



In [None]:
model.get_params()

In [None]:
#HC model selection: hyperparameter tuning

from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs of the search give the same scores; this also
# matches the seeded RandomForestClassifier(random_state=5) used by the other search.
base_model_rf = RandomForestClassifier(random_state=5)
parameters = {
    'n_estimators': [220, 240, 250, 255, 270],
    'max_depth': [3,5,6, 7, 8, 10]
}

#RandomForestClassifier(max_depth=7, n_estimators=250)

#'n_estimators': [5, 50, 250],
#'max_depth': [2, 4, 8, 16, 32, None]

cv_rf = GridSearchCV(base_model_rf, parameters, cv=5)
cv_rf.fit(train_X, train_y)

print(cv_rf.cv_results_.keys()) # all results for 5-fold cross validation
print(cv_rf.cv_results_['mean_test_score']) # average validation performance for different hyperparameter values

#[0.81666667 0.84444444 0.84861111 0.84166667 0.85972222 0.86944444
# 0.85277778 0.86666667 0.87083333 0.82777778 0.86805556 0.87222222
# 0.84166667 0.85694444 0.87222222 0.83194444 0.86944444 0.86805556]
#MinMaxScaler()

In [None]:
# HC
cv_rf.best_estimator_

In [None]:
from sklearn.svm import SVC

# 5-fold grid search over the SVM kernel and the regularisation strength C
# (no scoring argument is passed, so the estimator's default score is used)
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 0.9, 1, 2, 10],
)
svc = SVC()

model_svc = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
model_svc.fit(train_X, train_y)

print(model_svc.cv_results_.keys())  # names of everything recorded during cross validation
print(model_svc.cv_results_['mean_test_score'])  # mean validation score of each grid point

In [None]:
model_svc.get_params()

In [None]:
from sklearn.tree import DecisionTreeClassifier
# 5-fold grid search over the depth of a decision tree, scored by F1.
base_model_dt = DecisionTreeClassifier()
hyperpara_grid = {'max_depth':[4,5,6,7,8,9,10]} # candidate values for the hyperparameter to try
# (a dead `dt = KNeighborsClassifier()` assignment was removed here: it was
# overwritten by the grid search below before ever being used)
#HC clf = GridSearchCV(base_model, hyperpara_grid, cv=5) # 5-fold cross validation
dt = GridSearchCV(base_model_dt, hyperpara_grid, cv=5, scoring='f1') # 5-fold cross validation
dt.fit(train_X, train_y)
print(dt.cv_results_.keys()) # all results for 5-fold cross validation
print(dt.cv_results_['mean_test_score']) # mean F1 for each max_depth value

In [None]:
# model selection: 5-fold grid search over the number of neighbours, scored by F1
# (single-value variant kept for reference: hyperpara_grid = {'n_neighbors':[11]};
#  the #HC variant ran the same search without the scoring argument)
hyperpara_grid = dict(n_neighbors=[3, 5, 8, 10, 11, 13, 15])
base_model = KNeighborsClassifier()

clf = GridSearchCV(estimator=base_model, param_grid=hyperpara_grid, scoring='f1', cv=5)
clf.fit(train_X, train_y)

print(clf.cv_results_.keys())  # names of everything recorded during cross validation
print(clf.cv_results_['mean_test_score'])  # mean validation F1 for each n_neighbors value

#[0.73402355 0.74541386 0.72588925 0.72236912 0.74330666 0.739037  ]
# w/o                [0.71085231 0.71198864 0.70331137 0.72746104 0.74635724 0.73829504]
#cholesterol z-score [0.7156104  0.7097872  0.70730967 0.72430988 0.74725948 0.73686977]
#MinMaxScaler() [0.72640991 0.74269164 0.73082418 0.73898587 0.76037838 0.754347  ]
#StandardScaler 'Age','RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'
#0.86661442 0.88982141 0.89073001 0.89803115 0.89585915 0.89646112]

In [None]:
clf.best_estimator_

In [None]:

# NOTE(review): lr, tr_features and tr_labels are not defined anywhere in this notebook,
# and `parameters` is whatever grid an earlier cell left behind, so this cell cannot
# run as written. The LogisticRegression grid search further down (cv_lr) looks like
# the working version of the same idea — confirm and remove this cell.
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

In [None]:
clf.best_estimator_.get_params()

In [None]:
#HC model selection: hyperparameter tuning
from sklearn.linear_model import LogisticRegression
# 5-fold grid search over the inverse regularisation strength C of logistic regression
parameters = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
base_model_lr = LogisticRegression()

cv_lr = GridSearchCV(estimator=base_model_lr, param_grid=parameters, cv=5)
cv_lr.fit(train_X, train_y)

print(cv_lr.cv_results_.keys())  # names of everything recorded during cross validation
print(cv_lr.cv_results_['mean_test_score'])  # mean validation score for each C value

#[0.74305556 0.82916667 0.85972222 0.86388889 0.85972222 0.8625 0.85694444]
#MinMaxScaler() [0.71944444 0.82083333 0.86111111 0.85972222 0.85694444 0.85694444 0.85972222]

In [None]:
cv_lr.best_estimator_

In [None]:
from sklearn.naive_bayes import GaussianNB
# 5-fold grid search over GaussianNB's var_smoothing, scored by accuracy
# candidate values: 100 points spaced logarithmically from 1e0 down to 1e-9
parameters = {'var_smoothing': np.logspace(start=0, stop=-9, num=100)}
nb_classifier = GaussianNB()

gs_NB = GridSearchCV(
    nb_classifier,
    parameters,
    scoring='accuracy',
    cv=5,   # use any cross validation technique
    verbose=1,
)
gs_NB.fit(train_X, train_y)

gs_NB.best_params_


In [None]:
clf.cv_results_

## 4. Make predictions and format them into required submission file

In [None]:
# make predictions on test data
# NOTE(review): test_X must hold the prediction-set features (same rows as
# my_prediction_X_df) when this cell runs, otherwise the two columns below differ in length.

#original test_y_pred = model.predict(test_X)
test_y_pred = model.predict(test_X)

# prepare the prediction file to submit on Kaggle
# (my_test_X_df is never defined in this notebook; the encoded prediction frame that
# carries the PatientID column is my_prediction_X_df)
submission_df = pd.DataFrame({
    'PatientID': my_prediction_X_df['PatientID'],
    'HeartDisease': test_y_pred
    }
)
submission_df.to_csv("y_predict.csv", index=False)
submission_df.head(20)