In [243]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Original dataset: https://www.kaggle.com/ronitf/heart-disease-uci/
df = pd.read_csv('/Users/igormishurov/Documents/heart.csv')

# Make a copy of a dataset
df_clear = df 
# The dataset has no missing values itself. All datatypes are correct. Target classes are also balanced

# Check distribution of Age feature for 2 target groups 
ax = sns.distplot(df[df['target']== 0]['age'], kde = False)
sns.distplot(df[df['target']== 1]['age'], kde = False, ax = ax)

# Binarize Age feature for logistic regression
df.loc[(df['age'] > 28) & (df['age'] <= 30), 'age'] = 0
df.loc[(df['age'] > 30) & (df['age'] <= 40), 'age'] = 1
df.loc[(df['age'] > 40) & (df['age'] <= 50), 'age'] = 2
df.loc[(df['age'] > 50) & (df['age'] <= 60), 'age'] = 3
df.loc[(df['age'] > 60) & (df['age'] <= 80), 'age'] = 4

# Notice that cp impacts target in dependence of sex
df['sex*cp'] = df['sex'] * df['cp']
sns.factorplot(x = 'sex*cp', y = 'target', data = df)

# Binarize trestbps feature for logistic regression
ax1 = sns.distplot(df[df['target']== 0]['trestbps'], kde = False)
sns.distplot(df[df['target']== 1]['trestbps'], kde = False, ax = ax1)
df.loc[(df['trestbps'] > 90) & (df['trestbps'] <= 110), 'trestbps'] = 0
df.loc[(df['trestbps'] > 110) & (df['trestbps'] <= 120), 'trestbps'] = 1
df.loc[(df['trestbps'] > 120) & (df['trestbps'] <= 130), 'trestbps'] = 2
df.loc[(df['trestbps'] > 130) & (df['trestbps'] <= 140), 'trestbps'] = 3
df.loc[(df['trestbps'] > 140) & (df['trestbps'] <= 150), 'trestbps'] = 4
df.loc[(df['trestbps'] > 150) & (df['trestbps'] <= 160), 'trestbps'] = 5
df.loc[(df['trestbps'] > 160) & (df['trestbps'] <= 170), 'trestbps'] = 6
df.loc[(df['trestbps'] > 170) & (df['trestbps'] <= 180), 'trestbps'] = 7
df.loc[(df['trestbps'] > 180) & (df['trestbps'] <= 190), 'trestbps'] = 8
df.loc[(df['trestbps'] > 190) & (df['trestbps'] <= 200), 'trestbps'] = 9
#Notice that trestbps impacts target in dependence of sex
df['sex*trestbps'] = df['sex'] * df['trestbps']
sns.factorplot(x = 'sex*trestbps', y = 'target', data = df)

#ax2 = sns.distplot(df[df['target']== 0]['chol'], kde = False)
#sns.distplot(df[df['target']== 1]['chol'], kde = False, ax = ax2)
df['chol'].describe()
df.loc[(df['chol'] > 125) & (df['chol'] <= 200), 'chol'] = 0
df.loc[(df['chol'] > 200) & (df['chol'] <= 250), 'chol'] = 1
df.loc[(df['chol'] > 250) & (df['chol'] <= 300), 'chol'] = 2
df.loc[(df['chol'] > 300) & (df['chol'] <= 350), 'chol'] = 3
df.loc[(df['chol'] > 350) & (df['chol'] <= 400), 'chol'] = 4
df.loc[(df['chol'] > 400) & (df['chol'] <= 450), 'chol'] = 5
df.loc[(df['chol'] > 450) & (df['chol'] <= 500), 'chol'] = 6
df.loc[(df['chol'] > 500) & (df['chol'] <= 570), 'chol'] = 7

df['fbs*sex'] = df['fbs']*df['sex']
df['chol*fbs'] = df['fbs']*df['chol']

#sns.factorplot(x = 'restecg', y = 'target', hue = 'age', data = df)
df['restecg*age'] = df['restecg'] * df['age']
df['restecg*chol'] = df['restecg'] * df['age']

ax2 = sns.distplot(df[df['target']== 0]['thalach'], kde = False)
sns.distplot(df[df['target']== 1]['thalach'], kde = False, ax = ax2)
df['thalach'].describe()
df.loc[(df['thalach'] > 70) & (df['thalach'] <= 100), 'thalach'] = 0
df.loc[(df['thalach'] > 100) & (df['thalach'] <= 130), 'thalach'] = 1
df.loc[(df['thalach'] > 130) & (df['thalach'] <= 160), 'thalach'] = 2
df.loc[(df['thalach'] > 160) & (df['thalach'] <= 190), 'thalach'] = 3
df.loc[(df['thalach'] > 190) & (df['thalach'] <= 210), 'thalach'] = 4

#sns.factorplot(x = 'exang', y = 'target', hue = 'fbs', data = df)
df['exang*sex'] = df['exang']*df['sex']
df['exang*age'] = df['exang']*df['age']
df['exang*chol'] = df['exang']*df['chol']
df['exang*restecg'] = df['restecg'] * df['exang']
df['exang*fbs'] = df['exang']*df['fbs']

ax2 = sns.distplot(df[df['target']== 0]['oldpeak'], kde = False)
sns.distplot(df[df['target']== 1]['oldpeak'], kde = False, ax = ax2)
df.loc[(df['oldpeak'] >= 0) & (df['oldpeak'] <= 1), 'oldpeak'] = 0
df.loc[(df['oldpeak'] > 1) & (df['oldpeak'] <= 2), 'oldpeak'] = 1
df.loc[(df['oldpeak'] > 2) & (df['oldpeak'] <= 3), 'oldpeak'] = 2
df.loc[(df['oldpeak'] > 3) & (df['oldpeak'] <= 4), 'oldpeak'] = 3
df.loc[(df['oldpeak'] > 4) & (df['oldpeak'] <= 5), 'oldpeak'] = 4
df.loc[(df['oldpeak'] > 5) & (df['oldpeak'] <= 7), 'oldpeak'] = 5
sns.factorplot(x = 'oldpeak', y = 'target', hue = 'thalach', data = df)
df['oldpeak*sex'] = df['oldpeak'] * df['sex']

sns.factorplot(x = 'slope', y = 'target', hue = 'oldpeak', data = df)
df['slope*sex'] = df['sex'] * df['slope']

sns.factorplot(x = 'ca', y = 'target', hue = 'sex', data = df)
df['ca*sex'] = df['ca'] * df['sex']
df['thal*sex'] = df['thal'] * df['sex']

y = df['target']
df = df.drop('target', axis = 1)

# Get dummy 
df_dummy = pd.get_dummies(df, columns = ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'oldpeak', 'slope', 'ca', 'thal', 'sex*cp', 'sex*trestbps', 'fbs*sex', 'chol*fbs', 'restecg*chol', 
'exang*sex', 'restecg*age', 'exang*chol', 'exang*restecg', 'exang*fbs', 'oldpeak*sex', 'slope*sex', 'ca*sex', 'thal*sex', 'exang*age'])

# Stratisfied Cross validation log for logistic regression
kfold = KFold(n_splits=10)
lr = LogisticRegression()
scores = cross_val_score(lr, df_dummy, y, cv = kfold)
print("Mean cross validation of log reg: {}".format(scores.mean()))

# Stratisfied Cross validation log for random forest (no need, actually)
rf = RandomForestClassifier()
scores = cross_val_score(rf, df_clear.drop('target', axis = 1), y, cv=kfold)
print("Mean cross validation of random forest: {}".format(scores.mean()))







Mean cross validation of log reg: 0.7751612903225806
Mean cross validation of random forest: 0.781505376344086
