In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

In [4]:
data = pd.read_csv("heart.csv")

In [5]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [7]:
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [8]:
data["trestbps"]=np.log(data["trestbps"])

In [9]:
data=data.drop(["fbs"],axis=1)
data=data.drop(["ca"],axis=1)

In [10]:
data["chol"]=np.log(data["chol"])

In [11]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,thal,target
0,63,1,3,4.976734,5.451038,0,150,0,2.3,0,1,1
1,37,1,2,4.867534,5.521461,1,187,0,3.5,0,2,1
2,41,0,1,4.867534,5.31812,0,172,0,1.4,2,2,1
3,56,1,1,4.787492,5.463832,1,178,0,0.8,2,2,1
4,57,0,0,4.787492,5.869297,1,163,1,0.6,2,2,1


In [12]:
target=data["target"]

In [13]:
print(data.shape[1])

12


In [14]:
data=data.drop(["target"],axis=1)
print(data.columns)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang',
       'oldpeak', 'slope', 'thal'],
      dtype='object')


In [15]:
sc= StandardScaler()
data=sc.fit_transform(data)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(data,target, test_size = 0.2, random_state = 42)

In [17]:
lr=LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [18]:
cv_results = cross_validate(lr, X_train,y_train, cv=10)

In [19]:
print(cv_results)

{'fit_time': array([0.03001666, 0.0150013 , 0.03502989, 0.02399516, 0.01999545,
       0.02700257, 0.01999855, 0.02699232, 0.02600598, 0.01899409]), 'score_time': array([0.00199461, 0.00700545, 0.00199819, 0.00201201, 0.00199771,
       0.00300026, 0.00200796, 0.00099897, 0.0009973 , 0.00200391]), 'test_score': array([0.84      , 0.76      , 0.79166667, 0.83333333, 0.83333333,
       0.70833333, 0.79166667, 0.83333333, 0.70833333, 0.875     ])}


In [20]:
score=lr.score(X_test,y_test)

In [21]:
score

0.8688524590163934

In [22]:
from sklearn.ensemble import RandomForestClassifier
reg_rf = RandomForestClassifier()

In [23]:
reg_rf.fit(X_train,y_train)

RandomForestClassifier()

In [24]:
reg_rf.score(X_test,y_test)

0.8524590163934426

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [27]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [28]:
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)

In [29]:
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [30]:
rf_random.best_params_

{'n_estimators': 1100,
 'min_samples_split': 15,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'max_depth': 5}

In [31]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
best_random_grid=rf_random.best_estimator_
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))



[[27  2]
 [ 4 28]]


In [32]:
print("Accuracy Score: {}".format(accuracy_score(y_test,y_pred)))

Accuracy Score: 0.9016393442622951
