In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [2]:
# reading from csv file
data = pd.read_csv('heart.csv')
data.head()

FileNotFoundError: ignored

### Attribute Information:
## 1-age
## 2-sex
## 3-chest pain type (4 values)
## 4-resting blood pressure
## 5-serum cholesterol in mg/dl
## 6-fasting blood sugar > 120 mg/dl
## 7-resting electrocardiographic results (values 0,1,2)
## 8-maximum heart rate achieved
## 9-exercise induced angina
## 10-oldpeak = ST depression induced by exercise relative to rest
## 11-the slope of the peak exercise ST segment
## 12-number of major vessels (0-3) colored by fluoroscopy
## 13-thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

In [None]:
data.dtypes

## Check some info about the Dataset to (see nulls and DataTypes)

In [None]:
data.info()

# Checking for missing values

In [None]:
data.isnull().sum()

In [None]:
data["target"].value_counts()

# Checking for imbalanced data

In [None]:
# Share of each class in the label column. This reads `data` directly because
# the train/validation/test split (and therefore `y_train`) is only created
# further down in the notebook.
data['target'].value_counts(normalize=True)

In [None]:
# Bar chart of the class counts in the label column. `data` is used because
# `y_train` does not exist yet at this point of the notebook.
# The trailing ';' suppresses the Axes repr in the cell output.
data['target'].value_counts().plot.bar(title='Class counts of "target"');

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampling of the training split; random_state=0 fixes the draw.
# NOTE(review): X_train / y_train are first assigned in the "Splitting dataset"
# cell further down, so this cell raises NameError on Restart & Run All.
ros = RandomOverSampler(sampling_strategy=1.0, random_state=0)
# NOTE(review): `y_train_balaned` looks like a typo of "balanced", and neither
# resampled variable is referenced by any later cell visible in this notebook.
X_train_balanced, y_train_balaned = ros.fit_resample(X_train, y_train)
y_train_balaned.value_counts()

## Show some statistics about the Dataset

In [None]:
data.describe()

## Feature_Importances

In [None]:
# Horizontal bar chart of the fitted model's feature importances.
# NOTE(review): `clf` is only created and fitted in the Random forest section
# further down, so this cell fails with NameError on a fresh kernel.
# NOTE(review): data.columns still contains 'target'; the labels line up only
# if 'target' is the last column (the forest is fit on data.iloc[:, :-1]) - confirm.
features = data.columns
importances = clf.feature_importances_
indices = np.argsort(importances)  # ascending order of importance

plt.title('Feature Importances')
plt.barh(range(len(indices)),importances[indices],color='b')
# Label each bar with the column name at the same sorted position
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative importance')
plt.show()

### `Exploratory Data Analysis` (EDA)

### `Univariate Visualization`

### Check the Distribution of the (chest pain type, `cp`) Feature in Dataset

In [None]:
# Frequency of every category in the 'cp' column, most common first.
# The variable keeps the name `exang_values` because the plotting cell below
# reads it, but the counts (and the printed label) refer to 'cp'.
exang_values = data['cp'].value_counts(ascending=False)
print('categories of (cp) --- \n', exang_values)
print('**'*40)


## Plotting

In [None]:
# Count plot of the 'cp' column with bars ordered like `exang_values`
# (most frequent category first), drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='cp', data=data, order=exang_values.index, ax=ax)
ax.set_title('CountPlot of "cp" Feature in Dataset', fontsize=15, color='k')
ax.set_xlabel('cp', fontsize=20, color='k')  # the axis shows the 'cp' categories
ax.set_ylabel('Counts', fontsize=20, color='k')

# Annotate every bar with its share of all rows; the +5 offset lifts the text
# just above the top of the bar.
total_rows = len(data)
for bar_pos, each_count in enumerate(exang_values.values):
    each_text = f'{((100*each_count)/ total_rows):.2f}%'
    ax.text(x=bar_pos, y=each_count+5, s=each_text, c='b', fontsize=15, ha='center')


## Pairwise scatterplots of all features, colored by the target (`target`)

## Histogram Distr. of Numerical Features in Dataset

In [None]:
# THIS IS GOING TO BE A VERY LARGE PLOT
sns.pairplot(data,hue='target',palette='coolwarm')

In [None]:
data.hist(bins=30, figsize=(20, 10))
plt.show()

## pearson correlation

In [None]:

data.corr()['thal'].sort_values(ascending=False)

## Show This As Heatmap

In [None]:
plt.figure(figsize=(12, 6))
sns.heatmap(data.corr(), annot=True, cmap='Blues', fmt='.2f')  ## symmetric matrix
plt.show()

In [None]:
## using pandas
data['cp'].value_counts().plot(kind='pie', figsize=(10, 6));

## Modeling
## 1-KNN CLASSIFIER

### `Feature Engineering`
``Try to add some features, thinking that it will be more useful and more correlated to the target``

## Standardize the Variables

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(data.drop('target',axis=1))

In [None]:
scaled_features = scaler.transform(data.drop('target',axis=1))

In [None]:
# Wrap the scaled matrix in a DataFrame for inspection. The column names come
# from the same drop('target') selection that was passed to the scaler, so they
# stay aligned with the values wherever 'target' sits in `data`.
df_feat = pd.DataFrame(scaled_features, columns=data.drop('target', axis=1).columns)
df_feat.head()

# Splitting dataset into training, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% as the test set, then split the remainder 70/30 into
# train/validation. Both splits are seeded so the partitions, and every score
# computed from them, are reproducible across kernel restarts.
# NOTE(review): `scaled_features` comes from a scaler fitted on the complete
# dataset, so validation/test statistics leak into training; consider fitting
# the scaler on X_train only.
X, X_test, y, y_test = train_test_split(
    scaled_features, data['target'], test_size=0.30, random_state=0
)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=0)

print('X_train.shape =', X_train.shape)
print('y_train.shape =', y_train.shape)
print('-------------------------------')
print('X_val =', X_val.shape)
print('y_val.shape =', y_val.shape)
print('-------------------------------')
print('X_test =', X_test.shape)
print('y_test.shape =', y_test.shape)

# KNN Training and Validation

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors = 100)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_val = accuracy_score(y_val, y_pred_val)

print('accuracy_train =', accuracy_train)
print('accuracy_val =', accuracy_val)

In [None]:
# Refit KNN with fixed hyper-parameters on the training split and score it on
# the held-out test split.
# NOTE(review): the tuning narrative later in the notebook names
# n_neighbours = 23 as the best value while this cell uses 10 - confirm which
# one is intended.
best_model = KNeighborsClassifier(weights='uniform', n_neighbors=10).fit(X_train, y_train)
y_pred_test = best_model.predict(X_test)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(accuracy_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the KNN test-set predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_test)
f,ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt='.0f', ax=ax)
# Save through the figure handle BEFORE plt.show(): once show() has rendered
# the figure, a later plt.savefig() targets a new, empty figure and writes a
# blank image.
f.savefig('ConfusionMatrix.png')
plt.show()

In [None]:
print(classification_report(y_test,y_pred_test))

# Hyperparameter tuning

## Choosing a K Value


In [None]:
# Misclassification rate of KNN for K = 1..29.
# K is a hyper-parameter, so every candidate is scored on the validation split;
# the test split stays untouched so the final test score is not biased by
# this selection step.
error_rate = []

for i in range(1,30):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_val)
    # Fraction of validation samples whose prediction differs from the label
    error_rate.append(np.mean(pred_i != y_val))

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(1,30),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=1')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

In [None]:
# NOW WITH K=23
# The fitted K matches the value announced in the printed heading, so the
# report below describes the model it claims to describe.
knn = KNeighborsClassifier(n_neighbors=23)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=23')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

## Tuning n_neighbors

In [None]:
# Sweep n_neighbors over 1..30: fit on the training split, then record the
# accuracy on the training and validation splits for every candidate.
n_neighbours_values = list(range(1, 31))
accuracy_train_values, accuracy_val_values = [], []

for n_neighbours in n_neighbours_values:
    model = KNeighborsClassifier(n_neighbors=n_neighbours)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_train_values.append(accuracy_train)

    y_pred_val = model.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    accuracy_val_values.append(accuracy_val)

# One row per candidate K, columns in the order they are plotted later
results = pd.DataFrame(
    dict(
        n_neighbours=n_neighbours_values,
        accuracy_train=accuracy_train_values,
        accuracy_val=accuracy_val_values,
    )
)

In [None]:
results

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

results.plot(x='n_neighbours', y=['accuracy_train', 'accuracy_val'], figsize=(5, 5))
plt.grid(axis='both')

According to the previous plot, we select `n_neighbours = 23` as the best value for `n_neighbours`

In [None]:
# Compare the two KNN weighting schemes at a fixed K, recording train and
# validation accuracy for each.
# NOTE(review): the text above selects n_neighbours = 23, but the sweep below
# fixes n_neighbors=10 - confirm which value is intended.
weights_values = ['uniform', 'distance']
accuracy_train_values, accuracy_val_values = [], []

for weights in weights_values:
    model = KNeighborsClassifier(weights=weights, n_neighbors=10)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    accuracy_train_values.append(accuracy_train)

    y_pred_val = model.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    accuracy_val_values.append(accuracy_val)

# One row per weighting scheme
results = pd.DataFrame(
    dict(
        weights=weights_values,
        accuracy_train=accuracy_train_values,
        accuracy_val=accuracy_val_values,
    )
)

In [None]:
results.plot.bar(x='weights', y=['accuracy_train', 'accuracy_val'])
plt.grid(axis='y')

## 2-NAIVE BAYES CLASSIFIER

In [None]:
# Features are every column except the label; the label is 'target'.
# Selecting by name (as the LR section does) does not depend on column order.
# NOTE(review): this assumes 'target' was the column previously picked by
# iloc[:, -1] - confirm against the CSV.
X = data.drop(columns=['target'])
y = data['target']
# Preview a few rows instead of printing the complete frame and series
X.head()

# Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Only the last expression of a cell is rendered, so both shapes are returned
# together as one tuple.
X_train.shape, X_test.shape

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn mean/std on the training split only, then apply that same
# transformation to the test split. Re-fitting on the test data would scale it
# with its own statistics: the two splits would no longer share a feature
# space and test information would leak into preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print (X_train)

# Training the Naive Bayes model on the Training set

In [None]:

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


# Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred ))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the Naive Bayes test predictions as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
f,ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, linewidths=0.5, linecolor="red", fmt='.0f', ax=ax)
# Save through the figure handle BEFORE plt.show(); calling plt.savefig()
# afterwards would write a blank image.
# NOTE(review): the KNN section saves to the same 'ConfusionMatrix.png', so
# this call overwrites that file - consider a model-specific filename.
f.savefig('ConfusionMatrix.png')
plt.show()

## .....................................................................................

## LR Classifier

# Splitting dataset into input and output

In [None]:
data_input = data.drop(columns=['target'])
data_output = data['target']

# Splitting dataset into train, validation, and test

In [None]:
from sklearn.model_selection import train_test_split

X, X_test, y, y_test = train_test_split(data_input, data_output, test_size=0.30, random_state=0)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=0)

print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('------------------------')
print('X_val:', X_val.shape)
print('y_val:', y_val.shape)
print('------------------------')
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

# Helper function: Evaluate model

In [None]:
from sklearn.metrics import accuracy_score

def eval_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.

    Returns
    -------
    tuple
        ``(training accuracy, validation accuracy)`` as computed by
        ``accuracy_score``.
    """
    model.fit(X_train, y_train)
    # Score the training split first, then the validation split
    splits = ((X_train, y_train), (X_val, y_val))
    return tuple(
        accuracy_score(labels, model.predict(features))
        for features, labels in splits
    )

# Logistic Regression
- Logistic regression is a linear classifier
- Logistic regression benefits from feature scaling (StandardScaler): it helps the solver converge and lets the regularization penalty treat all features equally

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

model = Pipeline([
    ('feature_scaling', StandardScaler()),
    ('logistic_regression', LogisticRegression(random_state=0))
])
eval_model(model, X_train, y_train, X_val, y_val)

# Hyperparameter tuning: C
The C hyperparameter is a regularization hyperparameter. It usually (but not always) has the following effect:
1. Smaller values for C:
    - High regularization
    - Reduces overfitting
    - But too small values can make accuracy very low
2. Larger values for C:
    - Low regularization
    - High training accuracy
    - Can lead to overfitting

In [None]:
# Sweep the regularisation strength C over nine decades. Each candidate gets a
# fresh scaler + logistic-regression pipeline, evaluated with eval_model on the
# train and validation splits.
C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
acc_train_values, acc_val_values = [], []

for C in C_values:
    model = Pipeline(
        steps=[
            ('feature_scaling', StandardScaler()),
            ('logistic_regression', LogisticRegression(random_state=0, C=C)),
        ]
    )
    acc_train, acc_val = eval_model(model, X_train, y_train, X_val, y_val)
    acc_train_values.append(acc_train)
    acc_val_values.append(acc_val)

# One row per candidate C
results = pd.DataFrame(
    dict(C=C_values, acc_train=acc_train_values, acc_val=acc_val_values)
)

results

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

results.plot.bar(x='C', y=['acc_train', 'acc_val'], figsize=(12, 8))
plt.yticks(np.arange(0.0, 1.1, 0.1))
plt.grid(axis='y')
plt.show()

# Testing best model

In [None]:
model_best = Pipeline([
    ('feature_scaling', StandardScaler()),
    ('logistic_regression', LogisticRegression(C=0.0001, random_state=0))
])
model_best.fit(X_train, y_train)
y_pred_test = model_best.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_test)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred_test ))

## .......................................................................................................

## Random Forest Ensemble Learning

### `Feature Engineering`

In [None]:
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [None]:
print(x.shape)
print(y.shape)

# Splitting dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split
x_train ,x_test,y_train,y_test = train_test_split(x,y,random_state = 99)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest using the Gini criterion, trees limited to depth 8, at least
# 10 samples required to split a node; random_state fixes the seed.
clf = RandomForestClassifier(
    criterion="gini",
    max_depth=8,
    min_samples_split=10,
    random_state=5,
)

# Training the RF  model on the Training set

In [None]:
clf.fit(x_train,y_train)

In [None]:
clf.feature_importances_

In [None]:
data.columns

In [None]:
y_pred = clf.predict(x_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test,y_pred ))

## ...........................................................................................

## SVM Classifier