In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import warnings
warnings.filterwarnings('ignore')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# Read the Cleveland heart-disease CSV. header=None makes pandas treat the
# first row as data, so the 14 column labels are attached by hand afterwards.
# NOTE(review): the path is an absolute location on one machine.
df = pd.read_csv(
    '/media/raian/Projects/Datasets/UCI-HeartDisease/processed.cleveland_test.csv',
    header=None,
)
df.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(303, 14)

In [4]:
# Binarise the target: any class above 0 (1, 2, 3, 4) is relabelled as 1.
df['target'] = df['target'].mask(df['target'].gt(0), 1)

# How many rows belong to class 1 after the relabelling?
(df['target'] == 1).sum()

139

In [5]:
# Cells holding the string '?' are turned into real NaN values.
df = df.replace(to_replace='?', value=np.nan)

# Per-column count of missing values.
df.isnull().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [6]:
# Impute np.nan with the most frequent value of each column.
# Work on a deep copy so the un-imputed `df` stays available.
df_imp = df.copy(deep=True)
imp = SimpleImputer(strategy='most_frequent')

# fit_transform hands back a bare array, so the column labels AND the row
# index of the source frame are restored explicitly; this keeps df_imp
# row-aligned with df even if df ever carries a non-default index.
df_imp = pd.DataFrame(
    imp.fit_transform(df_imp),
    columns=df_imp.columns,
    index=df_imp.index,
)

# Sanity check: every column should now report zero missing values.
df_imp.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# Cast 'ca' and 'thal' in the original frame to float64, then list all dtypes.
df['ca'] = df['ca'].astype('float64')
df['thal'] = df['thal'].astype('float64')
df.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target        int64
dtype: object

In [8]:
# Cast every column of the imputed frame to float64 and show the result.
df_imp = df_imp.astype(np.float64)
df_imp.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target      float64
dtype: object

In [9]:
# Features: every column of the imputed frame except the label.
X = df_imp.drop(columns=['target'])

# Labels are read from `df` (not `df_imp`), i.e. the binarised 'target' column.
y = df.target

In [10]:
# Fit the scaler on all of X, then wrap the scaled array in a frame that
# reuses X's column labels.
sc = StandardScaler().fit(X)
X_norm = pd.DataFrame(sc.transform(X), columns=X.columns)

# Apply a Classification Algorithm

In [11]:
# define the model

def classification(X_data, cls_name, scoring_method, y_data=None):
    """Return the mean repeated-stratified-k-fold CV score of an estimator.

    Parameters
    ----------
    X_data : array-like
        Feature matrix, passed unchanged to ``cross_val_score``.
    cls_name : estimator
        Estimator instance to evaluate (an object, not a class name).
    scoring_method : str
        Value forwarded as ``scoring`` to ``cross_val_score``
        (the notebook uses names such as ``'accuracy'`` or ``'roc_auc'``).
    y_data : array-like, optional
        Target labels. When omitted, the notebook-level ``y`` is used,
        which is what this function did before the parameter existed.

    Returns
    -------
    Mean of the per-fold scores (10 splits x 3 repeats).

    Notes
    -----
    ``X_data`` is scored exactly as given: a transformer that was fitted on
    the full data set before this call is NOT re-fitted inside each fold.
    """
    # Fall back to the notebook-level target so existing calls keep working.
    targets = y if y_data is None else y_data

    # Fixed random_state: every metric is evaluated on the same splits.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(cls_name, X_data, targets, scoring=scoring_method, cv=cv)
    return n_scores.mean()

# Feature Extraction using LDA (Eigen)

In [12]:
# LDA with the eigen solver and shrinkage='auto', reduced to one component
lda = LDA(n_components=1, shrinkage='auto', solver='eigen')

# Learn the projection from the un-scaled features, then apply it to them.
# NOTE(review): the projection is fitted on every row of X together with y
# before any cross-validation split is made, so held-out folds influence
# the feature that is later scored. Confirm this is intended.
X_lda = lda.fit(X, y).transform(X)


### LR Default

In [13]:
# One-row results table: the method label is set now, the metric columns
# stay empty until the cross-validation cell fills them.
Score_LDA = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR'])
Score_LDA

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,,,,,


In [14]:
# Cross-validate a default LogisticRegression on the LDA feature once per
# metric and store each mean score in the column of the same name.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA[metric] = classification(X_data=X_lda, cls_name=LogisticRegression(), scoring_method=metric)

In [15]:
# Display the filled-in score table for the default LR model.
Score_LDA

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,0.842939,0.854435,0.798718,0.82259,0.91805


### LR (C=0.01) 

# Achieved better Accuracy than RFE

In [16]:
# One-row results table for LR with C=0.01; metric columns start empty.
Score_LDA1 = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR (C=0.01)'])
Score_LDA1

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.01),,,,,


In [17]:
# Score LogisticRegression(C=0.01) on the LDA feature, one metric at a time;
# a fresh estimator is built for every metric.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA1[metric] = classification(X_data=X_lda, cls_name=LogisticRegression(C=0.01), scoring_method=metric)

In [18]:
# Display the filled-in score table for LR (C=0.01).
Score_LDA1

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.01),0.854946,0.89294,0.782051,0.829734,0.91805


### LR (C=0.1)

In [19]:
# One-row results table for LR with C=0.1; metric columns start empty.
Score_LDA2 = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR (C=0.1)'])
Score_LDA2

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.1),,,,,


In [20]:
# Score LogisticRegression(C=0.1) on the LDA feature, one metric at a time.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA2[metric] = classification(X_data=X_lda, cls_name=LogisticRegression(C=0.1), scoring_method=metric)

In [21]:
# Display the filled-in score table for LR (C=0.1).
Score_LDA2

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.1),0.848387,0.864995,0.798718,0.827386,0.91805


# LDA (Eigen) on Normalized Data 

In [22]:
# Re-fit the same `lda` estimator, this time on the standardised features,
# and project them. Refitting overwrites the state learned from X.
X_lda_norm = lda.fit(X_norm, y).transform(X_norm)


### LR Default

In [23]:
# One-row results table for default LR on the normalised-data projection.
Score_LDA_norm = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR'])
Score_LDA_norm

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,,,,,


In [24]:
# Score a default LogisticRegression on the projection of the normalised
# features, one metric at a time.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA_norm[metric] = classification(X_data=X_lda_norm, cls_name=LogisticRegression(), scoring_method=metric)

In [25]:
# Display the filled-in score table for default LR on normalised data.
Score_LDA_norm

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,0.842939,0.854435,0.798718,0.82259,0.91805


# Feature Extraction using LDA (Eigen) WITHOUT SHRINKAGE


In [42]:
# Second LDA estimator: eigen solver, one component, no shrinkage argument
lda1 = LDA(n_components=1, solver='eigen')

# Learn the projection from the un-scaled features, then apply it to them.
X_lda1 = lda1.fit(X, y).transform(X)


### LR Default

In [45]:
# One-row results table for default LR on the no-shrinkage projection.
# NOTE(review): the name Score_LDA1 was already bound in the shrinkage
# section to the 'LR (C=0.01)' table; this assignment replaces that table.
Score_LDA1 = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR'])
Score_LDA1

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,,,,,


In [46]:
# Score a default LogisticRegression on the no-shrinkage LDA feature,
# one metric at a time.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA1[metric] = classification(X_data=X_lda1, cls_name=LogisticRegression(), scoring_method=metric)

In [47]:
# Display the filled-in score table for default LR on the no-shrinkage projection.
Score_LDA1

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR,0.845161,0.858625,0.798718,0.82437,0.920823


### LR (C=0.01)

# BEST SCORE OF LOGISTIC REGRESSION SO FAR 

In [49]:
# One-row results table for LR (C=0.01) on the no-shrinkage projection.
Score_LDA1_C01 = pd.DataFrame(
    columns=['Classification_Method', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
).assign(Classification_Method=['LR (C=0.01)'])
Score_LDA1_C01

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.01),,,,,


In [52]:
# Score LogisticRegression(C=0.01) on the no-shrinkage LDA feature,
# one metric at a time.
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    Score_LDA1_C01[metric] = classification(X_data=X_lda1, cls_name=LogisticRegression(C=0.01), scoring_method=metric)

In [53]:
# Display the filled-in score table for LR (C=0.01) on the no-shrinkage projection.
Score_LDA1_C01

Unnamed: 0,Classification_Method,accuracy,precision,recall,f1,roc_auc
0,LR (C=0.01),0.860502,0.902538,0.784432,0.835456,0.920823


# LDA (Eigen) WITHOUT SHRINKAGE ON Normalized Data

In [57]:
# Re-fit the no-shrinkage estimator `lda1` on the standardised features and
# project them. Refitting overwrites the state learned from X.
X_lda1_norm = lda1.fit(X_norm, y).transform(X_norm)


In [65]:
# Mean CV accuracy of LR (C=0.01) on the no-shrinkage projection of the
# normalised features.
classification(
    X_data=X_lda1_norm,
    cls_name=LogisticRegression(C=0.01),
    scoring_method='accuracy',
)


0.8605017921146952

## Same accuracy on Actual Data & Normalized Data (expected: the LDA projection is unaffected by per-feature standardization)