## <span style="color:aquamarine">**Exercise: PCA on Heart Disease Dataset**</span>
===========================================================
1. `Load heart disease dataset in pandas dataframe`
2. Remove outliers using Z score. Usual guideline is to remove anything that has `Z score > 3 formula or Z score < -3`
3. Convert text columns to numbers using label encoding and one hot encoding
4. Apply scaling
5. Build a classification model using various methods `(SVM, logistic regression, random forest)` and check which model gives you the best accuracy
6. `Use PCA to reduce dimensions`, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv(r"F:\Machine Learning all Algorithms\17 PCA\heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [3]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
df.shape

(918, 12)

### **Caluclate Z-scores for each columns. `np.abs((df - df.mean())/df.std())`**


In [6]:
(df['RestingBP'] - df['RestingBP'].mean())/df['RestingBP'].std()

0      0.410685
1      1.490940
2     -0.129442
3      0.302660
4      0.950812
         ...   
913   -1.209697
914    0.626736
915   -0.129442
916   -0.129442
917    0.302660
Name: RestingBP, Length: 918, dtype: float64

In [7]:
# Zscore for Cholestrol ( one way to do it is this)
df['zcholestrol']=(df['Cholesterol']-df['Cholesterol'].mean())/df['Cholesterol'].std()
df1=df[(df['zcholestrol'] > -3) & (df['zcholestrol'] < 3)]
df1.shape


(915, 13)

In [8]:
# For checking outliers for all numeric columns, we use the ZScore from scipy.stats 

from scipy.stats import zscore

# Select the numerical colums for Z-score calculation
numerical_cols = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# Filter Out Outliers having a Z-scores outside the range of [-3, 3] for these columns
df2 = df[(zscore(df[numerical_cols]) > -3).all(axis=1) & (zscore(df[numerical_cols]) < 3).all(axis=1)]
df2.shape

(899, 13)

In [9]:
# For Categorical Features we check the following: 

df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [10]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [11]:
df['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [12]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [13]:
df3 = df2.copy()
df3.ExerciseAngina.replace({
    'Y': 1,
    'N': 0
}, inplace=True)

df3.ST_Slope.replace({
    'Down': 1,
    'Flat': 2,
    'Up': 3
}, inplace=True)

df3.RestingECG.replace({
    'Normal':1,
    'ST': 2,
    'LVH': 3
}, inplace=True)

df3.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,zcholestrol
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0,0.824621
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1,-0.171867
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0,0.769768
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1,0.138964
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0,-0.034736


In [14]:
df4 = pd.get_dummies(df3, drop_first=True, dtype='int')
df4.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,zcholestrol,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,0.824621,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,-0.171867,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,0.769768,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0.138964,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,-0.034736,1,0,1,0


In [15]:
X = df4.drop(['HeartDisease', 'zcholestrol'], axis=1)
y = df4['HeartDisease']

X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,1,0,1,0


In [16]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X_sclaed=scaler.fit_transform(X)
X_sclaed

array([[-1.42815446,  0.46590022,  0.84963584, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 , -0.1185065 ,  0.79361247, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X_sclaed, y, test_size=0.2, random_state=42)

In [18]:
X_train.shape

(719, 13)

In [19]:
X_test.shape

(180, 13)

### **Finding the best model using GridSearchCV**

In [20]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1,10,20],
            'kernel': ['linear', 'rbf']
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='ovr'),
        'params': {
            'C': [1,10,20]
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10,50,100],
            'criterion': ['gini', 'entropy']
        }
    }
}


In [None]:
# Here we could've used RandomizedSearchCV as well, but GridSearchCV is doable for this small dataset. Also, we use X_scaled for faster compuatation then X.
scores = []

for model_name, mparam in model_params.items():
    clf = GridSearchCV(mparam['model'], mparam['params'], cv=5, return_train_score=False)
    clf.fit(X_sclaed, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    })

df5=pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df5

Unnamed: 0,model,best_score,best_params
0,svm,0.815295,"{'C': 1, 'kernel': 'rbf'}"
1,logistic_regression,0.806375,{'C': 1}
2,random_forest,0.827505,"{'criterion': 'entropy', 'n_estimators': 100}"


### **Here Random Forest Performs the best with the highest score**

### **Use PCA to reduce dimensions**

In [23]:
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,1,132,0,1.2,2,1,0,0,1
914,68,144,193,1,1,141,0,3.4,2,1,0,0,0
915,57,130,131,0,1,115,1,1.2,2,1,0,0,0
916,57,130,236,0,3,174,0,0.0,2,0,1,0,0


In [24]:
from sklearn.decomposition import PCA
pca=PCA(0.95)
X_pca=pca.fit_transform(X)
X_pca

array([[ 93.12912839, -29.67670735],
       [-16.33895199, -14.80374789],
       [ 82.67026321,  38.91313153],
       ...,
       [-68.22650773,  17.69545401],
       [ 40.0272494 , -33.46953106],
       [-20.61297776, -37.61461313]])

In [27]:
X_pca.shape

(899, 2)

In [25]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [26]:
from sklearn.ensemble import RandomForestClassifier
rclf=RandomForestClassifier(n_estimators=10, criterion='entropy')
rclf.fit(X_train_pca, y_train)
rclf.score(X_test_pca, y_test)

0.6722222222222223

### **Here the accuracy has reduced, as we have lost some information due to PCA.**