#Principal Component Analysis (Dimentionality Reduction)
**Principal Component Analysis** is a dimensionality reduction technique used in data science and machine learning. It transforms a high-dimensional dataset into a lower-dimensional one by:

1. Identifying the principal components (directions of maximum variance)

2. Projecting the data onto these components

3. Preserving as much information (variance) as possible

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

##Load dataset

In [2]:
df = pd.read_csv("heart.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


# 1st Way

In [3]:
s_dum = pd.get_dummies(df.Sex)
s_dum

Unnamed: 0,F,M
0,False,True
1,True,False
2,False,True
3,True,False
4,False,True
...,...,...
913,False,True
914,False,True
915,False,True
916,True,False


In [4]:
dum_cp = pd.get_dummies(df.ChestPainType)
dum_cp

Unnamed: 0,ASY,ATA,NAP,TA
0,False,True,False,False
1,False,False,True,False
2,False,True,False,False
3,True,False,False,False
4,False,False,True,False
...,...,...,...,...
913,False,False,False,True
914,True,False,False,False
915,True,False,False,False
916,False,True,False,False


In [5]:
rest_dum = pd.get_dummies(df.RestingECG)
rest_dum

Unnamed: 0,LVH,Normal,ST
0,False,True,False
1,False,True,False
2,False,False,True
3,False,True,False
4,False,True,False
...,...,...,...
913,False,True,False
914,False,True,False
915,False,True,False
916,True,False,False


In [6]:
ex_dum = pd.get_dummies(df.ExerciseAngina)
ex_dum

Unnamed: 0,N,Y
0,True,False
1,True,False
2,True,False
3,False,True
4,True,False
...,...,...
913,True,False
914,True,False
915,False,True
916,True,False


In [7]:
st_dum = pd.get_dummies(df.ST_Slope)
st_dum

Unnamed: 0,Down,Flat,Up
0,False,False,True
1,False,True,False
2,False,False,True
3,False,True,False
4,False,False,True
...,...,...,...
913,False,True,False
914,False,True,False
915,False,True,False
916,False,True,False


In [8]:
X = pd.concat([df['Age'],s_dum, dum_cp, df.loc[:,['RestingBP','Cholesterol','FastingBS']], rest_dum, df['MaxHR'], ex_dum, df['Oldpeak'], st_dum],axis=1)
X

Unnamed: 0,Age,F,M,ASY,ATA,NAP,TA,RestingBP,Cholesterol,FastingBS,LVH,Normal,ST,MaxHR,N,Y,Oldpeak,Down,Flat,Up
0,40,False,True,False,True,False,False,140,289,0,False,True,False,172,True,False,0.0,False,False,True
1,49,True,False,False,False,True,False,160,180,0,False,True,False,156,True,False,1.0,False,True,False
2,37,False,True,False,True,False,False,130,283,0,False,False,True,98,True,False,0.0,False,False,True
3,48,True,False,True,False,False,False,138,214,0,False,True,False,108,False,True,1.5,False,True,False
4,54,False,True,False,False,True,False,150,195,0,False,True,False,122,True,False,0.0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,False,True,False,False,False,True,110,264,0,False,True,False,132,True,False,1.2,False,True,False
914,68,False,True,True,False,False,False,144,193,1,False,True,False,141,True,False,3.4,False,True,False
915,57,False,True,True,False,False,False,130,131,0,False,True,False,115,False,True,1.2,False,True,False
916,57,True,False,False,True,False,False,130,236,0,True,False,False,174,True,False,0.0,False,True,False


In [9]:
y = df.HeartDisease
y

0      0
1      1
2      0
3      1
4      0
      ..
913    1
914    1
915    1
916    1
917    0
Name: HeartDisease, Length: 918, dtype: int64

# Alternative Way

In [10]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [11]:
mean_bp = df.RestingBP.mean()
df['RestingBP'] = df['RestingBP'].replace(0, mean_bp)
df['Cholesterol'] = df['Cholesterol'].replace(0, df.Cholesterol.mean())
df['FastingBS'] = df['FastingBS'].replace(0, df.FastingBS.mean())
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140.0,289.0,0.233115,Normal,172,N,0.0,Up,0
1,49,F,NAP,160.0,180.0,0.233115,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130.0,283.0,0.233115,ST,98,N,0.0,Up,0
3,48,F,ASY,138.0,214.0,0.233115,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150.0,195.0,0.233115,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110.0,264.0,0.233115,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144.0,193.0,1.000000,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130.0,131.0,0.233115,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130.0,236.0,0.233115,LVH,174,N,0.0,Flat,1


In [12]:
df[df.RestingBP==0]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [13]:
df = pd.get_dummies(df)
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140.0,289.0,0.233115,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160.0,180.0,0.233115,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130.0,283.0,0.233115,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138.0,214.0,0.233115,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150.0,195.0,0.233115,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110.0,264.0,0.233115,132,1.2,1,False,True,False,...,False,True,False,True,False,True,False,False,True,False
914,68,144.0,193.0,1.000000,141,3.4,1,False,True,True,...,False,False,False,True,False,True,False,False,True,False
915,57,130.0,131.0,0.233115,115,1.2,1,False,True,True,...,False,False,False,True,False,False,True,False,True,False
916,57,130.0,236.0,0.233115,174,0.0,1,True,False,False,...,False,False,True,False,False,True,False,False,True,False


In [14]:
X = df.drop('HeartDisease',axis='columns')
y = df.HeartDisease
print(X)
print(y)

     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  Sex_F  Sex_M  \
0     40      140.0        289.0   0.233115    172      0.0  False   True   
1     49      160.0        180.0   0.233115    156      1.0   True  False   
2     37      130.0        283.0   0.233115     98      0.0  False   True   
3     48      138.0        214.0   0.233115    108      1.5   True  False   
4     54      150.0        195.0   0.233115    122      0.0  False   True   
..   ...        ...          ...        ...    ...      ...    ...    ...   
913   45      110.0        264.0   0.233115    132      1.2  False   True   
914   68      144.0        193.0   1.000000    141      3.4  False   True   
915   57      130.0        131.0   0.233115    115      1.2  False   True   
916   57      130.0        236.0   0.233115    174      0.0   True  False   
917   38      138.0        175.0   0.233115    173      0.0  False   True   

     ChestPainType_ASY  ChestPainType_ATA  ChestPainType_NAP  \
0          

# Train Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Before PCA

In [16]:
rf = cross_val_score(RandomForestClassifier(n_estimators=35),X_train, y_train)

print(rf)

[0.8707483  0.89795918 0.81632653 0.87755102 0.85616438]


In [17]:
lr = cross_val_score(LogisticRegression(max_iter=5000),X_train, y_train)

print(lr)

[0.85034014 0.91156463 0.80272109 0.85714286 0.81506849]


In [18]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, y_train)
model.score(X_test, y_test)

[WinError 2] The system cannot find the file specified
  File "C:\Users\Ali Computers\AppData\Roaming\Python\Python312\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Program Files\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


0.7228260869565217

# After PCA

## Normalize the Feature Matrix (X) with Standard Scaling


In [20]:
scalar = StandardScaler()
scaled_x = scalar.fit_transform(X)
scaled_x.shape

(918, 20)

## Perform Dimensionality Reduction Using PCA (95% Variance Retained)


In [21]:
pca = PCA(0.95)
x_pca = pca.fit_transform(scaled_x)
x_pca.shape

(918, 13)

#Train test split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.2)

##Train on KNN

In [23]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8532608695652174

##Random Forest

In [24]:
rf = cross_val_score(RandomForestClassifier(n_estimators=35),X_train, y_train)

print(rf)

[0.78911565 0.85034014 0.91156463 0.82312925 0.82876712]


##Logistic Regression

In [25]:
lr = cross_val_score(LogisticRegression(max_iter=150),X_train, y_train)

print(lr)

[0.80952381 0.85034014 0.89795918 0.81632653 0.85616438]


##Try retaining less variance using PCA and determine how it makes a difference.

In [26]:
pca = PCA(0.80)
x_pca = pca.fit_transform(scaled_x)
x_pca.shape

(918, 10)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.2)

In [28]:
rf = cross_val_score(RandomForestClassifier(n_estimators=35),X_train, y_train)

print(rf)

[0.84353741 0.85714286 0.82993197 0.80272109 0.87671233]


In [29]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8315217391304348

In [30]:
lr = cross_val_score(LogisticRegression(max_iter=150),X_train, y_train)

print(lr)

[0.8707483  0.88435374 0.84353741 0.82993197 0.88356164]
