<a href="https://colab.research.google.com/github/NeelvaniVarsha/MLLab/blob/main/Lab11_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the scikit-learn digits bunch and unpack its feature matrix and labels.
digits = load_digits()
X, y = digits.data, digits.target

In [3]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Project the standardized features onto the first two principal components.
# The projection is fit on the training split and reused for the test split.
# random_state pins the result for the case where scikit-learn picks a
# randomized SVD solver (svd_solver='auto' can do so depending on the data
# shape and library version), keeping Restart & Run All reproducible.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [6]:
# Train a logistic-regression classifier on the 2-D PCA features,
# with a high iteration cap for the solver.
logreg = LogisticRegression(max_iter=10_000)
logreg.fit(X=X_train_pca, y=y_train)

In [7]:
# Predict labels for the held-out samples from their PCA projection.
y_pred = logreg.predict(X=X_test_pca)

In [8]:
# Share of held-out samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy using PCA with 2 components: {accuracy:.4f}')

Accuracy using PCA with 2 components: 0.5167


In [9]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [10]:
from pathlib import Path

# Only ask for an upload when heart.csv is not already in the working
# directory. This keeps Restart & Run All non-interactive once the file is
# present and lets the notebook run outside Colab, where google.colab is
# not importable.
uploaded = {}
if not Path('heart.csv').exists():
    from google.colab import files
    uploaded = files.upload()

Saving heart.csv to heart.csv


In [11]:
# Read the heart-disease CSV into the raw frame; the bare name on the
# last line renders it as the cell output.
df = pd.read_csv(filepath_or_buffer='heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [12]:
# Keep only rows where every listed measurement has |z-score| below the
# threshold, then renumber the surviving rows from 0.
# The column list has its own name so it cannot be confused with the
# different `numeric_cols` list that the following cell defines for scaling;
# re-running this cell out of order now always filters on the same columns.
outlier_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
Z_THRESHOLD = 3
z_scores = stats.zscore(df[outlier_cols])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < Z_THRESHOLD).all(axis=1)
df_clean = df[filtered_entries].reset_index(drop=True)

In [13]:
# Columns handed to the one-hot encoder.
categorical_cols = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
]
# Columns handed to the standard scaler.
numeric_cols = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'FastingBS',
    'MaxHR',
    'Oldpeak',
]

In [14]:
# One-hot encode the categorical columns (dropping the first level of each,
# dense output) and standardize the numeric columns.
categorical_step = ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
numeric_step = ('num', StandardScaler(), numeric_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [15]:
# Separate the target from the predictors, then encode/scale the predictors.
# NOTE(review): the preprocessor is fit on every row here, before the
# train/test split performed further down, so its fitted statistics also
# see rows that later end up in the test split.
y = df_clean['HeartDisease']
X = df_clean.drop(columns='HeartDisease')
X_processed = preprocessor.fit_transform(X)

In [16]:
# Rebuild column labels in the same order the transformers were declared:
# generated one-hot column names first, then the numeric column names.
onehot_encoder = preprocessor.named_transformers_['cat']
cat_encoded_cols = onehot_encoder.get_feature_names_out(categorical_cols)
feature_names = [*cat_encoded_cols, *numeric_cols]

In [17]:
# Wrap the transformed matrix in a frame with the reconstructed column labels.
X_processed_df = pd.DataFrame(data=X_processed, columns=feature_names)

In [18]:
# Hold out 20% of the processed rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Candidate classifiers keyed by display name; each gets the same fixed seed.
svm_clf = SVC(kernel='rbf', random_state=42)
logreg_clf = LogisticRegression(max_iter=1000, random_state=42)
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)

models = {
    'SVM': svm_clf,
    'Logistic Regression': logreg_clf,
    'Random Forest': forest_clf,
}

In [20]:
# Fit every candidate on the training split and record its test accuracy
# under the model's display name.
accuracies = {}
for name in models:
    model = models[name].fit(X_train, y_train)  # fit() returns the estimator itself
    y_pred = model.predict(X_test)
    accuracy = accuracies[name] = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

SVM Accuracy: 0.8889
Logistic Regression Accuracy: 0.8889
Random Forest Accuracy: 0.8889


In [21]:
# Pick the display name with the highest recorded accuracy; when several
# models tie, the one that appears first in `accuracies` is chosen.
best_model_name = max(accuracies.keys(), key=lambda model_name: accuracies[model_name])
print(f"\nBest Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")


Best Model: SVM with Accuracy: 0.8889


In [22]:
# Choose the smallest number of principal components whose cumulative
# explained-variance ratio reaches 95%.
# The PCA is fit on the training split only, so held-out rows do not
# influence how many components the downstream pipeline keeps.
pca = PCA()
pca.fit(X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax over a boolean array gives the index of the first True; +1 turns
# that zero-based index into a component count. int() yields a plain Python int.
n_components = int(np.argmax(cumulative_variance >= 0.95)) + 1
print(f"\nNumber of PCA components to retain 95% variance: {n_components}")


Number of PCA components to retain 95% variance: 10


In [23]:
from sklearn.base import clone

# Chain PCA with an unfitted copy of the winning classifier.
# clone() copies the hyper-parameters only, so fitting the pipeline does not
# refit (and thereby overwrite) the baseline estimator stored in `models`.
best_model = clone(models[best_model_name])
pipeline = Pipeline([
    # random_state pins the projection if a randomized SVD solver is selected.
    ('pca', PCA(n_components=n_components, random_state=42)),
    ('classifier', best_model)
])

In [24]:
# Fit PCA + classifier on the training split, score on the test split, and
# report the difference against the same model's accuracy without PCA.
pipeline.fit(X_train, y_train)
y_pred_pca = pipeline.predict(X_test)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"{best_model_name} Accuracy with PCA: {accuracy_pca:.4f}")
# The label previously contained a stray mis-encoded character.
print(f"Accuracy Change with PCA: {accuracy_pca - accuracies[best_model_name]:.4f}")

SVM Accuracy with PCA: 0.8944
Accuracy Change with PCA: 0.0056


In [25]:
# Write features plus target to disk without modifying X_processed_df.
# assign() returns a new frame, so the feature frame stays target-free and
# re-running the train/test split cell cannot pick up HeartDisease as a feature.
X_processed_df.assign(HeartDisease=y).to_csv('processed_heart.csv', index=False)

In [27]:
# Read the exported file back to check it. It gets its own name so the raw
# `df` loaded earlier is not overwritten and the cleaning cells stay re-runnable.
processed_df = pd.read_csv('processed_heart.csv')
processed_df

Unnamed: 0,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.428154,0.465900,0.849636,-0.550362,1.384320,-0.855469,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.475855,1.634714,-0.168122,-0.550362,0.752973,0.137516,1
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.745588,-0.118507,0.793612,-0.550362,-1.535661,-0.855469,0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-0.581666,0.349019,0.149344,-0.550362,-1.141069,0.634008,1
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.053200,1.050307,-0.028064,-0.550362,-0.588640,-0.855469,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-0.899099,-1.287320,0.616205,-0.550362,-0.194048,0.336112,1
895,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.534554,0.699663,-0.046738,1.816985,0.161085,2.520678,1
896,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.370633,-0.118507,-0.625646,-0.550362,-0.864854,0.336112,1
897,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.370633,-0.118507,0.354763,-0.550362,1.463238,-0.855469,1
