In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Digits experiment: how much accuracy survives when the 64 pixel features are
# compressed to only 2 principal components before classification?

# Step 1: Load Digits dataset
digits = load_digits()
X = digits.data  # Features
y = digits.target  # Labels

# Step 2: Split the data into training and testing sets (80% - 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Scale the data
# The scaler is fitted on the training split only; the test split is transformed
# with the training mean/std so no test statistics leak into the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Apply PCA with n_components = 2
# random_state is pinned because svd_solver='auto' may select the randomized
# SVD solver for a matrix of this shape (depends on the scikit-learn version);
# without a seed the projection, and therefore the reported accuracy, could
# vary from run to run.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Step 5: Train Logistic Regression on the PCA-transformed data
# max_iter is raised well above the default to give the solver room to converge.
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_pca, y_train)

# Step 6: Make predictions on the test set
y_pred = logreg.predict(X_test_pca)

# Step 7: Evaluate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy using PCA with 2 components: {accuracy:.4f}')

Accuracy using PCA with 2 components: 0.5167


In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.svm import SVC

# Heart-disease classification:
#   load -> drop outliers -> split -> encode/scale -> compare models -> PCA.
# All learned preprocessing (one-hot categories, scaling statistics, PCA
# variance profile) is fitted on the training split only, so the test-set
# accuracies printed below are not inflated by test data leaking into training.

# Step 1: Load the dataset
# The data is expected at 'heart.csv' in the working directory. The Colab
# upload prompt is shown only when that file is missing: this lets the cell
# run outside Colab, and avoids re-uploading on every re-run (Colab typically
# stores a re-uploaded file under a new name, leaving 'heart.csv' stale).
DATA_PATH = Path('heart.csv')
uploaded = {}
if not DATA_PATH.exists():
    from google.colab import files  # Colab-only module; imported lazily on purpose
    uploaded = files.upload()

df = pd.read_csv(DATA_PATH)

# Step 2: Remove outliers using Z-score
# Keep a row only if every listed column lies within 3 standard deviations.
# NOTE(review): the z-scores still use statistics from all rows, and rows are
# also dropped from what later becomes the test split - confirm this is intended.
outlier_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
z_scores = stats.zscore(df[outlier_cols])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df_clean = df[filtered_entries].reset_index(drop=True)

# Step 3: Encode categorical variables and apply scaling
# Identify categorical and numerical columns
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numeric_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# One-hot encode categoricals (dropping the first level of each) and
# standardize the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ])

X = df_clean.drop('HeartDisease', axis=1)
y = df_clean['HeartDisease']

# Split the raw rows BEFORE fitting any transformer. Same test_size and
# random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn categories and scaling statistics from the training rows only.
preprocessor.fit(X_train_raw)

# Get feature names after one-hot encoding (encoded categoricals first, then
# the numeric columns - the same order the ColumnTransformer emits them in).
cat_encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
feature_names = list(cat_encoded_cols) + numeric_cols

# Convert processed features back to DataFrames for clarity
X_train = pd.DataFrame(
    preprocessor.transform(X_train_raw), columns=feature_names, index=X_train_raw.index
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test_raw), columns=feature_names, index=X_test_raw.index
)

# Full cleaned dataset pushed through the train-fitted preprocessor; used only
# for the optional export at the end of the cell.
X_processed = preprocessor.transform(X)
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Step 4: Build classification models and evaluate accuracy
models = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate each model
accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Find the best model (on ties, max() keeps the first entry in `models`)
best_model_name = max(accuracies, key=accuracies.get)
print(f"\nBest Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")

# Step 5: Apply PCA and evaluate impact on model accuracy
# Determine number of components to retain 95% variance, measured on the
# training split only.
pca = PCA()
pca.fit(X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first index where the cumulative ratio reaches 0.95;
# +1 converts that zero-based index into a component count.
n_components = int(np.argmax(cumulative_variance >= 0.95) + 1)
print(f"\nNumber of PCA components to retain 95% variance: {n_components}")

# Create pipeline with PCA and the best model
# clone() gives the pipeline a fresh, unfitted copy with the same
# hyper-parameters, so fitting it does not overwrite the baseline estimator
# stored in `models`. PCA is seeded because the 'auto' solver may pick the
# randomized SVD for a truncated decomposition (scikit-learn version dependent).
best_model = models[best_model_name]
pipeline = Pipeline([
    ('pca', PCA(n_components=n_components, random_state=42)),
    ('classifier', clone(best_model))
])

# Train and evaluate model with PCA
pipeline.fit(X_train, y_train)
y_pred_pca = pipeline.predict(X_test)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"{best_model_name} Accuracy with PCA: {accuracy_pca:.4f}")
print(f"Accuracy Change with PCA: {accuracy_pca - accuracies[best_model_name]:.4f}")

# Save the processed dataset (optional)
# Build a separate frame for export instead of adding the label column to
# X_processed_df in place, so the feature frame never contains the target.
processed_with_target = X_processed_df.assign(HeartDisease=y.to_numpy())
processed_with_target.to_csv('processed_heart.csv', index=False)

Saving heart.csv to heart.csv
SVM Accuracy: 0.8889
Logistic Regression Accuracy: 0.8889
Random Forest Accuracy: 0.8889

Best Model: SVM with Accuracy: 0.8889

Number of PCA components to retain 95% variance: 10
SVM Accuracy with PCA: 0.8944
Accuracy Change with钽Impact: 0.0056
