# In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# PCA dimensionality analysis for the stroke dataset: load the data, build the
# feature matrix, plot the cumulative explained variance, and keep the smallest
# number of principal components that reaches the variance target.

# Fraction of total variance the reduced representation must retain. Shared by
# the plot's reference line, the component selection, and the printed messages
# so the three cannot drift apart.
VARIANCE_THRESHOLD = 0.95

# Load data.
# NOTE(review): the path points at the raw CSV, so any upstream preprocessing
# is not reflected in this cell -- confirm which file should be read here.
df = pd.read_csv('/content/data/raw/StrokeData.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['bmi']: the in-place call on a column selection is chained assignment,
# which warns on pandas 2.x and no longer modifies df under Copy-on-Write.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())  # Group task placeholder
# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid perfectly collinear dummy columns.
df = pd.get_dummies(
    df,
    columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    drop_first=True,
)

# Placeholder for Member 5's feature selection output.
# Example list; replace with the actual selected features.
selected_features = ['age', 'avg_glucose_level', 'bmi', 'gender_Male', 'ever_married_Yes']
X = df[selected_features]
y = df['stroke']
# NOTE(review): the original comment assumes outlier removal and scaling were
# done earlier, but X is built from the CSV loaded above with neither applied
# in this cell. PCA is sensitive to feature scale, so confirm standardized data
# is fed in before relying on the component count below.

# Fit PCA with all components to see how variance accumulates across them.
pca = PCA()
pca.fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot cumulative explained variance against the number of components.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='-')
plt.title('Cumulative Explained Variance Ratio by PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.axhline(y=VARIANCE_THRESHOLD, color='r', linestyle='--', label=f'{VARIANCE_THRESHOLD:.0%} Variance')
plt.legend()
plt.grid(True)
# Create the output directory first; savefig does not create missing parent
# directories and would otherwise fail with FileNotFoundError.
plot_path = Path('/content/results/eda_visualizations/pca_variance.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
# Save before show(): some backends clear the figure once it is displayed.
plt.savefig(plot_path)
plt.show()

# Determine the optimal number of components. argmax over the boolean mask
# returns the index of the first True, i.e. the first component count whose
# cumulative ratio reaches the threshold; +1 turns the 0-based index into a
# count. The cumulative ratio of a full PCA ends at ~1.0, so a True is expected
# for any threshold below 1.
n_components = int(np.argmax(cumulative_variance >= VARIANCE_THRESHOLD)) + 1
print(f"Optimal number of components to retain {VARIANCE_THRESHOLD:.0%} variance: {n_components}")

# Refit PCA keeping only the selected components and project the data.
pca_reduced = PCA(n_components=n_components)
X_pca_reduced = pca_reduced.fit_transform(X)

# Display results
print(f"Shape of reduced data: {X_pca_reduced.shape}")

# Interpretation
print(
    f"Interpretation: {VARIANCE_THRESHOLD:.0%} variance is retained with {n_components} "
    "components, reducing dimensionality while preserving stroke prediction patterns."
)
