# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from google.colab import files

# Step 1: Upload the dataset
# files.upload() opens the Colab upload prompt; the names of the uploaded
# files are the keys of the mapping it returns.
print("Upload your dataset:")
uploaded = files.upload()  # Prompt for file upload
if not uploaded:
    # Nothing came back (e.g. the prompt was dismissed): fail with a clear
    # message instead of an IndexError from indexing an empty key list.
    raise ValueError("No file was uploaded; please upload a CSV dataset.")
filename = next(iter(uploaded))  # Name of the first uploaded file

# Step 2: Load the dataset
data = pd.read_csv(filename)
print("First few rows of the dataset:")
display(data.head())

# Step 3: Check for the target variable
# Every later step (PCA labelling, train/test split) reads this column, so
# stop early when it is absent.
target_column = 'PCOS (Y/N)'
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset.")

# Step 4: Handle missing values
# Keep only the numeric feature columns. The target is removed explicitly:
# when it is stored as a number, select_dtypes() would otherwise keep it, and
# it would be imputed, scaled and mixed into the principal components --
# leaking the label into the features the classifiers are trained on.
# errors='ignore' covers the case where the target is non-numeric and was
# therefore never selected.
numeric_data = data.select_dtypes(include=['float64', 'int64']).drop(
    columns=[target_column], errors='ignore'
)
if numeric_data.shape[1] == 0:
    raise ValueError("No numeric feature columns available for PCA.")
imputer = SimpleImputer(strategy='mean')  # Replace NaNs with the column mean
data_imputed = imputer.fit_transform(numeric_data)

# Step 5: Standardize the data
# PCA is variance-based, so features are brought to a common scale first.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_imputed)

# Step 6: Perform PCA
# No n_components is given, so all components are kept here; the number
# actually used is chosen later from the cumulative explained variance.
# NOTE(review): imputer, scaler and PCA are fitted on the full dataset before
# the train/test split in Step 11, so test rows influence the transform.
pca = PCA()
pca_result = pca.fit_transform(data_scaled)

# Step 7: Visualize Explained Variance
# Share of total variance carried by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# Components are numbered from 1 on the x axis of both plots.
component_numbers = range(1, len(explained_variance) + 1)

# Scree Plot
scree_fig, scree_ax = plt.subplots(figsize=(8, 5))
scree_ax.plot(component_numbers, explained_variance, marker='o', label='Individual Variance')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Components')
scree_ax.set_ylabel('Explained Variance Ratio')
scree_ax.grid()
scree_ax.legend()
plt.show()

# Cumulative Variance Plot
cumulative_fig, cumulative_ax = plt.subplots(figsize=(8, 5))
cumulative_ax.plot(
    component_numbers,
    cumulative_variance,
    marker='o',
    color='orange',
    label='Cumulative Variance',
)
# Horizontal reference line at the 90% level used for component selection.
cumulative_ax.axhline(y=0.9, color='r', linestyle='--', label='90% Threshold')
cumulative_ax.set_title('Cumulative Explained Variance')
cumulative_ax.set_xlabel('Principal Components')
cumulative_ax.set_ylabel('Cumulative Variance Ratio')
cumulative_ax.grid()
cumulative_ax.legend()
plt.show()

# Step 8: Find number of components explaining ~90% variance
# argmax over the boolean mask gives the first position where the running
# total reaches 0.9; adding 1 turns that zero-based index into a count.
reaches_threshold = cumulative_variance >= 0.9
num_components_90 = reaches_threshold.argmax() + 1
print(f"Number of components to explain ~90% variance: {num_components_90}")

# Step 9: Reduce data using selected components
# Keep the leading components only and label them PC1, PC2, ...
component_labels = [f'PC{number}' for number in range(1, num_components_90 + 1)]
selected_components = pca_result[:, :num_components_90]
pca_df = pd.DataFrame(selected_components, columns=component_labels)
# Add the target column back to the PCA data (rows are matched by position).
pca_df[target_column] = data[target_column].values

# Step 10: EDA with PCA
# Plot the first two principal components, coloured by the target column.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue=target_column, ax=scatter_ax)
scatter_ax.set_title('Scatter Plot of First Two Principal Components')
scatter_ax.grid()
plt.show()

# Step 11: Split the data for classification
# Features are every column of pca_df except the target; 20% of the rows are
# held out for testing, with a fixed seed so the split is repeatable.
feature_columns = [column for column in pca_df.columns if column != target_column]
X = pca_df[feature_columns]  # Features
y = pca_df[target_column]  # Target
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 12: Define and evaluate models
def _voting_estimators(svm_probability):
    """Return fresh (name, estimator) pairs for a VotingClassifier.

    svm_probability: whether the SVM member is built with probability
    estimates enabled. Soft voting needs them; hard voting only uses the
    predicted labels, so it can skip the extra fitting cost.
    """
    return [
        ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
        ("SVM", SVC(kernel='rbf', probability=svm_probability, random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ]


# Define a dictionary of individual models
base_models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Support Vector Machine (Linear Kernel)": SVC(kernel='linear', random_state=42),
    "Support Vector Machine (RBF Kernel)": SVC(kernel='rbf', random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
}

# Define ensemble models
# The wrapped estimator of AdaBoost/Bagging is passed positionally: the keyword
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, while the first positional slot is the same
# parameter under both names.
# "Random Forest" and "Gradient Boosting" use the same configuration as the
# corresponding entries in base_models and are evaluated under both names.
ensemble_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42),
    "Bagging (Decision Tree)": BaggingClassifier(DecisionTreeClassifier(random_state=42), random_state=42),
    "Voting Classifier (Hard)": VotingClassifier(
        estimators=_voting_estimators(svm_probability=False), voting='hard'
    ),
    "Voting Classifier (Soft)": VotingClassifier(
        estimators=_voting_estimators(svm_probability=True), voting='soft'
    ),
}

# Combine all models (base entries first, then ensembles; names are unique)
all_models = {**base_models, **ensemble_models}

# Accuracy of every evaluated model, keyed by its display name
results = {}
report_separator = "-" * 50

print("\n--- Evaluating All Models (Base + Ensemble) ---\n")
for label, classifier in all_models.items():
    print(f"Training and evaluating: {label}")

    # Fit on the training split, then score on the held-out split
    classifier.fit(X_train, y_train)
    y_predicted = classifier.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_predicted)
    results[label] = test_accuracy

    # Report accuracy plus the per-class metrics for this model
    print(
        f"Accuracy for {label}: {test_accuracy:.4f}",
        "Classification Report:",
        classification_report(y_test, y_predicted),
        report_separator,
        sep="\n",
    )

# Step 13: Visualize Model Performance
# One bar per evaluated model, in the order the models were scored.
model_labels = list(results.keys())
model_scores = list(results.values())

accuracy_fig, accuracy_ax = plt.subplots(figsize=(14, 6))
accuracy_ax.bar(model_labels, model_scores, color='skyblue')
accuracy_ax.set_title('Model Accuracy Comparison (Including Ensemble Models)', fontsize=16)
accuracy_ax.set_ylabel('Accuracy', fontsize=14)
accuracy_ax.set_xlabel('Models', fontsize=14)
plt.xticks(rotation=45, fontsize=12)  # Applies to the current axes created above
accuracy_ax.grid(axis='y', linestyle='--', alpha=0.7)
accuracy_fig.tight_layout()
plt.show()

# Print the best model
# max() keeps the first entry among equal scores, i.e. the earliest-evaluated model.
best_model_name, best_accuracy = max(results.items(), key=lambda entry: entry[1])
print(f"The best performing model is: {best_model_name} with accuracy: {best_accuracy:.4f}")

# Step 14: Save PCA results with target
# Written without the index so the CSV holds only the PC columns and the target.
output_csv = 'pca_transformed_data_with_target.csv'
pca_df.to_csv(output_csv, index=False)
print(f"PCA results with target saved as '{output_csv}'")
