In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import time
import os
import psutil
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Define a colorblind-friendly palette (Okabe-Ito)
COLORBLIND_PALETTE = {
    'blue': '#0072B2',
    'orange': '#E69F00',
    'green': '#009E73',
    'red': '#D55E00',
    'purple': '#CC79A7',
    'yellow': '#F0E442',
    'cyan': '#56B4E9',
    'grey': '#999999'
}

# Colors for the three classes (consistent across all visualizations)
CLASS_COLORS = {
    'FALSE POSITIVE': COLORBLIND_PALETTE['red'],
    'CANDIDATE': COLORBLIND_PALETTE['blue'],
    'CONFIRMED': COLORBLIND_PALETTE['green']
}

# Colors for with/without PCA comparison
PCA_COLORS = {
    'Without PCA': COLORBLIND_PALETTE['orange'],
    'With PCA': COLORBLIND_PALETTE['purple']
}

##  Data Loading and Exploratory Data Analysis

In [3]:
def load_and_preprocess_data(csv_path='koi_data.csv'):
    """
    Load and preprocess the KOI dataset for multi-class classification
    Parameters:
    -----------
    csv_path : str
        Path to the CSV file
    Returns:
    --------
    X_train, X_test, y_train, y_test, feature_names, scaler, class_names
    """
    df = pd.read_csv(csv_path)

    # Create multi-class classification target
    class_mapping = {'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2}
    class_names = ['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED']
    df['target'] = df['koi_disposition'].map(class_mapping)

    # Display class distribution
    class_counts = df['target'].value_counts()

    print("\nClass distribution:")
    for class_id, count in class_counts.items():
        class_name = class_names[class_id]
        percentage = count / len(df) * 100
        print(f"{class_name}: {count} ({percentage:.2f}%)")

    # Select features - exclude non-numerical and target columns
    exclude_cols = ['koi_disposition', 'target']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    # Split data
    X = df[feature_cols]
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Create a scaler for later use
    scaler = StandardScaler()
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, feature_cols, scaler, class_names

##  Principal Component Analysis (PCA)

In [4]:
def perform_pca_analysis(X_train, feature_names, scaler, class_names, y_train, n_components=20):
    """
    Perform PCA analysis to find optimal number of components.
    Parameters:
    -----------
    X_train : DataFrame
        Training data
    feature_names : list
        Names of features
    scaler : StandardScaler
        Fitted scaler
    class_names : list
        Names of classes
    y_train : Series
        Training labels
    n_components : int
        Number of PCA components to analyze
    Returns:
    --------
    optimal_components : int
        Optimal number of components based on 95% variance
    pca : PCA
        Fitted PCA object
    """
    # Scale the data
    X_scaled = scaler.fit_transform(X_train)

    # Apply PCA
    pca = PCA(n_components=min(n_components, len(feature_names)))
    pca.fit(X_scaled)

    # Calculate the optimal number of components to explain 95% variance
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    optimal_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1  # add 1 as indexing starts from 0

    # Now return the optimal components and PCA object
    return optimal_components, pca

## Support Vector Machine Model

In [5]:
# Helper functions
def get_memory_usage():
    """Get current memory usage of the process in MB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024  # Convert to MB

def run_svm_experiment(X_train, X_test, y_train, y_test, feature_names, scaler, class_names, with_pca=True, n_components=10):
    """
    Run SVM experiment with or without PCA for multi-class classification
    Parameters:
    -----------
    X_train, X_test, y_train, y_test : DataFrame/Series
        Training and test data
    feature_names : list
        List of feature names
    scaler : StandardScaler
        Fitted scaler
    class_names : list
        Names of classes
    with_pca : bool
        Whether to use PCA for dimensionality reduction
    n_components : int
        Number of PCA components to use if with_pca is True
    Returns:
    --------
    results : dict
        Dictionary with experiment results
    """
    experiment_name = "SVM with PCA" if with_pca else "SVM without PCA"
    print(f"\n{'='*50}")
    print(f"Running {experiment_name} - Multi-class Classification")
    print(f"{'='*50}")

    # Log initial memory usage
    initial_memory = get_memory_usage()
    print(f"Initial memory usage: {initial_memory:.2f} MB")

    # Create pipeline
    if with_pca:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('pca', PCA(n_components=n_components)),
            ('svm', SVC())
        ])
        print(f"Using PCA with {n_components} components")
    else:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('svm', SVC())
        ])
        print("Using all features without PCA")

    # Define parameter grid for grid search
    param_grid = {
        'svm__C': [0.1, 1, 10, 100],
        'svm__gamma': ['scale', 'auto', 0.1, 0.01],
        'svm__kernel': ['rbf', 'linear', 'poly'],
        'svm__class_weight': ['balanced', None]
    }

    # Create grid search
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
    )

    # Measure training time
    print("\nTraining model...")
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Log memory usage after training
    final_memory = get_memory_usage()
    memory_used = final_memory - initial_memory

    # Get best model
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Training completed in {training_time:.2f} seconds")
    print(f"Memory usage: {memory_used:.2f} MB")
    print(f"Best parameters: {best_params}")

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("\nModel performance:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")

    # Print detailed classification report
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Create interactive confusion matrix
    cm_fig = px.imshow(
        cm,
        x=class_names,
        y=class_names,
        text_auto=True,
        color_continuous_scale='Blues',
        labels=dict(x="Predicted", y="True", color="Count"),
        title=f"Confusion Matrix - {experiment_name}"
    )
    cm_fig.update_layout(
        title={
            'text': f"Confusion Matrix - {experiment_name}",
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        height=600,
        width=700,
        xaxis=dict(
            title="Predicted Class",
            tickangle=-45
        ),
        yaxis=dict(
            title="True Class"
        )
    )

    # Calculate class-specific metrics
    class_precision, class_recall, class_f1, support = precision_recall_fscore_support(
        y_test, y_pred, labels=range(len(class_names))
    )

    # Create interactive grouped bar chart
    class_metrics_fig = go.Figure()

    # Add trace for Precision
    class_metrics_fig.add_trace(go.Bar(
        x=class_names,
        y=class_precision,
        name='Precision',
        marker_color=COLORBLIND_PALETTE['blue'],
        text=[f"{val:.2f}" for val in class_precision],
        textposition='auto',
        textfont=dict(size=14),
        hovertemplate='Metric=Precision<br>Class=%{x}<br>Score=%{y:.4f}<br>Support=%{customdata}<extra></extra>',
        customdata=support
    ))

    # Add trace for Recall
    class_metrics_fig.add_trace(go.Bar(
        x=class_names,
        y=class_recall,
        name='Recall',
        marker_color=COLORBLIND_PALETTE['orange'],
        text=[f"{val:.2f}" for val in class_recall],
        textposition='auto',
        textfont=dict(size=14),
        hovertemplate='Metric=Recall<br>Class=%{x}<br>Score=%{y:.4f}<br>Support=%{customdata}<extra></extra>',
        customdata=support
    ))

    # Add trace for F1 Score
    class_metrics_fig.add_trace(go.Bar(
        x=class_names,
        y=class_f1,
        name='F1 Score',
        marker_color=COLORBLIND_PALETTE['green'],
        text=[f"{val:.2f}" for val in class_f1],
        textposition='auto',
        textfont=dict(size=14),
        hovertemplate='Metric=F1 Score<br>Class=%{x}<br>Score=%{y:.4f}<br>Support=%{customdata}<extra></extra>',
        customdata=support
    ))

    # Update layout
    class_metrics_fig.update_layout(
        title={
            'text': f"Class-specific Performance Metrics - {experiment_name}",
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=18)
        },
        xaxis_title=dict(text="Exoplanet Class", font=dict(size=14)),
        yaxis_title=dict(text="Score", font=dict(size=14)),
        yaxis=dict(range=[0, 1.0]),
        legend=dict(
            title=dict(text="Metric", font=dict(size=14)),
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12)
        ),
        barmode='group',
        height=550,
        width=950,
        plot_bgcolor='rgba(240, 240, 240, 0.8)'
    )

    # Store results
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'training_time': training_time,
        'memory_usage': memory_used,
        'best_params': best_params,
        'class_precision': class_precision,
        'class_recall': class_recall,
        'class_f1': class_f1,
        'support': support,
        'confusion_matrix': cm,
        'cm_fig': cm_fig,
        'class_metrics_fig': class_metrics_fig
    }
    return results

## Results Comparison

In [6]:
def compare_results(results_without_pca, results_with_pca, class_names):
    """
    Compare and visualize results with and without PCA for multi-class classification
    Parameters:
    -----------
    results_without_pca : dict
        Results dictionary for SVM without PCA
    results_with_pca : dict
        Results dictionary for SVM with PCA
    class_names : list
        Names of classes
    """
    print("\n\n" + "="*80)
    print("COMPARISON OF RESULTS: SVM WITHOUT PCA vs. SVM WITH PCA")
    print("="*80)
    # Compare metrics
    metrics = ['accuracy', 'precision', 'recall', 'f1']
    print("\nOverall Performance Metrics:")
    print(f"{'Metric':<12} {'Without PCA':<15} {'With PCA':<15} {'Difference':<12} {'% Change':<10}")
    print("-"*65)
    for metric in metrics:
        without_pca = results_without_pca[metric]
        with_pca = results_with_pca[metric]
        diff = with_pca - without_pca
        pct_change = (diff / without_pca) * 100 if without_pca != 0 else float('inf')
        print(f"{metric.capitalize():<12} {without_pca:.4f} {' '*8} {with_pca:.4f} {' '*8} {diff:.4f} {' '*6} {pct_change:+.2f}%")

    # Create interactive bar chart for metrics comparison
    perf_comparison_fig = go.Figure()

    # Define values for both approaches
    without_pca_values = [results_without_pca[m] for m in metrics]
    with_pca_values = [results_with_pca[m] for m in metrics]

    # Add trace for Without PCA
    perf_comparison_fig.add_trace(go.Bar(
        x=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        y=without_pca_values,
        name='Without PCA',
        marker_color=PCA_COLORS['Without PCA'],
        text=[f"{val:.3f}" for val in without_pca_values],
        textposition='auto',
        hovertemplate='Metric=%{x}<br>Score=%{y:.4f}<br>Approach=Without PCA<extra></extra>'
    ))

    # Add trace for With PCA
    perf_comparison_fig.add_trace(go.Bar(
        x=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        y=with_pca_values,
        name='With PCA',
        marker_color=PCA_COLORS['With PCA'],
        text=[f"{val:.3f}" for val in with_pca_values],
        textposition='auto',
        hovertemplate='Metric=%{x}<br>Score=%{y:.4f}<br>Approach=With PCA<extra></extra>'
    ))

    # Update layout
    perf_comparison_fig.update_layout(
        title={
            'text': 'Performance Metrics: With vs. Without PCA',
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        xaxis_title="Metric",
        yaxis_title="Score",
        yaxis=dict(range=[0, 1.0]),
        legend=dict(
            title="",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        barmode='group'
    )

    # Create interactive bar chart for class-specific F1 scores
    f1_comparison_fig = go.Figure()

    # Get support values from results
    support_values = results_without_pca.get('support', np.ones(len(class_names)))

    # Add trace for Without PCA
    f1_comparison_fig.add_trace(go.Bar(
        x=class_names,
        y=results_without_pca['class_f1'],
        name='Without PCA',
        marker_color=PCA_COLORS['Without PCA'],
        text=[f"{val:.3f}" for val in results_without_pca['class_f1']],
        textposition='auto',
        hovertemplate='Class=%{x}<br>F1 Score=%{y:.4f}<br>Approach=Without PCA<br>Support=%{customdata}<extra></extra>',
        customdata=support_values
    ))

    # Add trace for With PCA
    f1_comparison_fig.add_trace(go.Bar(
        x=class_names,
        y=results_with_pca['class_f1'],
        name='With PCA',
        marker_color=PCA_COLORS['With PCA'],
        text=[f"{val:.3f}" for val in results_with_pca['class_f1']],
        textposition='auto',
        hovertemplate='Class=%{x}<br>F1 Score=%{y:.4f}<br>Approach=With PCA<br>Support=%{customdata}<extra></extra>',
        customdata=support_values
    ))

    # Update layout
    f1_comparison_fig.update_layout(
        title={
            'text': 'F1 Score by Class: With vs. Without PCA',
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        xaxis_title="Exoplanet Class",
        yaxis_title="F1 Score",
        yaxis=dict(range=[0, 1.0]),
        legend=dict(
            title="",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        barmode='group',
        height=500
    )

    # Create interactive bar chart for computational metrics
    comp_df = pd.DataFrame({
        'Metric': ['Training Time (s)', 'Memory Usage (MB)'],
        'Without PCA': [results_without_pca['training_time'], results_without_pca['memory_usage']],
        'With PCA': [results_with_pca['training_time'], results_with_pca['memory_usage']]
    })

    # Melt the DataFrame for easier plotting
    melted_comp_df = pd.melt(comp_df, id_vars=['Metric'], var_name='PCA', value_name='Value')

    # Create interactive subplots for computational metrics
    comp_fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Training Time Comparison", "Memory Usage Comparison"),
        specs=[[{"type": "bar"}, {"type": "bar"}]]
    )

    # Add traces for training time
    time_df = melted_comp_df[melted_comp_df['Metric'] == 'Training Time (s)']
    colors = [PCA_COLORS['Without PCA'] if pca == 'Without PCA' else PCA_COLORS['With PCA'] for pca in time_df['PCA']]
    comp_fig.add_trace(
        go.Bar(
            x=time_df['PCA'],
            y=time_df['Value'],
            marker_color=colors,
            text=time_df['Value'].apply(lambda x: f"{x:.2f}s"),
            textposition='auto',
            hovertemplate='%{x}: %{y:.2f} seconds<extra></extra>',
            showlegend=False
        ),
        row=1, col=1
    )
    # Add traces for memory usage
    mem_df = melted_comp_df[melted_comp_df['Metric'] == 'Memory Usage (MB)']
    colors = [PCA_COLORS['Without PCA'] if pca == 'Without PCA' else PCA_COLORS['With PCA'] for pca in mem_df['PCA']]
    comp_fig.add_trace(
        go.Bar(
            x=mem_df['PCA'],
            y=mem_df['Value'],
            marker_color=colors,
            text=mem_df['Value'].apply(lambda x: f"{x:.2f}MB"),
            textposition='auto',
            hovertemplate='%{x}: %{y:.2f} MB<extra></extra>',
            showlegend=False
        ),
        row=1, col=2
    )
    comp_fig.update_layout(
        title={
            'text': 'Computational Efficiency Comparison',
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        height=500,
        width=900,
        yaxis1=dict(title="Seconds"),
        yaxis2=dict(title="Megabytes (MB)")
    )

    # Compare confusion matrices
    cm_comparison_fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Confusion Matrix - Without PCA", "Confusion Matrix - With PCA"),
        specs=[[{"type": "heatmap"}, {"type": "heatmap"}]]
    )
    # Without PCA confusion matrix
    cm_comparison_fig.add_trace(
        go.Heatmap(
            z=results_without_pca['confusion_matrix'],
            x=class_names,
            y=class_names,
            colorscale='Blues',
            text=results_without_pca['confusion_matrix'],
            texttemplate="%{text}",
            hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
        ),
        row=1, col=1
    )

    # With PCA confusion matrix
    cm_comparison_fig.add_trace(
        go.Heatmap(
            z=results_with_pca['confusion_matrix'],
            x=class_names,
            y=class_names,
            colorscale='Blues',
            text=results_with_pca['confusion_matrix'],
            texttemplate="%{text}",
            hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
        ),
        row=1, col=2
    )

    cm_comparison_fig.update_layout(
        title={
            'text': 'Confusion Matrix Comparison',
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        height=600,
        width=2000
    )

    # Update x and y axis labels
    cm_comparison_fig.update_xaxes(title_text="Predicted", tickangle=-45, row=1, col=1)
    cm_comparison_fig.update_xaxes(title_text="Predicted", tickangle=-45, row=1, col=2)
    cm_comparison_fig.update_yaxes(title_text="True", row=1, col=1)
    cm_comparison_fig.update_yaxes(title_text="True", row=1, col=2)

    # Print conclusion
    print("\nCONCLUSION:")
    # Compare accuracy
    if results_with_pca['accuracy'] > results_without_pca['accuracy']:
        acc_msg = f"PCA IMPROVED accuracy by {(results_with_pca['accuracy'] - results_without_pca['accuracy']) * 100:.2f}%"
    elif results_with_pca['accuracy'] < results_without_pca['accuracy']:
        acc_msg = f"PCA REDUCED accuracy by {(results_without_pca['accuracy'] - results_with_pca['accuracy']) * 100:.2f}%"
    else:
        acc_msg = "PCA had NO EFFECT on accuracy"

    # Compare training time
    time_without = results_without_pca['training_time']
    time_with = results_with_pca['training_time']
    time_diff = time_with - time_without
    time_pct = (time_diff / time_without) * 100
    if time_with < time_without:
        time_msg = f"PCA REDUCED training time by {-time_pct:.2f}%"
    elif time_with > time_without:
        time_msg = f"PCA INCREASED training time by {time_pct:.2f}%"
    else:
        time_msg = "PCA had NO EFFECT on training time"

    # Compare memory usage
    mem_without = results_without_pca['memory_usage']
    mem_with = results_with_pca['memory_usage']
    mem_diff = mem_with - mem_without
    mem_pct = (mem_diff / mem_without) * 100
    if mem_with < mem_without:
        mem_msg = f"PCA REDUCED memory usage by {-mem_pct:.2f}%"
    elif mem_with > mem_without:
        mem_msg = f"PCA INCREASED memory usage by {mem_pct:.2f}%"
    else:
        mem_msg = "PCA had NO EFFECT on memory usage"
    print(f"• {acc_msg}")
    print(f"• {time_msg}")
    print(f"• {mem_msg}")

    overall_msg = "Based on these results, "
    if results_with_pca['accuracy'] >= results_without_pca['accuracy'] and (time_with < time_without or mem_with < mem_without):
        overall_msg += "PCA is RECOMMENDED for this dataset as it maintained or improved accuracy while reducing computational resources."
    elif results_with_pca['accuracy'] < results_without_pca['accuracy'] and (time_with < time_without or mem_with < mem_without):
        overall_msg += "there is a TRADE-OFF: PCA reduces computational resources but at the cost of some accuracy."
    elif results_with_pca['accuracy'] >= results_without_pca['accuracy'] and time_with >= time_without and mem_with >= mem_without:
        overall_msg += "PCA improved accuracy but did not reduce computational resources, so its use depends on whether accuracy or efficiency is more important."
    else:
        overall_msg += "PCA is NOT RECOMMENDED for this dataset as it neither improved accuracy nor reduced computational resources."
    print(f"• {overall_msg}")

    # Return comparison figures
    return perf_comparison_fig, f1_comparison_fig, comp_fig, cm_comparison_fig

In [7]:
def main():
    """
    Main function that runs the exoplanet classification analysis with SVM and interactive visualizations
    """
    # Load and preprocess data
    X_train, X_test, y_train, y_test, feature_names, scaler, class_names = load_and_preprocess_data('koi_data.csv')

    # Perform PCA analysis (just to get optimal components)
    print("\n" + "="*50)
    print("PCA ANALYSIS")
    print("="*50)
    optimal_components, pca = perform_pca_analysis(
        X_train, feature_names, scaler, class_names, y_train, n_components=20
    )

    # Run SVM w/o PCA
    results_without_pca = run_svm_experiment(
        X_train, X_test, y_train, y_test, feature_names, scaler, class_names, with_pca=False
    )
    # Show confusion matrix and class metrics for SVM w/o PCA
    results_without_pca['cm_fig'].show()
    results_without_pca['class_metrics_fig'].show()

    # Run SVM with PCA
    results_with_pca = run_svm_experiment(
        X_train, X_test, y_train, y_test, feature_names, scaler, class_names,
        with_pca=True, n_components=optimal_components
    )
    # Show confusion matrix and class metrics for SVM with PCA
    results_with_pca['cm_fig'].show()
    results_with_pca['class_metrics_fig'].show()

    # Compare results
    perf_comparison_fig, f1_comparison_fig, comp_fig, cm_comparison_fig = compare_results(
        results_without_pca, results_with_pca, class_names
    )

    # Show comparison visualizations
    perf_comparison_fig.show()
    f1_comparison_fig.show()
    comp_fig.show()
    cm_comparison_fig.show()

    # Return figures
    return {
        'confusion_matrix_no_pca': results_without_pca['cm_fig'],
        'class_metrics_no_pca': results_without_pca['class_metrics_fig'],
        'confusion_matrix_with_pca': results_with_pca['cm_fig'],
        'class_metrics_with_pca': results_with_pca['class_metrics_fig'],
        'performance_comparison': perf_comparison_fig,
        'f1_comparison': f1_comparison_fig,
        'computational_comparison': comp_fig,
        'confusion_matrix_comparison': cm_comparison_fig
    }

if __name__ == "__main__":
    figures = main()


Class distribution:
FALSE POSITIVE: 3744 (48.09%)
CONFIRMED: 2616 (33.60%)
CANDIDATE: 1425 (18.30%)
Training set shape: (6228, 38)
Test set shape: (1557, 38)

PCA ANALYSIS

Running SVM without PCA - Multi-class Classification
Initial memory usage: 257.62 MB
Using all features without PCA

Training model...
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Training completed in 785.49 seconds
Memory usage: 33.02 MB
Best parameters: {'svm__C': 100, 'svm__class_weight': None, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}

Model performance:
  Accuracy:  0.8516
  Precision: 0.8463
  Recall:    0.8516
  F1 Score:  0.8394

Detailed Classification Report:
                precision    recall  f1-score   support

FALSE POSITIVE       1.00      1.00      1.00       749
     CANDIDATE       0.66      0.39      0.49       285
     CONFIRMED       0.73      0.90      0.81       523

      accuracy                           0.85      1557
     macro avg       0.80      0.76      0.76      1


Running SVM with PCA - Multi-class Classification
Initial memory usage: 303.08 MB
Using PCA with 20 components

Training model...
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Training completed in 817.15 seconds
Memory usage: 6.35 MB
Best parameters: {'svm__C': 100, 'svm__class_weight': None, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}

Model performance:
  Accuracy:  0.8497
  Precision: 0.8465
  Recall:    0.8497
  F1 Score:  0.8329

Detailed Classification Report:
                precision    recall  f1-score   support

FALSE POSITIVE       1.00      1.00      1.00       749
     CANDIDATE       0.68      0.33      0.45       285
     CONFIRMED       0.72      0.92      0.81       523

      accuracy                           0.85      1557
     macro avg       0.80      0.75      0.75      1557
  weighted avg       0.85      0.85      0.83      1557





COMPARISON OF RESULTS: SVM WITHOUT PCA vs. SVM WITH PCA

Overall Performance Metrics:
Metric       Without PCA     With PCA        Difference   % Change  
-----------------------------------------------------------------
Accuracy     0.8516          0.8497          -0.0019        -0.23%
Precision    0.8463          0.8465          0.0002        +0.02%
Recall       0.8516          0.8497          -0.0019        -0.23%
F1           0.8394          0.8329          -0.0065        -0.78%

CONCLUSION:
• PCA REDUCED accuracy by 0.19%
• PCA INCREASED training time by 4.03%
• PCA REDUCED memory usage by 80.78%
• Based on these results, there is a TRADE-OFF: PCA reduces computational resources but at the cost of some accuracy.
