In [2]:
from pathlib import Path
from typing import Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Global matplotlib settings: Arial as the sans-serif font and plain hyphens
# instead of Unicode minus signs, followed by the built-in "ggplot" style sheet.
plt.rcParams.update({
    'font.sans-serif': ['Arial'],
    'axes.unicode_minus': False,
})
plt.style.use('ggplot')

def load_data(
    feature_path: Union[str, Path] = r"D:\Desktop\Pca_LASSO_\final_40x10.xlsx",
    target_path: Union[str, Path] = r"D:\Desktop\Pca_LASSO_\Index.xlsx",
) -> Tuple[np.ndarray, np.ndarray]:
    """Load and validate the dataset from two Excel files.

    Args:
        feature_path: Excel file holding the feature matrix. It is read
            without a header row, so every row is treated as a sample.
        target_path: Excel file holding the targets. It is read with the
            default header handling and only its first column is used.

    Returns:
        Tuple containing:
        - X: Feature matrix (n_samples, n_features)
        - y: Target vector (n_samples,)

    Raises:
        FileNotFoundError: If either data file is missing
        RuntimeError: If a file exists but cannot be read or parsed
        ValueError: If the number of samples and targets differ
    """
    feature_path = Path(feature_path)
    target_path = Path(target_path)

    # Validate file existence up front so the error names the missing file
    if not feature_path.exists():
        raise FileNotFoundError(f"Feature file not found: {feature_path}")
    if not target_path.exists():
        raise FileNotFoundError(f"Target file not found: {target_path}")

    try:
        X = pd.read_excel(feature_path, header=None).values
        y_df = pd.read_excel(target_path)
        y = y_df.iloc[:, 0].values
    except Exception as e:
        # Chain the original exception so the underlying cause stays visible
        raise RuntimeError(f"Error loading data: {str(e)}") from e

    # Validate data dimensions
    if len(X) != len(y):
        raise ValueError(f"Data size mismatch: {len(X)} samples vs {len(y)} targets")

    return X, y

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict:
    """Summarise regression quality for one model.

    Args:
        y_true: Ground truth values
        y_pred: Predicted values
        model_name: Identifier stored under the "Model" key

    Returns:
        Dictionary with the model name, the R² score and the RMSE,
        both rounded to four decimal places
    """
    r_squared = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    summary = {"Model": model_name}
    summary["R²"] = round(r_squared, 4)
    summary["RMSE"] = round(rmse, 4)
    return summary

def plot_results(y_true: np.ndarray, predictions: List[np.ndarray], model_names: List[str]) -> None:
    """Draw a two-panel model comparison and save it as model_comparison.png.

    The left panel scatters predicted against actual values with a dashed
    identity line; the right panel overlays residual histograms.

    Args:
        y_true: Ground truth values
        predictions: List of prediction arrays from different models
        model_names: List of model identifiers, paired with predictions by position
    """
    fig, (scatter_ax, residual_ax) = plt.subplots(1, 2, figsize=(12, 6), dpi=100)
    labelled_predictions = list(zip(predictions, model_names))

    # Left panel: predicted vs. actual
    for model_pred, label in labelled_predictions:
        scatter_ax.scatter(y_true, model_pred, alpha=0.6, label=label, s=40)
    lowest, highest = min(y_true), max(y_true)
    # Dashed identity line marks where a perfect prediction would fall
    scatter_ax.plot([lowest, highest], [lowest, highest],
                    'k--', lw=1.5, alpha=0.8)
    scatter_ax.set_xlabel("Actual Values", fontsize=12)
    scatter_ax.set_ylabel("Predicted Values", fontsize=12)
    scatter_ax.set_title("Prediction Comparison", fontsize=14)
    scatter_ax.legend()

    # Right panel: residual (actual minus predicted) distributions
    for model_pred, label in labelled_predictions:
        residual_ax.hist(y_true - model_pred, bins=15, alpha=0.5, label=label,
                         density=True, edgecolor='black')
    residual_ax.set_xlabel("Residuals", fontsize=12)
    residual_ax.set_ylabel("Density", fontsize=12)
    residual_ax.set_title("Residual Distribution", fontsize=14)
    residual_ax.legend()

    fig.tight_layout()
    fig.savefig("model_comparison.png", bbox_inches='tight')
    plt.close(fig)

def main() -> None:
    """Run the full workflow: load, split, train, evaluate and report.

    Loads the dataset with load_data(), holds out 20% of the rows as a test
    set, fits three regressors (PCR, grid-searched LASSO, random forest),
    prints test-set metrics and 5-fold cross-validation R² for each, and
    writes model_comparison.png and model_comparison_results.xlsx.

    Failures are reported on stdout instead of being raised: a data loading
    error ends the run, while a failing model is skipped.
    """
    # 1. Data loading and validation
    try:
        X, y = load_data()
        print(f"Data loaded successfully. Shape: X{X.shape}, y{y.shape}")
    except Exception as e:
        print(f"Data loading failed: {str(e)}")
        return

    # 2. Data splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Train/test split: {len(X_train)}/{len(X_test)} samples")

    # 3. Model definitions with proper pipelines
    models: Dict = {
        "PCR": Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.95)),
            ('regressor', LinearRegression())
        ]),
        "LASSO": GridSearchCV(
            Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', Lasso(max_iter=10000))
            ]),
            param_grid={'regressor__alpha': [0.001, 0.01, 0.1, 1, 10]},
            cv=5,
            n_jobs=-1
        ),
        "RandomForest": Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', RandomForestRegressor(
                n_estimators=100,
                max_depth=5,
                random_state=42,
                n_jobs=-1
            ))
        ])
    }

    results: List[Dict] = []
    predictions: List[np.ndarray] = []
    # Names of the models that actually produced predictions; kept in the
    # same order as `predictions` so plot labels cannot drift when a model fails.
    evaluated_names: List[str] = []

    # 4. Model training and evaluation
    for name, model in models.items():
        print(f"\n{'='*30}\nTraining {name} model\n{'='*30}")

        try:
            # Handle GridSearchCV separately
            if isinstance(model, GridSearchCV):
                model.fit(X_train, y_train)
                best_params = model.best_params_
                print(f"Best parameters for {name}: {best_params}")
                final_model = model.best_estimator_
            else:
                final_model = model.fit(X_train, y_train)

            # Generate predictions and metrics before recording anything,
            # so the three result lists are always updated together
            y_pred = final_model.predict(X_test)
            metrics = evaluate_model(y_test, y_pred, name)

            predictions.append(y_pred)
            evaluated_names.append(name)
            results.append(metrics)

            # Cross-validation (using full pipeline)
            # NOTE: this runs on the full dataset, including the held-out
            # test rows, so it is not independent of the test metrics above.
            cv_scores = cross_val_score(
                final_model, X, y, cv=5,
                scoring='r2', n_jobs=-1
            )
            print(f"Cross-validation R²: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

        except Exception as e:
            print(f"Error training {name} model: {str(e)}")
            continue

    # 5. Results presentation
    if results:
        result_df = pd.DataFrame(results)
        print("\nModel Performance Comparison:")
        print(result_df.to_string(index=False))  # use the default plain-text table format

        try:
            plot_results(y_test, predictions, evaluated_names)
            result_df.to_excel("model_comparison_results.xlsx", index=False)
            print("\nResults saved to model_comparison.png and model_comparison_results.xlsx")
        except Exception as e:
            print(f"Error saving results: {str(e)}")
    else:
        print("No valid results to display")

# Run the workflow only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Data loaded successfully. Shape: X(40, 10), y(40,)
Train/test split: 32/8 samples

Training PCR model
Cross-validation R²: -0.888 ± 0.746

Training LASSO model
Best parameters for LASSO: {'regressor__alpha': 10}
Cross-validation R²: -0.232 ± 0.236

Training RandomForest model
Cross-validation R²: -0.524 ± 0.483

Model Performance Comparison:
       Model      R²    RMSE
         PCR -0.4495 26.7964
       LASSO -0.2872 25.2517
RandomForest -0.1935 24.3150

Results saved to model_comparison.png and model_comparison_results.xlsx
