In [4]:
#!/usr/bin/env python3
"""
Lasso regression for 2D TFIM quantum system
Aligned with: 'Rethink the Role of Deep Learning towards Large-scale Quantum Systems' (ICML 2025)

Key settings (based on improved-ml-algorithm-master):
- Random Fourier Features (RFF) used for feature mapping
- RFF formula: φ(x) = [cos(Wx * γ/√D), sin(Wx * γ/√D)]
- W ~ N(0, I) with scaling γ/√D applied during transform
- Fixed λ = 10^3 (no hyperparameter tuning)
- 2D TFIM lattice, typical Lx×Ly ∈ {5×5, 8×8}
- Feature dimension after RFF: 2*R (R cosine + R sine features)
"""

import os
import argparse
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import ast


# ======================================================
# 参数解析
# ======================================================
def parse_args():
    """Define the command-line options and parse them from ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``train_file``, ``test_file``,
        ``task``, ``lasso_alpha``, ``lasso_tol``, ``lasso_maxiter``,
        ``model_seed``, ``save_results``, ``output_dir``, ``num_rff`` and
        ``rff_gamma``.
    """
    cli = argparse.ArgumentParser(description='Train Lasso model (ICML 2025 settings)')
    add = cli.add_argument

    # Dataset locations and the quantity to predict.
    add('--train-file', required=True, type=str, help='Path to training CSV file')
    add('--test-file', required=True, type=str, help='Path to test CSV file')
    add('--task',
        type=str,
        choices=['correlation', 'entropy'],
        default='correlation',
        help='Prediction task: correlation or entropy')

    # Lasso solver settings; the regularization strength defaults to 1e3 and
    # is not tuned anywhere in this file.
    add('--lasso-alpha', type=float, default=1e3,
        help='Regularization λ for Lasso (fixed = 10³)')
    add('--lasso-tol', type=float, default=1e-3)
    add('--lasso-maxiter', type=int, default=10000)
    add('--model-seed', type=int, default=42)

    # Result persistence.
    add('--save-results', action='store_true')
    add('--output-dir', type=str, default='./results_log')

    # Random Fourier feature (RFF) mapping.
    add('--num-rff', type=int, default=20,
        help='Number of random Fourier features R (≈ 10–40 in paper)')
    add('--rff-gamma', type=float, default=0.6,
        help='Scaling γ for RFF (≈ 0.5–0.7 in paper)')

    return cli.parse_args()


# ======================================================
# 数据加载
# ======================================================
def load_data(file_path, task='correlation'):
    """Load measurement features and labels from a CSV file.

    The ``measurement_samples`` column holds one Python-literal nested list
    per row; each row is flattened to a single feature vector. Labels come
    from the ``approx_<kind>`` / ``exact_<kind>`` columns.

    Args:
        file_path: Path of the CSV file to read.
        task: ``'correlation'`` selects the correlation label columns; any
            other value selects the entropy label columns.

    Returns:
        Tuple ``(X_flat, y_approx, y_exact)`` of NumPy arrays, where
        ``X_flat`` has shape ``(N, -1)`` for ``N`` rows in the file.

    Raises:
        KeyError: If a required column is absent from the file.
    """
    print(f"Loading data from {file_path} ...")
    df = pd.read_csv(file_path)

    label_kind = 'correlation' if task == 'correlation' else 'entropy'
    approx_col = f'approx_{label_kind}'
    exact_col = f'exact_{label_kind}'

    # Fail early with the full list of missing columns instead of a bare
    # pandas KeyError on whichever column happens to be accessed first.
    required = ('measurement_samples', approx_col, exact_col)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Each cell is the string form of a (possibly nested) list of numbers.
    X_raw = df['measurement_samples'].apply(ast.literal_eval).tolist()
    X = np.array(X_raw)
    # Flatten everything after the sample axis. Only the sample count is
    # needed, so the per-sample layout may have any number of dimensions.
    N = X.shape[0]
    X_flat = X.reshape(N, -1)
    print(f" Loaded {N} samples, feature dim (before RFF): {X_flat.shape[1]}")

    def parse_field(s):
        # Label cells may be wrapped as 'Any[...]'; strip the wrapper before
        # literal evaluation. Non-string cells (already numeric) pass through.
        if isinstance(s, str) and s.startswith('Any['):
            s = s[4:-1]
        return ast.literal_eval(s) if isinstance(s, str) else s

    y_approx = df[approx_col].apply(parse_field).tolist()
    y_exact = df[exact_col].apply(parse_field).tolist()

    return X_flat, np.array(y_approx), np.array(y_exact)


# ======================================================
# 随机傅立叶特征映射
# ======================================================
def generate_rff_params(D, R, gamma=0.6, seed=None):
    """Draw the random projection matrix used by the RFF mapping.

    The same matrix must be reused for the train and test sets so both end
    up in the same feature space.

    Args:
        D: Original feature dimension (number of columns of the input).
        R: Number of random projections (rows of the returned matrix).
        gamma: Unused here; accepted for call-site compatibility. The
            gamma / sqrt(D) scaling is applied when the transform is computed.
        seed: Integer seed for a reproducible draw. When ``None`` the draw
            comes from NumPy's global random stream.

    Returns:
        Array of shape ``(R, D)`` with i.i.d. standard-normal entries.
    """
    if seed is None:
        # No seed requested: keep following the global stream so callers
        # that seeded NumPy themselves still get a reproducible draw.
        return np.random.randn(R, D)

    # A private legacy generator yields exactly the values that
    # ``np.random.seed(seed); np.random.randn(R, D)`` would, but without
    # reseeding the process-wide RNG behind the caller's back.
    rng = np.random.RandomState(seed)
    return rng.randn(R, D)


def apply_rff_transform(X, W, gamma=0.6):
    """Map a feature matrix to random Fourier features.

    Computes phi(x) = [cos(z), sin(z)] with z = (X @ W.T) * gamma / sqrt(D),
    where D is the number of columns of ``X``.

    Args:
        X: 2-D array of shape ``(N, D)``.
        W: Pre-generated projection matrix of shape ``(R, D)``.
        gamma: Scaling factor applied to the projections.

    Returns:
        Array of shape ``(N, 2 * R)``: the R cosine features followed by the
        R sine features.
    """
    # Unpacking two values insists on a 2-D input, as the formula assumes.
    _, n_features = X.shape

    # Scale in the same order as the reference formula: multiply by gamma
    # first, then divide by sqrt(D).
    phase = X @ W.T
    phase = phase * gamma
    phase = phase / np.sqrt(n_features)

    return np.hstack((np.cos(phase), np.sin(phase)))


# ======================================================
# 训练与评估
# ======================================================
def train_and_evaluate(X_train, y_train_approx, X_test, y_test_exact, args):
    """Fit a Lasso model on RFF features and report the test RMSE.

    Pipeline: standardize features (statistics fitted on the training set
    only), map both sets through the same random Fourier features, fit a
    Lasso on the approximate training labels, and score predictions against
    the exact test labels.

    Args:
        X_train: Training feature array, shape ``(N_train, D)``.
        y_train_approx: Training targets as a NumPy array, 1-D or 2-D.
        X_test: Test feature array, shape ``(N_test, D)``.
        y_test_exact: Reference test targets as a NumPy array, 1-D or 2-D.
        args: Object exposing ``model_seed``, ``num_rff``, ``rff_gamma``,
            ``lasso_alpha``, ``lasso_maxiter`` and ``lasso_tol``.

    Returns:
        Root-mean-square error between predictions and ``y_test_exact``.
    """
    np.random.seed(args.model_seed)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Generate W once and apply it to both sets so train and test features
    # live in the same RFF space.
    D = X_train.shape[1]
    W = generate_rff_params(D, R=args.num_rff, gamma=args.rff_gamma,
                            seed=args.model_seed)
    X_train = apply_rff_transform(X_train, W, gamma=args.rff_gamma)
    X_test = apply_rff_transform(X_test, W, gamma=args.rff_gamma)

    print(f" After RFF mapping: feature dim = {X_train.shape[1]} (2*R where R={args.num_rff})")

    # Normalize each target array to a column layout on its own, so a 1-D
    # test target is handled even when the train target is already 2-D.
    if y_train_approx.ndim == 1:
        y_train_approx = y_train_approx.reshape(-1, 1)
    if y_test_exact.ndim == 1:
        y_test_exact = y_test_exact.reshape(-1, 1)

    # Regularization strength comes straight from args; no tuning and no
    # cross-validation is performed here.
    model = Lasso(alpha=args.lasso_alpha,
                  max_iter=args.lasso_maxiter,
                  tol=args.lasso_tol,
                  random_state=args.model_seed)
    model.fit(X_train, y_train_approx)

    # For a single target, Lasso.predict can return a 1-D vector even when
    # fitted on an (N, 1) column. Subtracting that from an (N, 1) reference
    # would broadcast to (N, N) and corrupt the RMSE, so force the prediction
    # into the reference shape (this raises if the element counts differ).
    y_pred = np.asarray(model.predict(X_test)).reshape(y_test_exact.shape)
    test_rmse = np.sqrt(np.mean((y_pred - y_test_exact) ** 2))

    print(f" λ = {args.lasso_alpha}, R = {args.num_rff}, γ = {args.rff_gamma}")
    print(f" Test RMSE = {test_rmse:.6f}")
    return test_rmse



In [5]:
# ======================================================
# 主函数 - 直接设置参数（适用于 notebook 环境）
# ======================================================

# 直接设置参数，避免使用 parse_args()
class Args:
    """Notebook stand-in for the namespace returned by ``parse_args()``.

    Instantiating with no arguments gives the default experiment settings.
    Any setting can be overridden by keyword, e.g. ``Args(task='entropy')``,
    without editing the class.

    Raises:
        TypeError: If an override names a setting that does not exist.
    """

    def __init__(self, **overrides):
        # Dataset paths.
        self.train_file = "/home/ubuntu/code/python/DeepModelFusion/ml4quantum/dataset_generation/dataset_results/tfim_2d_new/n100|X(coupling, meas256)_y(energy,entropy,corrs)_q(5, 5).csv"
        self.test_file = "/home/ubuntu/code/python/DeepModelFusion/ml4quantum/dataset_generation/dataset_results/tfim_2d_new/n200|X(coupling, meas256)_y(energy,entropy,corrs)_q(5, 5).csv"

        # Prediction task: 'correlation' or 'entropy'.
        self.task = 'correlation'

        # Lasso regression settings.
        self.lasso_alpha = 1000.0     # regularization strength λ
        self.lasso_tol = 1e-2         # convergence tolerance
        self.lasso_maxiter = 100000   # maximum number of iterations
        self.model_seed = 42          # random seed

        # Random Fourier feature settings.
        self.num_rff = 20             # number of random projections R
        self.rff_gamma = 0.6          # RFF scaling γ

        # Output settings.
        self.save_results = True
        self.output_dir = './results_tfim'

        # Apply caller overrides last; reject unknown names so a typo does
        # not silently create a setting that nothing reads.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(f"Args got an unexpected setting {name!r}")
            setattr(self, name, value)

# Driver cell: build the settings object, load both datasets, train, and
# print the resulting test RMSE.

# Create the settings object (used in place of parse_args() in the notebook).
args = Args()

# Banner summarizing the run configuration.
print("="*60)
print(" 2D TFIM Quantum System Prediction via Lasso Regression (ICML 2025)")
print("="*60)
print(f" Task: {args.task}")
print(f" λ (Lasso regularization): {args.lasso_alpha}")
print(f" RFF params: R = {args.num_rff}, γ = {args.rff_gamma}")
print("="*60)

# Load the data. Training keeps only the approximate labels and testing
# keeps only the exact labels; the unused third of each tuple is discarded.
X_train, y_train_approx, _ = load_data(args.train_file, task=args.task)
X_test, _, y_test_exact = load_data(args.test_file, task=args.task)
print(f" Train samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Train and evaluate.
test_rmse = train_and_evaluate(X_train, y_train_approx,
                                X_test, y_test_exact, args)

# Report the result.
print("="*60)
print(f" Final Results — Task: {args.task}")
print(f" Test RMSE: {test_rmse:.6f}")
print("="*60)

  

 2D TFIM Quantum System Prediction via Lasso Regression (ICML 2025)
 Task: correlation
 λ (Lasso regularization): 1000.0
 RFF params: R = 20, γ = 0.6
Loading data from /home/ubuntu/code/python/DeepModelFusion/ml4quantum/dataset_generation/dataset_results/tfim_2d_new/n100|X(coupling, meas256)_y(energy,entropy,corrs)_q(5, 5).csv ...
 Loaded 100 samples, feature dim (before RFF): 6400
Loading data from /home/ubuntu/code/python/DeepModelFusion/ml4quantum/dataset_generation/dataset_results/tfim_2d_new/n200|X(coupling, meas256)_y(energy,entropy,corrs)_q(5, 5).csv ...
 Loaded 200 samples, feature dim (before RFF): 6400
 Train samples: 100, Test samples: 200
 After RFF mapping: feature dim = 40 (2*R where R=20)


  model = cd_fast.enet_coordinate_descent(


 λ = 1000.0, R = 20, γ = 0.6
 Test RMSE = 0.186798
 Final Results — Task: correlation
 Test RMSE: 0.186798
