In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import kaggle_evaluation.default_inference_server


# ==================== DATA PREPROCESSING ====================
def preprocessing(data, typ):
    """
    Select the model's feature columns and make them numeric and NaN-free.

    The caller's frame is never modified: the selected columns are copied
    before any conversion takes place.

    Args:
        data: pandas DataFrame holding some or all of the columns listed in
            ``main_features``. Columns outside that list are dropped.
        typ: ``"train"`` keeps the ``forward_returns`` target next to the
            features; any other value returns the features only.

    Returns:
        A new DataFrame with the available feature columns in
        ``main_features`` order (plus ``forward_returns`` for ``"train"``).
        Text-typed feature columns are coerced with ``pd.to_numeric`` and
        every missing or unparseable value is replaced by 0.

    Raises:
        KeyError: if ``typ == "train"`` and ``forward_returns`` is missing.
    """
    main_features = [
        'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10',
        'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E20',
        'I2',
        'P8', 'P9', 'P10', 'P12', 'P13',
        'S1', 'S2', 'S5'
    ]

    available_features = [f for f in main_features if f in data.columns]

    if typ == "train":
        selected = available_features + ["forward_returns"]
    else:
        selected = available_features

    # Copy so the numeric coercion below cannot leak into the caller's frame.
    data = data[selected].copy()

    # Only feature columns are coerced; the target column is left as loaded.
    for col in available_features:
        column = data[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            data[col] = pd.to_numeric(column, errors='coerce')

    return data.fillna(0)


# ==================== SEQUENCE CREATION ====================
def create_sequences(X, y, timesteps=1):
    """
    Convert row-aligned tabular data into sliding windows for LSTM input.

    Each window covers ``timesteps`` consecutive rows of ``X`` and is paired
    with the target of the window's LAST row, so a window of length 1 is
    simply ``(X[i], y[i])``. This matches how ``predict`` feeds a single row
    as one window and expects that row's target back.

    Args:
        X: indexable, sliceable sequence of feature rows (e.g. a 2-D array).
        y: indexable sequence of targets, row-aligned with ``X``.
        timesteps: window length; must be at least 1.

    Returns:
        ``(Xs, ys)`` as numpy arrays holding ``len(X) - timesteps + 1``
        windows and targets. Both are empty when ``X`` has fewer than
        ``timesteps`` rows.

    Raises:
        ValueError: if ``timesteps`` is smaller than 1.
    """
    if timesteps < 1:
        raise ValueError(f"timesteps must be >= 1, got {timesteps}")

    # A negative count yields an empty range, i.e. no windows at all.
    n_windows = len(X) - timesteps + 1
    Xs = [X[start:start + timesteps] for start in range(n_windows)]
    ys = [y[start + timesteps - 1] for start in range(n_windows)]
    return np.array(Xs), np.array(ys)


# ==================== LSTM MODEL TRAINING ====================
def train_lstm_model(X_train, y_train, X_val, y_val, timesteps=1):
    """
    Fit a StandardScaler and an LSTM regressor on the training split.

    Features are standardised with statistics taken from ``X_train`` only,
    both splits are windowed by ``create_sequences``, and an
    LSTM -> Dense network is trained for 30 epochs with an MSE loss.

    Args:
        X_train, X_val: feature tables; ``X_train.shape[1]`` is used as the
            number of input features.
        y_train, y_val: targets exposing ``.values``.
        timesteps: window length handed to ``create_sequences``.

    Returns:
        ``(model, scaler)``: the trained Keras model and the fitted scaler.
    """
    # The scaler learns mean/variance from the training rows only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    val_scaled = scaler.transform(X_val)

    train_windows, train_targets = create_sequences(train_scaled, y_train.values, timesteps)
    val_windows, val_targets = create_sequences(val_scaled, y_val.values, timesteps)

    n_features = X_train.shape[1]
    layer_stack = [
        LSTM(128, input_shape=(timesteps, n_features), return_sequences=False),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential(layer_stack)

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse')

    print("Training LSTM model...")
    model.fit(
        train_windows, train_targets,
        validation_data=(val_windows, val_targets),
        epochs=30,
        batch_size=64,
        verbose=1,
    )
    print("LSTM model trained successfully!")

    return model, scaler


# ==================== GLOBAL MODEL STORAGE ====================
# Module-level registry shared between the training script and predict().
# The main script stores the keys 'model_3', 'scaler', 'timesteps' and
# 'features' after training; predict() reads them back on every call.
TRAINED_MODELS = {}


# ==================== PREDICTION FUNCTION ====================
def predict(test_data):
    """
    Predict with the trained LSTM for one incoming test batch.

    The input is normalised to a pandas DataFrame, preprocessed, aligned to
    the feature columns used at training time, scaled with the trained
    scaler and fed to the model stored in ``TRAINED_MODELS``.

    Args:
        test_data: a dict describing one row, an object exposing
            ``to_pandas()`` (e.g. a Polars DataFrame), or anything accepted
            by ``pd.DataFrame``.

    Returns:
        The model output for the first row as a float. ``0.0`` is returned
        when the input has no rows, when the scaler rejects the input, or
        when the model returns nothing.

    Raises:
        KeyError: if ``TRAINED_MODELS`` has not been populated yet.
        ValueError: from the reshape when ``timesteps`` is not 1, because a
            batch of single rows cannot fill windows longer than one step.
    """
    if isinstance(test_data, dict):
        df = pd.DataFrame([test_data])
    elif hasattr(test_data, "to_pandas"):
        # Frames from other libraries (e.g. Polars) must be converted
        # explicitly: pd.DataFrame(frame) may not carry the column names
        # over, in which case every feature would silently be replaced by
        # the 0.0 filler below.
        df = test_data.to_pandas()
    else:
        df = pd.DataFrame(test_data)

    df_processed = preprocessing(df, "test")

    model = TRAINED_MODELS['model_3']
    scaler = TRAINED_MODELS['scaler']
    timesteps = TRAINED_MODELS['timesteps']
    expected_features = TRAINED_MODELS.get('features', df_processed.columns.tolist())

    # 1) + 2) Add any feature missing from the input (filled with 0.0) and
    #         put the columns in the training order in a single step.
    df_processed = df_processed.reindex(columns=expected_features, fill_value=0.0)

    # 3) Guard against empty input.
    if df_processed.shape[0] == 0:
        return 0.0

    # 4) Make everything float. Keep the column labels when the scaler was
    #    fitted on a labelled frame so that sklearn does not warn about
    #    missing feature names; otherwise hand over a plain array.
    df_processed = df_processed.astype(float)
    if hasattr(scaler, "feature_names_in_"):
        scaler_input = df_processed
    else:
        scaler_input = df_processed.to_numpy()

    # 5) Standardise. The trained scaler is shared state and must never be
    #    refitted on test rows: doing so would overwrite the training
    #    statistics for every later call. On failure, report it and fall
    #    back to the same neutral value used for empty input.
    try:
        X_test_scaled = np.asarray(scaler.transform(scaler_input))
    except ValueError as exc:
        print(f"Warning: scaler.transform failed ({exc}); returning 0.0")
        return 0.0

    # 6) One window per row: (rows, timesteps, features).
    X_test_scaled = X_test_scaled.reshape(
        (X_test_scaled.shape[0], timesteps, X_test_scaled.shape[1])
    )

    # 7) Predict and return the first row's output.
    pred = np.asarray(model.predict(X_test_scaled))

    return float(pred[0][0]) if pred.size > 0 else 0.0


# ==================== MAIN EXECUTION ====================
# Script flow: load the locally processed training CSV, split it
# chronologically, train the LSTM, report the validation RMSE, publish the
# trained objects through TRAINED_MODELS and start the Kaggle inference
# server with predict() as its endpoint.
if __name__ == "__main__":
    # NOTE(review): the banners below say BiLSTM / Bidirectional LSTM, but no
    # Bidirectional layer is imported in this file - confirm the wording.
    print("=" * 60)
    print("Hull Tactical Market Prediction - BiLSTM Solution")
    print("=" * 60)

    # Training data is read from an already-processed CSV.
    print("\nLoading training data from LOCAL processed CSV...")
    # TODO: point this at your own copy of the processed training CSV.
    LOCAL_TRAIN_PATH = "/kaggle/input/data111/train_filtered_threshold_0p01.csv"
    train = pd.read_csv(LOCAL_TRAIN_PATH)
    print(f"Local processed train shape: {train.shape}")

    # Sanity check: the target column must be present.
    if "forward_returns" not in train.columns:
        raise ValueError(
            "本地训练数据中必须包含目标列 'forward_returns'，"
            "请确认你的 CSV 列名。"
        )

    # preprocessing() is intentionally NOT applied to the training frame:
    # the local CSV is expected to be fully processed already.

    # Optional: print the shape of the official test file when it is available.
    try:
        test = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv')
        print(f"Official test shape (for reference): {test.shape}")
    except Exception as e:
        # Best effort only - the file is not needed for training.
        print("Warning: cannot load official Kaggle test.csv, error:", e)

    # Split data chronologically. The rows are windowed into sequences
    # afterwards, so they must keep their file order; shuffling would build
    # windows from unrelated rows and validate on rows scattered through the
    # training period.
    # NOTE(review): assumes the CSV rows are already in time order - confirm.
    train_split, val_split = train_test_split(train, test_size=0.01, shuffle=False)
    X_train = train_split.drop(columns=["forward_returns"])
    y_train = train_split['forward_returns']
    X_val = val_split.drop(columns=["forward_returns"])
    y_val = val_split['forward_returns']

    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")

    # Set LSTM time window (raise above 1 for multi-step sequence modelling).
    timesteps = 1

    # Train Model 3
    print("\n" + "=" * 60)
    print("Training Model 3 - Bidirectional LSTM")
    print("=" * 60)
    model_3, scaler = train_lstm_model(X_train, y_train, X_val, y_val, timesteps=timesteps)

    # Evaluate on the validation windows.
    X_val_scaled = scaler.transform(X_val)
    X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val.values, timesteps)
    val_pred = model_3.predict(X_val_seq)
    val_rmse = np.sqrt(np.mean((y_val_seq - val_pred.flatten()) ** 2))
    print(f"\nValidation RMSE: {val_rmse:.6f}")

    # Publish the trained objects and the training column order for predict().
    TRAINED_MODELS['model_3'] = model_3
    TRAINED_MODELS['scaler'] = scaler
    TRAINED_MODELS['timesteps'] = timesteps
    TRAINED_MODELS['features'] = list(X_train.columns)

    # Setup inference server
    print("\n" + "=" * 60)
    print("Setting up inference server")
    print("=" * 60)

    inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        print("Running in competition mode...")
        inference_server.serve()
    else:
        print("Running local inference...")
        # Still assumes the Kaggle directory layout for the official data.
        inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))

    print("\n" + "=" * 60)
    print("Execution completed!")
    print("=" * 60)

In [None]:
    # Notebook cell: replay the test set row by row through predict() and
    # print a short summary of the predictions.
    print("============================================================")
    print("开始本地预测模拟 (Local Prediction Simulation)")
    print("============================================================")

    # 1. Load the test data with Polars.
    import polars as pl
    try:
        # Change this path to the real location of your local test.csv.
        local_test_path = "/kaggle/input/hull-tactical-market-prediction/test.csv"
        test_df = pl.read_csv(local_test_path)
        print(f"加载测试集成功，行数: {len(test_df)}")
    except Exception as e:
        # test.csv is unavailable: build a tiny synthetic frame so that the
        # flow can still be exercised end to end.
        print(f"读取测试集失败 ({e})，正在创建模拟数据...")
        test_df = pl.DataFrame({
            "date_id": [1001, 1002, 1003],
            "lagged_forward_returns": [0.001, -0.002, 0.005],
            # Remaining required columns, filled with zeros.
            **{f"E{i}": [0.0]*3 for i in range(1, 21)},
            **{col: [0.0]*3 for col in ["S2", "P9", "S1", "S5", "I2", "P8", "P10", "P12", "P13"]},
            "is_scored": [True, True, True]
        })

    # 2. Simulate row-by-row prediction.
    final_predictions = []

    # NOTE(review): i_M7 and opt_preds look like leftovers from a "Model 7"
    # notebook; nothing in this cell reads them. They are kept in case other
    # cells rely on them - confirm and remove if unused.
    i_M7 = 0
    if 'opt_preds' not in globals():
        print("警告: opt_preds 未定义，使用默认值")
        opt_preds = np.full(2000, 0.05)

    print("\n正在生成预测...")
    for i in range(len(test_df)):
        # Integer indexing is presumed to return the row as a one-row frame.
        row = test_df[i]

        # The prediction has to happen inside the loop so that every row is
        # scored, not just the last one.
        try:
            pred_value = predict(row)
            final_predictions.append(pred_value)
            # Print every 10th row only, to keep the output readable.
            if i % 10 == 0:
                print(f"Row {i}: Prediction = {pred_value:.6f}")
        except Exception as e:
            # Best effort: record a neutral prediction and carry on.
            print(f"Row {i} 预测出错: {e}")
            final_predictions.append(0.0)

    # 3. Print a summary of the results.
    print("\n============================================================")
    print("预测完成")
    print(f"预测结果前5个: {final_predictions[:5]}")
    print(f"平均预测值: {np.mean(final_predictions)}")
    print("============================================================")

