In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

In [2]:
import os
import polars as pl
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# IMPORTANT: The competition environment requires these imports for serving the model.
import kaggle_evaluation.default_inference_server

# --- Global Configuration & File Paths ---
TRAIN_FILE = '/kaggle/input/hull-tactical-market-prediction/train.csv'
DATE_ID_COL = 'date_id'
TARGET_COL = 'market_forward_excess_returns'
RISK_FREE_COL = 'risk_free_rate'
LAGGED_RISK_FREE_COL = 'lagged_risk_free_rate'

# --- Step 1: Sharpe Ratio Calculation (The Metric) ---

def calculate_sharpe_ratio(df: pd.DataFrame, position_col: str = 'prediction') -> float:
    """
    Calculates the competition's Sharpe Ratio. 
    Formula: Sharpe = (Mean(Portfolio Returns)) / (StdDev(Portfolio Returns))
    
    Portfolio Return = Position * (Forward Return - Risk Free Rate) + Risk Free Rate
    where Position is clamped [0, 2].
    """
    # Use 'forward_returns' and 'risk_free_rate' columns for solution calculation
    daily_returns = (
        df[position_col] * (df['forward_returns'] - df[RISK_FREE_COL]) + df[RISK_FREE_COL]
    )
    
    # Competition uses a specific scaling (often T**0.5 for annualized Sharpe), 
    # but for local CV comparison, the daily Sharpe is sufficient and less noisy.
    mean_return = daily_returns.mean()
    std_dev = daily_returns.std()
    
    # Avoid division by zero
    if std_dev == 0:
        return 0.0
        
    return mean_return / std_dev

# --- Feature Engineering Class (Step 3) ---

class FeatureEngineer:
    """
    Encapsulates all feature creation logic to ensure consistency between
    training and inference. Uses Polars for speed.
    """
    def __init__(self):
        # Base features to be used in the model
        self.feature_cols = []
        # Raw features list is now initialized empty and set in Predictor._train_and_initialize
        self.raw_features = []
    
    def set_raw_features(self, df_columns: list):
        """Sets the list of available raw features based on the loaded Polars schema."""
        self.raw_features = [
            col for col in df_columns
            if col.startswith(('M', 'E', 'I', 'P', 'V', 'S', 'D', 'MOM')) or col.startswith('lagged')
        ]
        
    def create_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """Applies feature engineering to the Polars DataFrame."""
        
        df_out = df.clone()
        
        # 1. Lagged Features (Step 3: Lagged Features)
        # We need a buffer of at least 20 days for the rolling features below.
        LAG_WINDOW = [1, 5]
        for col in self.raw_features:
            for lag in LAG_WINDOW:
                df_out = df_out.with_columns(
                    pl.col(col).shift(lag).alias(f'{col}_lag_{lag}')
                )

        # 2. Rolling Statistics (Step 3: Rolling Statistics)
        # Expanded feature coverage for rolling statistics
        for col in ['V1', 'M1', 'E1', 'S1', 'D1']: 
            if col in df_out.columns:
                for window in [5, 20]:
                    # Use Polars' rolling functions. Since this runs on the history buffer,
                    # the window naturally only looks backwards relative to the last row.
                    df_out = df_out.with_columns(
                        pl.col(col).rolling_mean(window_size=window).alias(f'{col}_roll_mean_{window}'),
                        pl.col(col).rolling_std(window_size=window).alias(f'{col}_roll_std_{window}')
                    )
        
        # New: Add a simple ratio feature (MOM/VIX proxy, good for capturing risk-adjusted momentum)
        # FIX: Added a check for column existence to prevent ColumnNotFoundError
        if 'MOM1' in df_out.columns and 'V1' in df_out.columns:
            df_out = df_out.with_columns(
                (pl.col('MOM1') / pl.col('V1').fill_null(1e-6)).alias('MOM1_V1_Ratio')
            )
        else:
             print("Warning: MOM1 or V1 not found in current DataFrame schema. Skipping MOM1_V1_Ratio.")
        
        # 3. Differencing (Step 3: Differencing)
        for col in ['P1', 'I1']:
            if col in df_out.columns:
                df_out = df_out.with_columns(
                    (pl.col(col) - pl.col(col).shift(1)).alias(f'{col}_diff_1')
                )

        # Final list of features (excluding NaNs and original non-lagged features)
        self.feature_cols = [
            col for col in df_out.columns 
            if not col in (self.raw_features + [DATE_ID_COL, TARGET_COL, 'forward_returns', RISK_FREE_COL]) 
            and not col.startswith('lagged')
        ]
        
        return df_out

# --- Predictor Class (Step 6: Architecture & State Management) ---

class Predictor:
    """
    Manages the professional workflow: model, feature engineering, and state
    (historical data) for sequential inference in a Code Competition.
    """
    def __init__(self, history_size=60):
        self.fe = FeatureEngineer()
        self.model = None
        self.is_initialized = False
        self.HISTORY_SIZE = history_size
        
        # Buffer to store historical data needed for rolling/lagged features
        self.history_buffer = pl.DataFrame()
        # NEW: List of columns that must be Float64 for consistency
        self.float_cols_in_train = [] 

        # 1. DEFINE MASTER COLUMNS FOR HISTORY BUFFER 
        # This list defines ALL columns we *want* in our buffer schema for consistency
        self.history_master_cols = [
            DATE_ID_COL, 
            RISK_FREE_COL, 
            'forward_returns', 
            LAGGED_RISK_FREE_COL, # Included here for schema consistency during concat
            TARGET_COL 
        ] # We will append self.fe.raw_features later after loading the data
        
        self._train_and_initialize()

    def _impute_and_preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Handles initial data cleaning and time-series compliant imputation.
        (Step 1: Handle Missing Data)
        """
        # Forward fill (ffill) any NaNs, carrying the last known value forward
        df = df.ffill()
        # Back fill (bfill) any remaining NaNs (usually early data) with the next available data
        df = df.bfill()
        
        # Final safety for any remaining (e.g., if the whole series is NaN)
        df = df.fillna(0) 
        
        return df

    def _backtest_and_cv(self, df: pd.DataFrame, model_params: dict):
        """
        Performs robust time-series cross-validation (Step 2).
        Uses a LightGBM baseline (Step 4).
        """
        print("Starting Time-Series Cross-Validation...")
        tscv = TimeSeriesSplit(n_splits=5, test_size=180) # 180 days is the size of the scored test set
        
        X = df[self.fe.feature_cols]
        y = df[TARGET_COL]
        
        sharpe_scores = []
        
        # Initialize the model with the best parameters found (hypothetically)
        model = lgb.LGBMRegressor(**model_params)
        
        for fold, (train_index, val_index) in enumerate(tscv.split(X)):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            model.fit(X_train, y_train)
            val_preds = model.predict(X_val)
            
            # Post-process for validation to calculate Sharpe
            val_df = df.iloc[val_index].copy()
            val_df['predicted_excess_return'] = val_preds
            
            # Simplified volatility prediction for allocation (Step 5)
            # In a real model, you'd train a separate volatility model.
            val_df['predicted_volatility'] = val_df['predicted_excess_return'].rolling(window=20).std().shift(-1).fillna(0.005)

            val_df['prediction'] = val_df.apply(
                lambda row: self._calculate_position_allocation(
                    row['predicted_excess_return'], 
                    row[RISK_FREE_COL], # Use RISK_FREE_COL for training/CV
                    row['predicted_volatility']
                ), axis=1
            )
            
            # Calculate Sharpe Ratio on the validation fold (Step 2: Target the Metric)
            sharpe = calculate_sharpe_ratio(val_df, position_col='prediction')
            sharpe_scores.append(sharpe)
            print(f"Fold {fold+1} Validation Sharpe: {sharpe:.4f}")

        print(f"\nAverage CV Sharpe: {np.mean(sharpe_scores):.4f}")
        return model

    def _train_and_initialize(self):
        """
        Runs the full training pipeline (Steps 1, 3, 4) and sets the history buffer (Step 6).
        """
        print("--- Professional Pipeline Initialization Started ---")
        
        # 1. Load Data (Step 1)
        df_raw = pl.read_csv(TRAIN_FILE)
        
        # NEW: Set raw features based on the loaded Polars DataFrame
        self.fe.set_raw_features(df_raw.columns)

        # Update master column list now that raw_features is set
        self.history_master_cols.extend(self.fe.raw_features)
        # Use set to remove duplicates and list() to ensure proper type for Polars filtering
        self.history_master_cols = list(set(self.history_master_cols))
        
        # --- FIX 1/2: Store the list of columns that must be cast to Float64 ---
        # Identify columns that should be float (raw features + targets/rates)
        cols_to_check = self.fe.raw_features + [TARGET_COL, 'forward_returns', RISK_FREE_COL, LAGGED_RISK_FREE_COL]
        
        # Store the list of columns that were cast to Float64 in training
        self.float_cols_in_train = [col for col in cols_to_check if col in df_raw.columns]
        
        df_raw = df_raw.with_columns([
            pl.col(col).cast(pl.Float64, strict=False) for col in self.float_cols_in_train
        ])
        # -----------------------------------------------------------------------
        
        # 2. Seed History Buffer (Step 6: Global State Management)
        
        # Filter down the master column list to only include columns that exist in the training file (df_raw)
        train_cols_for_seeding = [col for col in self.history_master_cols if col in df_raw.columns]

        self.history_buffer = df_raw.tail(self.HISTORY_SIZE).select(train_cols_for_seeding)

        # 3. Feature Engineering (Step 3) - Done on the full Polars data for training
        df_features = self.fe.create_features(df_raw).to_pandas()
        
        # 4. Imputation (Step 1)
        df_processed = self._impute_and_preprocess(df_features)
        
        # 5. Define Model Parameters (Step 4: Gradient Boosting)
        lgbm_params = {
            'objective': 'regression_l1',
            'metric': 'mae',
            'n_estimators': 500,
            'learning_rate': 0.03,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.7,
            'bagging_freq': 1,
            'verbose': -1,
            'n_jobs': -1,
            'seed': 42
        }
        
        # 6. Cross-Validation and Final Retraining (Step 2 & Step 7)
        # We run CV, then retrain on the whole data for the final model
        self._backtest_and_cv(df_processed, lgbm_params)

        print("\nRetraining final model on ALL data (Step 7)...")
        # Final Model Training on ALL available data
        X_all = df_processed[self.fe.feature_cols]
        y_all = df_processed[TARGET_COL]
        self.model = lgb.LGBMRegressor(**lgbm_params)
        self.model.fit(X_all, y_all)
        
        self.is_initialized = True
        print("--- Predictor Initialized Successfully ---")

    def _calculate_position_allocation(self, 
                                       predicted_excess_return: float, 
                                       risk_free_rate: float,
                                       predicted_volatility: float = 0.005) -> float:
        """
        Implements the Optimal Allocation Strategy (Step 5).
        Transforms the predicted return into a constrained position size [0.0, 2.0].
        """
        # --- Step 5: Optimal Allocation Strategy (Simplified Kelly-like) ---
        
        # K is a scaling factor (needs tuning). Increased to 150 for a more aggressive allocation.
        K = 150 
        
        # Avoid division by near-zero volatility
        vol = max(predicted_volatility, 1e-6)
        
        # Signal based on predicted return, scaled by predicted volatility
        position = K * (predicted_excess_return / vol)
        
        # Mandatory clamping (Step 5: Clamping)
        return np.clip(position, 0.0, 2.0).item()

    def predict(self, test: pl.DataFrame) -> float:
        """
        The required inference function (Step 6).
        Runs sequentially for each day.
        """
        # Ensure initialization has completed (runs only on day 1)
        if not self.is_initialized:
            # This should have run in __init__, but as a safety:
            self._train_and_initialize() 

        # --- FIX 2/2: Cast incoming Polars DataFrame to Float64 for consistency ---
        cols_to_cast_in_test = [col for col in self.float_cols_in_train if col in test.columns]
        test = test.with_columns([
            pl.col(col).cast(pl.Float64, strict=False) for col in cols_to_cast_in_test
        ])
        # -------------------------------------------------------------------------------------

        master_cols = self.history_master_cols
        
        # 1. Prepare the 'test' row to match the schema of the history buffer
        # The test DataFrame (one row) is padded to the full master schema.
        test_row_data = {
            col: test[col][0] if col in test.columns else None 
            for col in master_cols
        }
        
        # Create a new DataFrame from the dictionary, resulting in the master column count
        test_padded = pl.DataFrame([test_row_data])
        
        # Select only the columns that exist in the current history_buffer schema 
        cols_to_select_for_concat = self.history_buffer.columns
        test_for_concat = test_padded.select(cols_to_select_for_concat)

        # 2. Update Historical Buffer
        self.history_buffer = pl.concat([self.history_buffer, test_for_concat]).tail(self.HISTORY_SIZE)
        
        # 3. Feature Engineering on updated buffer
        # This calculates all rolling/lagged features correctly for the latest row (the 'test' day)
        current_features_df = self.fe.create_features(self.history_buffer)
        
        # The data row to predict on is the *last* row of the features dataframe
        X_predict = current_features_df.tail(1)[self.fe.feature_cols].to_pandas()
        
        # 4. Model Inference: Predict the excess return
        try:
            predicted_excess_return = self.model.predict(X_predict)[0]
        except Exception as e:
            # Safe fallback
            print(f"Prediction failed: {e}. Falling back to 0.0 allocation.")
            return 0.0

        # 5. Post-Processing: Calculate the final 0.0 to 2.0 allocation
        
        # Use the lagged risk-free rate provided in the test set
        risk_free_rate = test[LAGGED_RISK_FREE_COL][0]
        
        # Simple volatility estimate (replace with a model-based prediction for Step 5 refinement)
        estimated_volatility = self.history_buffer['forward_returns'].std() if len(self.history_buffer) > 1 else 0.005
        
        final_allocation = self._calculate_position_allocation(
            predicted_excess_return, 
            risk_free_rate,
            estimated_volatility
        )

        return final_allocation

# --- API Server Setup (Step 6 & 7) ---

# Instantiate the predictor class. This triggers the entire training/CV pipeline.
predictor = Predictor()

# The final entry point function required by the competition
# RENAMED to 'predict' to satisfy the gateway requirement
def predict(test: pl.DataFrame) -> float:
    return predictor.predict(test)

# Setup the inference server
# The server is instantiated with the standard 'predict' function name.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Run the server based on the environment (local gateway for testing, serve for competition run)
# Step 7: The competition requires running in this environment with Internet disabled.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    # Local test mode (requires the data input path)
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))


--- Professional Pipeline Initialization Started ---
Starting Time-Series Cross-Validation...
Fold 1 Validation Sharpe: -0.0193
Fold 2 Validation Sharpe: 0.1094
Fold 3 Validation Sharpe: 0.1713
Fold 4 Validation Sharpe: 0.1084
Fold 5 Validation Sharpe: 0.0309

Average CV Sharpe: 0.0801

Retraining final model on ALL data (Step 7)...
--- Predictor Initialized Successfully ---
