In [32]:
# ============================================
# HULL TACTICAL MARKET PREDICTION - KAGGLE SUBMISSION
# ============================================

import pandas as pd
import numpy as np
import polars as pl
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge

# LightGBM is an optional dependency: record whether it could be imported so
# the training code can fall back to scikit-learn when it is missing.
try:
    import lightgbm as lgb
except ImportError:
    HAS_LGB = False
else:
    HAS_LGB = True

print('Libraries loaded!')
print(f'LightGBM available: {HAS_LGB}')

# ============================================
# CONFIGURATION
# ============================================
# Lower / upper bound used when clipping portfolio weights.
MIN_INVESTMENT, MAX_INVESTMENT = 0, 2

# Use the Kaggle competition input folder when the Kaggle input mount exists,
# otherwise read the CSV files from the current working directory.
DATA_DIR = (
    '/kaggle/input/hull-tactical-market-prediction'
    if os.path.exists('/kaggle/input')
    else '.'
)

print(f'Data directory: {DATA_DIR}')

# ============================================
# FEATURE ENGINEERING CLASS
# ============================================
class FeatureEngineer:
    """Median-impute and then standardise a fixed list of feature columns.

    The column list is captured by ``fit`` and reused by every later
    ``transform`` call, so train and test data go through the same columns
    and the same fitted imputer/scaler.
    """

    def __init__(self):
        # Column names are unknown until fit() is called.
        self.feature_cols = None
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')

    def fit(self, df, feature_cols):
        """Fit the imputer and scaler on ``df[feature_cols]``.

        Args:
            df: DataFrame containing at least the columns in ``feature_cols``.
            feature_cols: Column names to use as model features.

        Returns:
            This instance, so calls can be chained.
        """
        self.feature_cols = feature_cols
        # The scaler is fitted on the imputed matrix, not on the raw values.
        filled = self.imputer.fit_transform(df[feature_cols].values)
        self.scaler.fit(filled)
        return self

    def transform(self, df):
        """Return the imputed and scaled feature matrix for ``df``.

        Uses the columns remembered by ``fit``; ``df`` must contain them.
        """
        raw = df[self.feature_cols].values
        return self.scaler.transform(self.imputer.transform(raw))

    def fit_transform(self, df, feature_cols):
        """Fit on ``df[feature_cols]`` and return its transformed matrix."""
        return self.fit(df, feature_cols).transform(df)

# ============================================
# TRADING STRATEGY CLASS
# ============================================
class TradingStrategy:
    """Convert raw model predictions into portfolio weights.

    Supported ``method`` values:

    * ``'sigmoid'``   - ``2 / (1 + exp(-scale * p))``; a prediction of 0 maps
      to a weight of 1, and the curve saturates towards 0 and 2.
    * ``'threshold'`` - 1.5 where the prediction is positive, otherwise 0.5.
    * anything else   - a constant weight of 1 for every prediction.

    Every result is clipped to ``[MIN_INVESTMENT, MAX_INVESTMENT]``.
    """

    def __init__(self, method='sigmoid', scale=100):
        self.method = method
        self.scale = scale

    def predict_to_weight(self, predictions):
        """Map ``predictions`` to weights using the configured method.

        Args:
            predictions: Array-like of predicted returns. Plain lists and
                tuples are accepted and converted to a NumPy array first.

        Returns:
            Weights with the same shape as ``predictions``, clipped to the
            allowed investment range.
        """
        # A Python list times a negative int is an *empty list*, and a list
        # cannot be compared with ``> 0``; coerce sequences so the arithmetic
        # below is always element-wise.
        if isinstance(predictions, (list, tuple)):
            predictions = np.asarray(predictions)

        if self.method == 'sigmoid':
            # exp() may overflow to inf for strongly negative predictions;
            # that is the intended saturation (weight -> 0), so silence the
            # overflow warning locally instead of relying on global filters.
            with np.errstate(over='ignore'):
                weights = 2 / (1 + np.exp(-self.scale * predictions))
        elif self.method == 'threshold':
            weights = np.where(predictions > 0, 1.5, 0.5)
        else:
            weights = np.ones_like(predictions)

        return np.clip(weights, MIN_INVESTMENT, MAX_INVESTMENT)

# ============================================
# MODEL TRAINING AND PREPARATION
# ============================================

# Read the full training history
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
print(f'Train shape: {train_df.shape}')

# Columns that are never used as model inputs: the row identifier plus the
# return/target series
id_col = 'date_id'
target_cols = ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
exclude_cols = [id_col] + target_cols

# Only the header of the test file is needed, so read just a few rows
test_df_sample = pd.read_csv(f'{DATA_DIR}/test.csv', nrows=5)
test_cols = set(test_df_sample.columns)

# A feature must exist in both train and test and must not be excluded
feature_cols = [
    col for col in train_df.columns
    if col in test_cols and col not in exclude_cols
]
print(f'Number of features: {len(feature_cols)}')

# Drop the sparse early history: find the first row whose number of missing
# features is at most 5% of the feature count and keep every row from that
# row's date_id onward (later rows are kept even if they have more gaps)
threshold = 0.05 * len(feature_cols)
missing_by_date = train_df[feature_cols].isnull().sum(axis=1)
valid_mask = missing_by_date <= threshold
# idxmax on a boolean Series gives the label of the first True value
valid_start_idx = valid_mask.idxmax()
valid_start_date = train_df.loc[valid_start_idx, 'date_id']
train_clean = (
    train_df.loc[train_df['date_id'] >= valid_start_date]
    .copy()
    .reset_index(drop=True)
)
print(f'Training samples after filtering: {len(train_clean)}')

# Fit the preprocessing on the filtered data and build the training arrays
fe = FeatureEngineer()
y_train = train_clean['forward_returns'].values
X_train = fe.fit_transform(train_clean, feature_cols)
print(f'X_train shape: {X_train.shape}')

# Pick the regressor: LightGBM when available, scikit-learn otherwise
if not HAS_LGB:
    model = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=3,
        random_state=42
    )
else:
    model = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        random_state=42,
        verbose=-1
    )

model.fit(X_train, y_train)
print('Model trained!')

# Strategy that turns predicted returns into portfolio weights
strategy = TradingStrategy(method='sigmoid', scale=100)

# ============================================
# PREDICTION FUNCTION FOR KAGGLE API
# ============================================

def predict(test_batch: pl.DataFrame) -> pl.DataFrame:
    """
    Turn one batch of test rows into portfolio weights.

    The batch is converted to pandas, run through the fitted feature
    pipeline (``fe``) and model (``model``), and the predicted returns are
    mapped to weights by ``strategy``.

    Returns a polars DataFrame with two columns: ``date_id`` (taken from the
    batch's ``date_id`` column, or from its first column when that name is
    absent) and ``prediction`` (the weight, clipped to
    [MIN_INVESTMENT, MAX_INVESTMENT]).
    """
    frame = test_batch.to_pandas()

    # Row identifiers: prefer the named column, fall back to column 0
    id_source = frame['date_id'] if 'date_id' in frame.columns else frame.iloc[:, 0]

    # Features -> predicted returns -> weights
    predicted_returns = model.predict(fe.transform(frame))
    allocation = strategy.predict_to_weight(predicted_returns)

    # Final safety clip to the allowed investment range
    allocation = np.clip(allocation, MIN_INVESTMENT, MAX_INVESTMENT)

    return pl.DataFrame({
        'date_id': id_source.values,
        'prediction': allocation
    })

print('Predict function defined!')

# ============================================
# KAGGLE SUBMISSION - INFERENCE SERVER SETUP
# ============================================

import kaggle_evaluation.core.templates
from kaggle_evaluation.default_gateway import DefaultGateway

class HullTacticalInferenceServer(kaggle_evaluation.core.templates.InferenceServer):
    """Custom inference server that wraps our predict function.

    Subclasses the ``InferenceServer`` template from ``kaggle_evaluation``
    and hands the module-level ``predict`` function to the base-class
    constructor.
    """
    
    def __init__(self):
        # Pass the module-level predict() function to the base class.
        super().__init__(predict)
    
    def _get_gateway_for_test(self, data_paths=None, file_share_dir=None):
        """Return a ``DefaultGateway`` built from ``data_paths``.

        ``file_share_dir`` is accepted but not forwarded to the gateway.
        NOTE(review): presumably called by the base class when running a
        local gateway - confirm against the kaggle_evaluation package.
        """
        return DefaultGateway(data_paths)

# ============================================
# RUN INFERENCE SERVER / LOCAL TEST
# ============================================

# Serve predictions when the KAGGLE_IS_COMPETITION_RERUN environment variable
# is set; otherwise predict on the local test.csv and write submission files.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE COMPETITION MODE
    print('Running in Kaggle competition mode...')
    inference_server = HullTacticalInferenceServer()
    inference_server.serve()
else:
    # LOCAL TESTING MODE
    print('Running in local testing mode...')

    # Load the whole test file and push it through predict() as one batch
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
    test_pl = pl.from_pandas(test_df)

    print(f'Test data shape: {test_df.shape}')

    submission = predict(test_pl)
    submission_pd = submission.to_pandas()

    # pyarrow is only needed on this branch, so it is imported here
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Save as parquet (Kaggle format)
    table = pa.Table.from_pandas(submission_pd, preserve_index=False)
    pq.write_table(table, 'submission.parquet')

    # Save as CSV (for reference)
    submission_pd.to_csv('submission.csv', index=False)

    # The check mark is written as an escape so the source stays ASCII; the
    # previous literal was mojibake (UTF-8 bytes of U+2713 read as cp1252).
    print('\n\u2713 Submission saved!')
    print(f'  Total predictions: {len(submission_pd)}')
    print(f'  Weight range: [{submission_pd["prediction"].min():.4f}, {submission_pd["prediction"].max():.4f}]')
    print(f'  Mean weight: {submission_pd["prediction"].mean():.4f}')
    print('\nAll predictions:')
    print(submission_pd)

Libraries loaded!
LightGBM available: True
