In [1]:
import os
import pandas as pd
import polars as pl
from sklearn.linear_model import LinearRegression
import sys

# Make modules that live in the local dataset directory importable.
sys.path.append('dataset')

# --- Model Training ---

# Resolve the training CSV: a local copy takes precedence, otherwise fall back
# to the standard Kaggle input path.
train_path = (
    'dataset/train.csv'
    if os.path.exists('dataset/train.csv')
    else '/kaggle/input/hull-tactical-market-prediction/train.csv'
)

print(f"Loading data from {train_path}...")
try:
    train = pd.read_csv(train_path)
    print("Data loaded.")

    # A feature column is one of the prefix letters below followed only by digits.
    feature_prefixes = ['D', 'E', 'I', 'M', 'P', 'S', 'V']
    features = []
    for col in train.columns:
        # str.startswith accepts a tuple and matches if any prefix matches.
        if col.startswith(tuple(feature_prefixes)) and col[1:].isdigit():
            features.append(col)

    # Design matrix with missing values replaced by 0; target is forward_returns.
    X = train[features].fillna(0)
    y = train['forward_returns']

    # Fit an ordinary least-squares baseline on the selected features.
    print("Training Linear Regression model...")
    model = LinearRegression()
    model.fit(X, y)
    print("Model trained.")

except FileNotFoundError:
    print("Training data not found. Make sure the dataset is available.")
    # A submission environment without direct access to the training data would
    # load a saved model here instead. This baseline assumes the data is
    # reachable, so it only records that no model exists.
    model = None


Loading data from /kaggle/input/hull-tactical-market-prediction/train.csv...
Data loaded.
Training Linear Regression model...
Model trained.


In [2]:
import kaggle_evaluation.default_inference_server
import numpy as np
def predict(test: pl.DataFrame) -> float:
    """
    Score one test batch with the linear model fitted in the training cell.

    The model was trained against ``forward_returns``; its raw output for the
    first row of the batch is clipped to the range [0.0, 2.0] before being
    returned.

    Args:
        test: Polars DataFrame holding the batch to score. Only the first row
            contributes to the returned value.

    Returns:
        The clipped prediction as a plain Python float, or 0.0 when no model
        is available (training data was not found).
    """
    # Without a fitted model there is nothing to score with; return a default.
    if model is None:
        print("Warning: Model not trained, returning 0.0")
        return 0.0

    # Convert polars DataFrame to pandas DataFrame
    test_pd = test.to_pandas()

    # Align the batch to the training feature list. reindex keeps the training
    # column order and creates any feature column absent from the batch as NaN
    # instead of raising KeyError; the NaNs are then filled with 0, the same
    # imputation applied to the training data.
    X_test = test_pd.reindex(columns=features).fillna(0)

    # model.predict expects a 2D input and returns one value per row; take the first.
    prediction = model.predict(X_test)[0]

    # Clip first, then convert, so the caller receives a built-in float rather
    # than a NumPy scalar.
    return float(np.clip(prediction, 0.0, 2.0))

# Wrap the predict callback in the default Kaggle inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if not os.environ.get('KAGGLE_IS_COMPETITION_RERUN'):
    # Not a competition rerun: try a local gateway run against ./dataset.
    local_data_path = 'dataset'
    if not os.path.exists(local_data_path):
        print("Local 'dataset' directory not found for gateway testing.")
    else:
        # The gateway expects a tuple of paths
        print(f"Starting local gateway with data path: {os.path.abspath(local_data_path)}")
        inference_server.run_local_gateway((local_data_path,))
else:
    # KAGGLE_IS_COMPETITION_RERUN is set to a non-empty value: serve predictions.
    inference_server.serve()


Local 'dataset' directory not found for gateway testing.
