In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge

# LightGBM is an optional dependency (available on Kaggle). HAS_LGB records
# whether the import succeeded so the training cell can choose an estimator.
try:
    import lightgbm as lgb
except ImportError:
    HAS_LGB = False
else:
    HAS_LGB = True

print('Libraries loaded!')
print(f'LightGBM available: {HAS_LGB}')

In [None]:
# Configuration
# Lower/upper bounds that every portfolio weight is clipped to.
MIN_INVESTMENT, MAX_INVESTMENT = 0, 2

# Data paths - adjust for Kaggle environment
import os

# The presence of /kaggle/input is used as the "running on Kaggle" signal;
# anywhere else the CSV files are expected in the current working directory.
DATA_DIR = (
    '/kaggle/input/hull-tactical-market-prediction'  # Kaggle environment
    if os.path.exists('/kaggle/input')
    else '.'  # Local environment
)

print(f'Data directory: {DATA_DIR}')

In [None]:
# Load data
# Both CSV files are read from DATA_DIR with pandas' default parsing options.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (f'{DATA_DIR}/train.csv', f'{DATA_DIR}/test.csv')
)

print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

In [None]:
# Feature Engineering Class
class FeatureEngineer:
    """Median-impute and then standardise a fixed set of DataFrame columns.

    The column list is chosen at ``fit`` time and reused by ``transform`` so
    that training and inference data go through the same imputer and scaler.
    """

    def __init__(self):
        self.imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()
        # Set by fit(); None means the instance has not been fitted yet.
        self.feature_cols = None

    def fit(self, df, feature_cols):
        """Fit the imputer and the scaler on ``df[feature_cols]``.

        Args:
            df: DataFrame containing every column named in ``feature_cols``.
            feature_cols: iterable of column names to use as model inputs.

        Returns:
            self, so calls can be chained.
        """
        # Keep a private copy: if the caller later mutates their list, the
        # columns selected in transform() must still match what was fitted.
        self.feature_cols = list(feature_cols)
        X = df[self.feature_cols].values
        X_imputed = self.imputer.fit_transform(X)
        # The scaler is fitted on imputed values so its statistics contain no NaN.
        self.scaler.fit(X_imputed)
        return self

    def transform(self, df):
        """Impute and scale ``df`` using the statistics learned in ``fit``.

        Args:
            df: DataFrame containing every fitted feature column.

        Returns:
            The scaler's output for the selected columns.

        Raises:
            RuntimeError: if called before ``fit`` / ``fit_transform``.
        """
        if self.feature_cols is None:
            # Without this guard the failure is an opaque ``KeyError: None``.
            raise RuntimeError(
                'FeatureEngineer is not fitted; call fit() or fit_transform() first'
            )
        X = df[self.feature_cols].values
        X_imputed = self.imputer.transform(X)
        X_scaled = self.scaler.transform(X_imputed)
        return X_scaled

    def fit_transform(self, df, feature_cols):
        """Fit on ``df[feature_cols]`` and return the transformed values of ``df``."""
        self.fit(df, feature_cols)
        return self.transform(df)

In [None]:
# Trading Strategy Class
class TradingStrategy:
    """Convert raw model predictions into clipped portfolio weights.

    Args:
        method: ``'sigmoid'`` maps predictions through ``2 / (1 + exp(-scale * p))``;
            ``'threshold'`` returns 1.5 for positive predictions and 0.5
            otherwise; any other value yields a constant weight of 1.
        scale: multiplier applied to predictions in the sigmoid method.
        min_weight: lower clip bound; ``None`` uses the module-level
            ``MIN_INVESTMENT``.
        max_weight: upper clip bound; ``None`` uses the module-level
            ``MAX_INVESTMENT``.
    """

    def __init__(self, method='sigmoid', scale=100, min_weight=None, max_weight=None):
        self.method = method
        self.scale = scale
        self.min_weight = min_weight
        self.max_weight = max_weight

    def predict_to_weight(self, predictions):
        """Return the weight for each prediction, clipped to the configured bounds.

        Args:
            predictions: array-like of predicted values. Plain Python
                sequences and scalars are converted to float arrays first.

        Returns:
            Weights with the same shape as ``predictions``.
        """
        # A plain list must be converted before arithmetic: ``-scale * [..]``
        # multiplies a list by a negative int, which is the empty list, so the
        # sigmoid branch would silently return no weights at all.
        # Objects that already speak the array protocol are left untouched.
        if not hasattr(predictions, '__array__'):
            predictions = np.asarray(predictions, dtype=float)

        if self.method == 'sigmoid':
            # exp() overflows to inf for very negative predictions; the weight
            # then correctly evaluates to 0, so only the warning is suppressed.
            with np.errstate(over='ignore'):
                weights = 2 / (1 + np.exp(-self.scale * predictions))
        elif self.method == 'threshold':
            weights = np.where(predictions > 0, 1.5, 0.5)
        else:
            # Unrecognised method: fall back to a constant weight of 1.
            weights = np.ones_like(predictions)

        # Bounds are resolved at call time so the module-level defaults are
        # only looked up when no explicit bound was given.
        lower = MIN_INVESTMENT if self.min_weight is None else self.min_weight
        upper = MAX_INVESTMENT if self.max_weight is None else self.max_weight
        return np.clip(weights, lower, upper)

In [None]:
# Prepare features
id_col = 'date_id'
target_cols = ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
exclude_cols = [id_col, *target_cols]

# Every remaining column of train_df is treated as a model input; the
# original column order is preserved.
feature_cols = train_df.columns[~train_df.columns.isin(exclude_cols)].tolist()
print(f'Number of features: {len(feature_cols)}')

In [None]:
# Filter training data - use only complete data
# Count missing feature values per row; a row is "valid" when at most 5% of
# its features are missing.
missing_by_date = train_df[feature_cols].isnull().sum(axis=1)
threshold = len(feature_cols) * 0.05

valid_mask = missing_by_date <= threshold
if not valid_mask.any():
    # idxmax() on an all-False mask returns the first label, so without this
    # notice the filter would silently keep every row.
    print('Warning: no row meets the missing-value threshold; keeping all training rows')

# idxmax() on a boolean Series gives the label of the first True entry.
valid_start_idx = valid_mask.idxmax()
valid_start_date = train_df.loc[valid_start_idx, 'date_id']

# Keep every row from the first valid date onwards (later rows are kept even
# if they individually exceed the threshold).
train_clean = train_df[train_df['date_id'] >= valid_start_date].copy().reset_index(drop=True)
print(f'Training samples after filtering: {len(train_clean)}')

In [None]:
# Prepare training data
# The regression target is the raw forward return column.
y_train = train_clean['forward_returns'].values

# Fit the imputer/scaler on the filtered training rows and transform them.
fe = FeatureEngineer()
X_train = fe.fit(train_clean, feature_cols).transform(train_clean)

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')

In [None]:
# Train model
# Use LightGBM when it was importable; otherwise fall back to scikit-learn's
# gradient boosting. Both use 200 trees and a fixed seed of 42.
if not HAS_LGB:
    model = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=42)
else:
    model = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        random_state=42,
        verbose=-1,
    )

model.fit(X_train, y_train)
print('Model trained!')

In [None]:
# Prepare test data and predict
# test_df goes through the imputer/scaler already fitted on the training data.
X_test = fe.transform(test_df)
predictions = model.predict(X_test)

# Convert predictions to weights
strategy = TradingStrategy('sigmoid', 100)
weights = strategy.predict_to_weight(predictions)

# Summary of the resulting weights as a quick sanity check.
print(f'Prediction statistics:')
print(f'  Min weight: {weights.min():.4f}')
print(f'  Max weight: {weights.max():.4f}')
print(f'  Mean weight: {weights.mean():.4f}')

In [15]:
# Create submission
# Use pyarrow directly to avoid pandas compatibility issues
import pyarrow as pa
import pyarrow.parquet as pq

submission = pd.DataFrame({'date_id': test_df['date_id'], 'prediction': weights})

# Ensure weights are within valid range
submission['prediction'] = submission['prediction'].clip(
    lower=MIN_INVESTMENT, upper=MAX_INVESTMENT
)

# Save submission as parquet (required by Kaggle); the DataFrame index is
# not written to the file.
table = pa.Table.from_pandas(submission, preserve_index=False)
pq.write_table(table, 'submission.parquet')

# Also save as CSV for local reference
submission.to_csv('submission.csv', index=False)

print('\nSubmission saved!')
print('Files created: submission.parquet, submission.csv')
print(submission.head(10))


Submission saved!
Files created: submission.parquet, submission.csv
   date_id  prediction
0     8980    1.044841
1     8981    0.939873
2     8982    1.004978
3     8983    1.032191
4     8984    1.036507
5     8985    1.051904
6     8986    1.027712
7     8987    1.038318
8     8988    1.024831
9     8989    0.961920


In [16]:
# Verify submission format
# Prints row count, column names, NaN presence and the min/max weight so the
# output can be checked by eye.
print('\nSubmission verification:')
print(f'  Rows: {len(submission)}')
print(f'  Columns: {list(submission.columns)}')
print(f'  Any NaN: {submission.isnull().any().any()}')
print(f'  Weight range: [{submission["prediction"].min():.4f}, {submission["prediction"].max():.4f}]')

# Verify parquet file exists
import os
if not os.path.exists('submission.parquet'):
    print('\n✗ ERROR: submission.parquet not found!')
else:
    print('\n✓ submission.parquet created successfully!')
    parquet_size = os.path.getsize('submission.parquet')
    print(f'  File size: {parquet_size} bytes')


Submission verification:
  Rows: 10
  Columns: ['date_id', 'prediction']
  Any NaN: False
  Weight range: [0.9399, 1.0519]

✓ submission.parquet created successfully!
  File size: 1943 bytes
