In [1]:
# === Full Kaggle notebook cell: Voting ensemble with stabilized MLP ===

import os, glob, warnings
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree             import DecisionTreeRegressor
from sklearn.neural_network   import MLPRegressor
from sklearn.linear_model     import LinearRegression
from sklearn.ensemble         import VotingRegressor
from sklearn.preprocessing    import StandardScaler
from sklearn.compose          import ColumnTransformer
from sklearn.pipeline         import Pipeline
from sklearn.base             import BaseEstimator, RegressorMixin
from sklearn.metrics          import mean_squared_error, mean_absolute_error

import joblib

# Suppress overflow/runtime warnings from MLP during training
warnings.filterwarnings("ignore", category=RuntimeWarning)

# 1. Define paths
DATA_DIR  = Path('/kaggle/input/dsebd')
MODEL_DIR = Path('/kaggle/working/models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# 2. Load & merge price JSON files
price_files = sorted(glob.glob(str(DATA_DIR / 'prices_*.json')))
df_list = []
for fp in price_files:
    tmp = pd.read_json(fp, orient='records')
    tmp = tmp.rename(columns={
        'trading_code': 'ticker',
        'closing_price': 'close',
        'yesterdays_closing_price': 'prev_close'
    })
    df_list.append(tmp)
df = pd.concat(df_list, ignore_index=True)

# 3. Feature engineering
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['ticker','date']).reset_index(drop=True)
df['close_lag1']  = df.groupby('ticker')['close'].shift(1)
df['roll_mean_7'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(7).mean())
df['roll_std_7']  = df.groupby('ticker')['close'].transform(lambda x: x.rolling(7).std())
df = df.dropna(subset=['close_lag1','roll_mean_7','roll_std_7'])

# 4. Prepare X, y
y = df['close'].values
X = df.drop(columns=['close','prev_close','date','ticker'])
feature_names = X.columns.tolist()
X = X.values  # convert to numpy array for ColumnTransformer

# 5. Train/test split
mask = df['date'] < '2022-01-01'
X_train, X_test = X[mask.values], X[~mask.values]
y_train, y_test = y[mask.values], y[~mask.values]

# 6. Preprocessing: scale all features
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), list(range(len(feature_names))))
], remainder='drop')

# 7. Define Relative Regression on lag feature
class RelReg(BaseEstimator, RegressorMixin):
    def __init__(self, idx):
        self.idx = idx
        self.lr = LinearRegression()
    def fit(self, X, y):
        self.lr.fit(X[:, [self.idx]], y)
        return self
    def predict(self, X):
        return self.lr.predict(X[:, [self.idx]])

idx_close_lag1 = feature_names.index('close_lag1')

# 8. Build pipelines for each learner

# 8a. Decision Tree
dt_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', DecisionTreeRegressor(
        criterion='squared_error',
        max_depth=20,
        random_state=42
    ))
])

# 8b. Neural Net with stabilized settings
mlp_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', MLPRegressor(
        hidden_layer_sizes=(100,),
        solver='adam',
        learning_rate_init=0.01,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        max_iter=500,
        random_state=42
    ))
])

# 8c. Relative Regression
rr_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', RelReg(idx=idx_close_lag1))
])

# 9. Combine via VotingRegressor
voter = VotingRegressor([
    ('dt',  dt_pipe),
    ('mlp', mlp_pipe),
    ('rr',  rr_pipe)
], n_jobs=-1)

# 10. Train the ensemble
voter.fit(X_train, y_train)

# 11. Evaluate
y_pred = voter.predict(X_test)
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"Test MAE : {mean_absolute_error(y_test, y_pred):.4f}")

# 12. Save the model
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
model_path = MODEL_DIR / f"voting_ensemble_{ts}.pkl"
joblib.dump(voter, model_path)
print("Saved model to:", model_path)

# 13. Example: load & predict
loaded = joblib.load(model_path)
print("Sample preds:", loaded.predict(X_test[:5]))



Test RMSE: 16739.8530
Test MAE : 587.7222
Saved model to: /kaggle/working/models/voting_ensemble_20250507_054030.pkl
Sample preds: [20.06570622 24.84468384 21.27151888 23.17871253 28.62366192]
