In [2]:
import optuna
from sklearn.preprocessing import StandardScaler
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import warnings
import pathlib as pl
from typing import Tuple
from typing import List
warnings.filterwarnings("ignore")

# Default compute device for the notebook: CUDA when torch reports it as
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class LSTMModel(nn.Module):
    """Stacked LSTM followed by dropout and a single-output linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between LSTM layers (only when
            more than one layer is used) and before the linear head.
    """
    def __init__(self,input_size,hidden_size=128,num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()

        # Keep the architecture settings on the instance.
        self.input_size, self.hidden_size, self.num_layers = (
            input_size, hidden_size, num_layers
        )

        # nn.LSTM only applies dropout between stacked layers, so it is
        # switched off for a single-layer network.
        inter_layer_dropout = 0 if num_layers <= 1 else dropout
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )

        self.dropout = nn.Dropout(dropout)
        # Regression head mapping the hidden state to one value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self,x):
        """Return one prediction per sequence from the last time step.

        The LSTM is built with ``batch_first=True`` and the output is indexed
        as ``[:, -1, :]``, i.e. the final step along the second axis.
        """
        sequence_out = self.lstm(x)[0]
        final_step = sequence_out[:, -1, :]
        return self.fc(self.dropout(final_step))


class GRUModel(nn.Module):
    """Stacked GRU followed by dropout and a single-output linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers (only when
            more than one layer is used) and before the linear head.
    """
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        # Inter-layer dropout is only meaningful for a stacked network.
        inter_layer_dropout = 0 if num_layers <= 1 else dropout
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per sequence from the last time step."""
        sequence_out = self.gru(x)[0]
        final_step = sequence_out[:, -1, :]
        return self.fc(self.dropout(final_step))
    

    
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a feature matrix and its targets.

    Sample ``i`` is the pair ``(X[i:i + sequence_length], y[i + sequence_length])``:
    a window of ``sequence_length`` consecutive rows and the target of the row
    immediately after the window.

    Args:
        X: Feature rows, convertible with ``torch.FloatTensor``.
        y: Targets, one per row of ``X``, convertible with ``torch.FloatTensor``.
        sequence_length: Number of consecutive rows in each window.
    """
    def __init__(self, X, y, sequence_length=10):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        self.sequence_length = sequence_length

    def __len__(self):
        # A series shorter than one window has no samples; clamp at zero
        # because ``len()`` raises ValueError on a negative result.
        return max(0, len(self.X) - self.sequence_length)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for sample ``idx``.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of length {size}")
        stop = idx + self.sequence_length
        return self.X[idx:stop], self.y[stop]
    


class PyTrainer:
    """Train and evaluate a sequence-regression model on windowed data.

    The trainer owns a ``StandardScaler`` for the input features and runs an
    Adam / MSE training loop with ``ReduceLROnPlateau`` scheduling and early
    stopping on the validation loss.

    Args:
        model: The ``nn.Module`` to train; it is moved to ``device``.
        device: Device used for the model and for every batch.
    """

    def __init__(self, model, device=device):
        self.model = model.to(device)
        self.device = device
        self.scaler = StandardScaler()

    def prepare_data(self, X, y, sequence_length=10, batch_size=32, fit=True):
        """Scale the features and wrap them in a non-shuffled DataLoader.

        Args:
            X: Feature rows accepted by the scaler.
            y: Targets, one per row of ``X``.
            sequence_length: Window length passed to ``TimeSeriesDataset``.
            batch_size: Batch size of the returned loader.
            fit: When True (default) the scaler is refitted on ``X``. Pass
                False to scale validation/test data with the statistics that
                were fitted on the training data.

        Returns:
            ``(dataloader, scaler)``.
        """
        X_scaled = self.scaler.fit_transform(X) if fit else self.scaler.transform(X)

        dataset = TimeSeriesDataset(X_scaled, y, sequence_length)
        # Refer to the torch class through its module: this file also defines
        # an unrelated class named ``DataLoader`` that can shadow the import.
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=False
        )

        return dataloader, self.scaler

    def __mae(self, y_pred, y_true):
        """Mean absolute error of two tensors of identical shape."""
        return torch.mean(torch.abs(y_pred - y_true))

    def __rmse(self, y_pred, y_true):
        """Root mean squared error of two tensors of identical shape."""
        return torch.sqrt(torch.mean((y_pred - y_true) ** 2))

    def __train_epoch(self, model, dataloader, optimizer, criterion):
        """Run one optimisation pass and return the mean per-batch loss."""
        model.train()
        total_loss = 0

        for X, y in dataloader:
            batch_X, batch_y = X.to(self.device), y.to(self.device)
            optimizer.zero_grad()

            y_pred = model(batch_X)
            # Drop only the trailing output axis; a bare squeeze() would also
            # remove the batch axis when a batch holds a single sample.
            loss = criterion(y_pred.squeeze(-1), batch_y)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        return total_loss / len(dataloader)

    def __evaluate(self, model, dataloader, optimizer, criterion):
        """Return ``(mean batch loss, MAE, RMSE)`` over ``dataloader``.

        ``optimizer`` is accepted for signature compatibility but not used:
        no gradients are produced under ``torch.no_grad()``.
        """
        model.eval()
        total_loss = 0
        y_preds, y_trues = [], []

        with torch.no_grad():
            for X, y in dataloader:
                batch_X, batch_y = X.to(self.device), y.to(self.device)

                y_pred = model(batch_X).squeeze(-1)
                total_loss += criterion(y_pred, batch_y).item()
                # Collect both on the CPU with matching shapes so the metric
                # subtraction is element-wise rather than an (N, 1) - (N,)
                # broadcast to an (N, N) matrix.
                y_preds.append(y_pred.cpu())
                y_trues.append(batch_y.cpu())

        y_preds = torch.cat(y_preds)
        y_trues = torch.cat(y_trues)

        return (
            total_loss / len(dataloader),
            self.__mae(y_preds, y_trues).item(),
            self.__rmse(y_preds, y_trues).item()
        )

    def train(self, train_loader,
                val_loader,
                epochs,
                lr,
                patience):
        """Train the model, optionally validating after every epoch.

        Args:
            train_loader: Loader of training batches.
            val_loader: Loader of validation batches, or None to skip
                validation, scheduling and early stopping.
            epochs: Maximum number of epochs.
            lr: Initial Adam learning rate.
            patience: Epochs without validation improvement before stopping;
                half of it is used as the scheduler patience.

        Returns:
            ``(train_losses, val_losses, val_lrs, val_maes, val_rmses)``, one
            entry per completed epoch (validation lists are empty when
            ``val_loader`` is None).

        Raises:
            ValueError: If a provided loader yields no batches.
        """
        if len(train_loader) == 0:
            raise ValueError("train_loader is empty: not enough rows for one sequence")
        if val_loader is not None and len(val_loader) == 0:
            raise ValueError("val_loader is empty: not enough rows for one sequence")

        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        # The scheduler compares an integer counter with ``patience``, so the
        # floor division behaves exactly like the former ``patience / 2``.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=patience // 2, factor=0.5
        )
        best_val_loss = float('inf')
        patience_counter = 0

        train_losses = []
        val_losses = []
        val_maes = []
        val_rmses = []
        val_lrs = []

        for epoch in range(epochs):
            train_loss = self.__train_epoch(self.model, train_loader, optimizer, criterion)
            train_losses.append(train_loss)

            if val_loader is None:
                if epoch % 2 == 0:
                    print(f"Epoch {epoch}: Train Loss: {train_loss:.6f}")
                continue

            val_loss, val_mae, val_rmse = self.__evaluate(
                self.model, val_loader, optimizer, criterion
            )
            val_losses.append(val_loss)
            val_maes.append(val_mae)
            val_rmses.append(val_rmse)
            # Record the rate used for this epoch, before the scheduler may
            # lower it for the next one.
            current_lr = optimizer.param_groups[0]["lr"]
            val_lrs.append(current_lr)
            scheduler.step(val_loss)

            # Early stopping bookkeeping.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

            if epoch % 2 == 0:
                print(
                    f"Epoch {epoch+1:02d} | "
                    f"LR: {current_lr:.1e} | "
                    f"Train Loss: {train_loss:.4f} | "
                    f"Val Loss: {val_loss:.4f} | "
                    f"MAE: {val_mae:.4f} | "
                    f"RMSE: {val_rmse:.4f}"
                )

        return train_losses, val_losses , val_lrs, val_maes, val_rmses
        
    

class DeepLearningStudy:
    """Optuna-driven hyperparameter search for the LSTM and GRU models.

    Args:
        n_trials: Maximum number of trials per optimisation run.
        timeout: Time budget in seconds passed to ``study.optimize``.

    Attributes set by the search:
        best_params / best_scores: dicts keyed by model type.
        study: The most recently created Optuna study.
    """

    def __init__(self, n_trials=100, timeout=3600):
        self.n_trials = n_trials
        self.timeout = timeout
        self.best_params = {}
        self.best_scores = {}
        self.study = None

    def create_study(self, study_name, direction='minimize'):
        """Create (and remember) an Optuna study with a seeded TPE sampler."""
        sampler = TPESampler(seed=42)
        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=10)

        self.study = optuna.create_study(
            study_name=study_name,
            direction=direction,
            sampler=sampler,
            pruner=pruner
        )

        return self.study

    def get_model_params(self, model_type, trial):
        """Sample hyperparameters for ``model_type`` ('lstm' or 'gru').

        Raises:
            ValueError: If ``model_type`` is not recognised.
        """
        if model_type == 'lstm':
            return self.suggest_lstm_params(trial)
        elif model_type == 'gru':
            return self.suggest_gru_params(trial)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    def _suggest_recurrent_params(self, trial):
        """Search space shared by the LSTM and GRU models."""
        return {
            'hidden_size': trial.suggest_categorical('hidden_size', [64, 128, 256, 512]),
            'num_layers': trial.suggest_int('num_layers', 1, 4),
            'dropout': trial.suggest_float('dropout', 0.1, 0.5),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
            'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64, 128]),
            'sequence_length': trial.suggest_int('sequence_length', 10, 30),
            'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
        }

    def suggest_lstm_params(self, trial):
        """Suggest hyperparameters for LSTM model"""
        return self._suggest_recurrent_params(trial)

    def suggest_gru_params(self, trial):
        """Suggest hyperparameters for GRU model"""
        return self._suggest_recurrent_params(trial)

    def optimize_model(self, model_type, X_train, y_train, X_val, y_val,
                      input_size, study_name=None, max_epochs=50, patience=10):
        """Run the hyperparameter search for one model type.

        Stores the best parameters and score under ``model_type`` and returns
        the finished study. ``study_name`` defaults to
        ``"<model_type>_optimization"``.
        """
        if study_name is None:
            study_name = f"{model_type}_optimization"

        study = self.create_study(study_name, direction='minimize')

        study.optimize(
            lambda trial: self.objective_function(
                trial, model_type, X_train, y_train, X_val, y_val,
                input_size, max_epochs, patience
            ),
            n_trials=self.n_trials,
            timeout=self.timeout
        )

        self.best_params[model_type] = study.best_params
        self.best_scores[model_type] = study.best_value

        print(f"\n{model_type.upper()} Optimization Results:")
        print(f"Best Score: {study.best_value:.6f}")
        print(f"Best Parameters: {study.best_params}")

        return study

    def create_model(self, model_type, input_size, params):
        """Instantiate the model for ``model_type`` from sampled ``params``.

        Raises:
            ValueError: If ``model_type`` is not recognised.
        """
        if model_type == 'lstm':
            model_cls = LSTMModel
        elif model_type == 'gru':
            model_cls = GRUModel
        else:
            raise ValueError(f"Unknown model type: {model_type}")
        return model_cls(
            input_size=input_size,
            hidden_size=params['hidden_size'],
            num_layers=params['num_layers'],
            dropout=params['dropout']
        )

    def objective_function(self, trial, model_type, X_train, y_train, X_val, y_val,
                          input_size, max_epochs=50, patience=10):
        """Train one candidate and return its best validation loss.

        Any failure inside the trial is reported and scored as ``inf`` so a
        single bad configuration does not abort the whole study.
        """
        try:
            params = self.get_model_params(model_type, trial)
            model = self.create_model(model_type, input_size, params)
            trainer = PyTrainer(model)

            # Fits the trainer's scaler on the training features.
            train_loader, scaler = trainer.prepare_data(
                X_train, y_train,
                params['sequence_length'],
                params['batch_size']
            )

            # Scale validation data with the statistics fitted on the training
            # set instead of refitting the scaler on the validation set.
            val_dataset = TimeSeriesDataset(
                scaler.transform(X_val), y_val, params['sequence_length']
            )
            val_loader = torch.utils.data.DataLoader(
                val_dataset, batch_size=params['batch_size'], shuffle=False
            )

            # NOTE: the sampled 'weight_decay' is not forwarded to training here.
            history = trainer.train(
                train_loader,
                val_loader=val_loader,
                epochs=max_epochs,
                lr=params['learning_rate'],
                patience=patience
            )
            # train() returns the validation losses as its second element;
            # indexing avoids depending on the total number of returned lists.
            val_losses = history[1]

            best_val_loss = min(val_losses) if val_losses else float('inf')

            trial.report(best_val_loss, step=len(val_losses))

            return best_val_loss

        except optuna.TrialPruned:
            # Let Optuna handle pruning itself rather than scoring it as inf.
            raise
        except Exception as e:
            print(f"Trial failed: {str(e)}")
            return float('inf')

    def optimize_all_models(self, X_train, y_train, X_val, y_val, input_size,
            model_types=('lstm', 'gru'), max_epochs=50, patience=10):
        """Run :meth:`optimize_model` for every entry of ``model_types``.

        Returns:
            dict mapping each successfully optimised model type to its
            ``study``, ``best_params`` and ``best_score``. Model types whose
            optimisation raised are reported and skipped.
        """
        results = {}

        for model_type in model_types:
            print(f"\n{'='*50}")
            print(f"Optimizing {model_type.upper()} model...")
            print(f"{'='*50}")

            try:
                # Keyword arguments keep max_epochs out of the study_name slot.
                study = self.optimize_model(
                    model_type, X_train, y_train, X_val, y_val,
                    input_size, max_epochs=max_epochs, patience=patience
                )
                results[model_type] = {
                    'study': study,
                    'best_params': study.best_params,
                    'best_score': study.best_value
                }
            except Exception as e:
                print(f"Error optimizing {model_type}: {str(e)}")
                continue

        return results

In [10]:
class DataLoader:
    """Locate and load the CSV data files of the project.

    NOTE: this class has the same name as ``torch.utils.data.DataLoader``,
    which this file also imports; whichever is defined or imported last is
    the one the bare name ``DataLoader`` refers to.

    Args:
        data_path: Directory containing ``train.csv``, ``test.csv``,
            ``train_labels.csv`` and ``target_pairs.csv``.
    """
    def __init__(self, data_path: str="../data/"):
        root = pl.Path(data_path)
        self.datapath = data_path
        self.train_path = root / "train.csv"
        self.test_path = root / "test.csv"
        self.train_labels_path = root / "train_labels.csv"
        self.target_pairs_path = root / "target_pairs.csv"

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Read the four CSV files.

        Returns:
            ``(train, test, train_labels, target_pairs)``; the first three
            are indexed by their ``date_id`` column.
        """
        by_date = [
            pd.read_csv(csv_path, index_col='date_id')
            for csv_path in (self.train_path, self.test_path, self.train_labels_path)
        ]
        pairs = pd.read_csv(self.target_pairs_path)
        return (*by_date, pairs)

class PreProcess:
    """Reshape the wide price table into one long table per (date, instrument)."""

    # Output field -> values of the parsed ``Type`` that feed it.
    _FIELD_TYPES = {
        'close': ['Close', 'adjusted close'],
        'open': ['Open', 'adjusted open'],
        'high': ['High', 'adjusted high'],
        'low': ['Low', 'adjusted low'],
        'volume': ['Volume', 'adjusted volume'],
        'sprice': ['settlement price', 'adjusted settlement price'],
        'interest': ['open interest', 'adjusted open interest'],
    }
    _OUTPUT_COLUMNS = ['date', 'id', 'close', 'open', 'high', 'low',
                       'volume', 'sprice', 'interest']

    def __init__(self):
        pass

    def get_train_info(self, df):
        """Describe every column of ``df`` by parsing its name.

        Column names are split on ``_`` (after protecting multi-word tokens
        such as ``open_interest``) into a category (first token), a ticker
        (middle tokens) and a type (last token).

        Args:
            df (pd.DataFrame): Input training dataframe.
        Returns:
            pd.DataFrame: Columns ``Column_Id``, ``Column``, ``Category``,
            ``Ticker`` and ``Type``, one row per column of ``df``.
        """
        df_names = df.columns

        def clean_and_split(name):
            # Protect multi-word tokens before splitting on "_". The order
            # matters: "adj_close" must be rewritten before the generic
            # "adj_" prefix.
            name = name.replace("open_interest", "open interest")
            name = name.replace("settlement_price", "settlement price")
            name = name.replace("US_Stock", "US Stock")
            name = name.replace("adj_close", "Close")
            name = name.replace("adj_", "adjusted ")
            name = name.replace("-", "_")
            return name.split("_")

        df_info = pd.DataFrame(
            {
            'Column': df_names,
            'Split': [clean_and_split(name) for name in df_names]
        })

        df_info['Category'] = df_info['Split'].apply(lambda x: x[0])
        df_info['Ticker'] = df_info['Split'].apply(
        lambda x: "_".join(x[1:-1]) if len(x) > 2 else x[-1] if len(x) == 2 else ""
        )
        df_info['Type'] = df_info['Split'].apply(lambda x: x[-1])

        # A single-token name has no ticker: fall back to its type.
        df_info['Ticker'] = df_info['Ticker'].where(df_info['Ticker'] != "", df_info['Type'])
        df_info['Column_Id'] = df_info.index + 1

        df_info = df_info[['Column_Id', 'Column', 'Category', 'Ticker', 'Type']]
        return df_info

    def get_preprocess_data(self, df, cond):
        """Return the ``df`` column named by the first row of ``cond``.

        Returns None when ``cond`` (a slice of the column-info table) is empty.
        """
        if cond.Column.size > 0:
            return df[cond.Column.values[0]]
        else:
            return None

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert the wide table to long format.

        Every (Category, Ticker) pair found in the column names becomes an
        instrument ``id`` (``"<Category>_<Ticker>"`` with spaces replaced by
        underscores). ``FX`` and ``LME`` instruments only provide ``close``;
        other categories fill each output field from the column whose parsed
        type matches, or None when there is no such column. An ``is_score``
        column is dropped if present.

        Args:
            df: Wide dataframe whose index is named ``date_id``.
        Returns:
            Long dataframe with columns ``date, id, close, open, high, low,
            volume, sprice, interest`` sorted by ``date`` then ``id``.
        """
        df_processed = df.copy()
        df_processed = df_processed.drop(columns=['is_score'], errors='ignore')
        df_info = self.get_train_info(df_processed)

        # Expose the date_id index as a regular 'date' column.
        df_processed = df_processed.reset_index()
        df_processed = df_processed.rename({'date_id': 'date'}, axis='columns')

        # One frame per instrument, concatenated once at the end instead of
        # growing the result inside the loop.
        frames = []
        for (category, ticker), group in df_info.groupby(['Category', 'Ticker']):
            frame = pd.DataFrame()
            frame['date'] = df_processed['date']
            frame['id'] = f"{category.replace(' ', '_')}_{ticker}"

            if category in ['FX', 'LME']:
                frame['close'] = df_processed[group.Column.values[0]]
                for field in self._OUTPUT_COLUMNS[3:]:
                    frame[field] = None
            else:
                for field, accepted_types in self._FIELD_TYPES.items():
                    frame[field] = self.get_preprocess_data(
                        df_processed, group[group.Type.isin(accepted_types)]
                    )
            frames.append(frame)

        if not frames:
            return pd.DataFrame(columns=self._OUTPUT_COLUMNS)

        result = pd.concat(frames, ignore_index=True)[self._OUTPUT_COLUMNS]
        result = result.sort_values(['date', 'id']).reset_index(drop=True)

        return result
    
class FeatureEngineer:
    """Build lag and rolling-window features per instrument ``id``."""

    # Suffix stem -> name of the rolling aggregation method.
    _ROLLING_STATS = (
        ('rollmean', 'mean'),
        ('rollstd', 'std'),
        ('rollmin', 'min'),
        ('rollmax', 'max'),
    )

    def __init__(self):
        pass

    @staticmethod
    def _value_columns(df: pd.DataFrame, date_col: str) -> List[str]:
        """Columns to derive features from, in their original order.

        A list (not a set) keeps the generated feature order deterministic
        from run to run.
        """
        return [col for col in df.columns if col not in ('id', date_col)]

    def add_lag_features(self,
        df: pd.DataFrame,
        lags: List[int],
        date_col: str = 'date'
        ) -> pd.DataFrame:
        """Add ``<col>_lag<k>`` columns holding each value shifted by ``k`` rows.

        The frame is sorted by ``date_col`` and the shift is applied within
        each ``id`` group, for every column other than ``id`` and ``date_col``.

        Returns:
            A new dataframe sorted by ``date_col`` with the lag columns appended.
        """
        df = df.sort_values(date_col)
        grouped = df.groupby('id')
        lagged = [
            grouped[col].shift(lag).rename(f'{col}_lag{lag}')
            for col in self._value_columns(df, date_col)
            for lag in lags
        ]
        # Append all new columns at once rather than inserting one by one.
        return pd.concat([df, *lagged], axis=1) if lagged else df

    def add_rolling_features(self,
        df: pd.DataFrame,
        windows: List[int],
        date_col: str = 'date') -> pd.DataFrame:
        """Add rolling mean, std, min and max columns for every window size.

        Statistics are computed within each ``id`` group, after sorting by
        ``date_col``, with ``min_periods=1``. New columns are named
        ``<col>_rollmean<w>``, ``<col>_rollstd<w>``, ``<col>_rollmin<w>``
        and ``<col>_rollmax<w>``.

        Returns:
            A new dataframe sorted by ``date_col`` with the rolling columns
            appended.
        """
        df = df.sort_values(date_col)
        grouped = df.groupby('id')
        rolled = []
        for col in self._value_columns(df, date_col):
            for window in windows:
                for stem, method in self._ROLLING_STATS:
                    feature = grouped[col].transform(
                        lambda x: getattr(x.rolling(window, min_periods=1), method)()
                    )
                    rolled.append(feature.rename(f'{col}_{stem}{window}'))
        return pd.concat([df, *rolled], axis=1) if rolled else df

    def prepare_features(self, df: pd.DataFrame) ->pd.DataFrame:
        """Add lag then rolling features and fill the remaining gaps.

        Rolling features are computed after the lag features, so they are
        also derived from the lag columns. Missing values are forward-filled,
        then back-filled, then set to 0.

        NOTE(review): the fill runs over the whole date-sorted frame, not per
        ``id``, so a gap can be filled from a neighbouring instrument's row —
        confirm this is intended.

        Raises:
            Exception: Any error from the steps above is printed and re-raised.
        """
        try:
            df_result = df.copy()
            df_result = self.add_lag_features(df_result, lags=[1, 2, 3, 5, 7])
            df_result = self.add_rolling_features(df_result, windows=[5, 10, 15])
            # ffill()/bfill() replace the deprecated fillna(method=...) form.
            df_result = df_result.ffill().bfill().fillna(0)

            return df_result
        except Exception as e:
            print(f"Feature preparation failed: {e}")
            raise
        
class FeatureTarget:
    """Class to handle target feature engineering"""
    def __init__(self):
        pass

    def prepare_targets(self, train_labels_df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """Select the target columns of the labels table.

        Every column except ``timestamp`` and ``id`` is treated as a target.
        The list of target names is also shown, using the notebook
        ``display`` helper when it exists and ``print`` otherwise.

        Args:
            train_labels_df (pd.DataFrame): Labels table.
        Returns:
            ``(target_values, target_cols)``: the dataframe restricted to the
            target columns, and the list of their names.
        """
        target_cols = [col for col in train_labels_df.columns if col not in ['timestamp', 'id']]
        try:
            show = display  # only defined inside IPython/Jupyter sessions
        except NameError:
            show = print
        show(target_cols)
        target_values = train_labels_df[target_cols]
        return target_values, target_cols

    def prepare_targets_info(self, pairs: pd.DataFrame) -> pd.DataFrame:
        """Split each ``pair`` definition into its two price legs.

        Args:
            pairs (pd.DataFrame): DataFrame containing a 'pair' column whose
                values look like ``"A - B"`` or just ``"A"``.
        Returns:
            pd.DataFrame: Copy of ``pairs`` without 'pair' and with the extra
            columns ``price_1``, ``price_2`` (missing for single-leg targets)
            and the boolean ``is_pair``.
        """
        target_definitions = pairs["pair"].str.split(" - ", expand=True)
        # Guarantee both leg columns exist even when no row contains " - ".
        target_definitions = target_definitions.reindex(columns=[0, 1])
        target_info = pairs.copy()

        target_info["price_1"] = target_definitions[0]
        target_info["price_2"] = target_definitions[1]

        # A target is a pair when it has a second leg; notna() covers both
        # None and NaN missing markers.
        target_info['is_pair'] = target_info['price_2'].notna()

        target_info = target_info.drop(columns=["pair"])
        return target_info

In [11]:
# Instantiate the pipeline helpers. DataLoader here must be the CSV loader
# class defined in this notebook (default data directory "../data/"), not
# torch's DataLoader.
dataLoader = DataLoader()
featureTarget = FeatureTarget()
featureEngineer = FeatureEngineer()
preProcess = PreProcess()
# Read the train, test, label and target-pair CSV files.
train_df, test_df, train_labels_df, target_pairs_df = dataLoader.load_data()
# Reshape both wide tables into the long (date, id, price fields) layout.
train_df_process = preProcess.preprocess(train_df)
test_df_process = preProcess.preprocess(test_df)

In [None]:
# Add lag and rolling-window features to the preprocessed train/test tables.
train_df_feature = featureEngineer.prepare_features(train_df_process)
test_df_feature = featureEngineer.prepare_features(test_df_process)
# `display` is not defined in this file; presumably the IPython/Jupyter
# rich-output helper.
display(train_df_feature)

In [None]:
# Extract the target columns (and their names) from the labels table.
target_values,target_cols = featureTarget.prepare_targets(train_labels_df)
display(target_values)

In [None]:
from torch.utils.data import Dataset, DataLoader
# Re-select the compute device for this cell: CUDA when available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch.optim as optim
import matplotlib.pyplot as plt

def plot_training_curves(train_losses, val_losses, target_name, model_name, save_path=None):
    """Plot the training and validation loss curves of one run.

    The left panel shows both loss curves per epoch. When ``val_losses`` is
    non-empty, the right panel shows ``train loss - validation loss``.

    Args:
        train_losses: Training loss per epoch.
        val_losses: Validation loss per epoch (may be empty); expected to
            have the same length as ``train_losses`` when provided.
        target_name: Target label used in the titles.
        model_name: Model label used in the titles.
        save_path: Optional path; when given the figure is saved there.

    Returns:
        The figure that was drawn.
    """
    # Keep a handle on the figure: after plt.show() the "current figure"
    # may no longer be the one drawn here.
    fig = plt.figure(figsize=(12, 5))

    epochs = range(len(train_losses))

    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, 'b-', label='Training Loss', alpha=0.8)
    if val_losses:
        plt.plot(epochs, val_losses, 'r-', label='Validation Loss', alpha=0.8)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(f'{target_name} - {model_name}\nTraining Curves')
    plt.legend()
    plt.grid(True, alpha=0.3)

    if val_losses:
        plt.subplot(1, 2, 2)
        plt.plot(epochs, np.array(train_losses) - np.array(val_losses), 'g-', alpha=0.8)
        plt.xlabel('Epoch')
        plt.ylabel('Train Loss - Val Loss')
        plt.title(f'{target_name} - {model_name}\nLoss Difference')
        plt.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

    return fig

def plot_evaling_curves(val_lrs, val_mae, val_rmse, target_name, model_name, save_path=None):
    """Plot the per-epoch learning rate, validation MAE and validation RMSE.

    Args:
        val_lrs: Learning rate recorded at each validated epoch.
        val_mae: Validation MAE per epoch.
        val_rmse: Validation RMSE per epoch.
        target_name: Target label used in the figure title.
        model_name: Model label used in the figure title.
        save_path: Optional path; when given the figure is saved there.

    Returns:
        The figure that was drawn.
    """
    epochs = range(len(val_lrs))

    fig, ax = plt.subplots(1,3,figsize=(16, 6))
    panels = (
        (val_lrs, 'b-', 'Rate learnning'),
        (val_mae, 'r-', 'MAE'),
        (val_rmse, 'g-', 'RMSE'),
    )
    for axis, (series, line_style, title) in zip(ax, panels):
        axis.plot(epochs, series, line_style, label=title, alpha=0.8)
        axis.grid(True, alpha=0.3)
        axis.set_facecolor('#f0f0ff')
        axis.set_title(title)

    plt.suptitle(f'{target_name} - {model_name}\nEval')

    plt.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

    # Return the figure created above; the "current figure" after
    # plt.show() may be a different, empty one.
    return fig