# Prédiction
## Auteur Linda MARTIN
On a les parametres pour sauvegarder les models. on peut tester les prédictions. 

In [155]:

### import libraries
import torch
import joblib
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import warnings
import polars as pl
from typing import Tuple
from typing import List
warnings.filterwarnings("ignore")
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch.optim as optim
import gc

In [156]:
class LoaderData:
    """Charge des fichiers de données"""
    def __init__(self, data_path: str="../data/"):
        self.datapath = data_path
        self.train_path = Path(data_path) / "train.csv"
        self.test_path = Path(data_path) / "test.csv"
        self.train_labels_path = Path(data_path) / "train_labels.csv"
        self.target_pairs_path = Path(data_path) / "target_pairs.csv"
        
    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        train_df = pd.read_csv(self.train_path, index_col='date_id')
        test_df = pd.read_csv(self.test_path, index_col='date_id')
        train_labels_df = pd.read_csv(self.train_labels_path, index_col='date_id')
        target_pairs_df = pd.read_csv(self.target_pairs_path)
        return train_df, test_df, train_labels_df, target_pairs_df

class PreProcess:
    """PreProcess - regroupement des données par catégory"""
    def __init__(self):
        pass

    def get_train_info(self, df):
        """ Construction d'une data des entêtes de colonnes.
        Args:
            df (pd.DataFrame): Input dataframe d'entrainement.
        Returns:
            pd.DataFrame: Détail des informations de chaque colonne.
        """
        df_names = df.columns
        # Fonction pour nettoyer et split les noms
        def clean_and_split(name):
            name = name.replace("open_interest", "open interest")
            name = name.replace("settlement_price", "settlement price")
            name = name.replace("US_Stock", "US Stock")
            name = name.replace("adj_close", "Close")
            name = name.replace("adj_", "adjusted ")
            name = name.replace("-", "_")
            return name.split("_")

        # Création du DataFrame d'infos
        df_info = pd.DataFrame(
            {
            'Column': df_names,
            'Split': [clean_and_split(name) for name in df_names]
        })

        df_info['Category'] = df_info['Split'].apply(lambda x: x[0])    
        df_info['Ticker'] = df_info['Split'].apply(
        lambda x: "_".join(x[1:-1]) if len(x) > 2 else x[-1] if len(x) == 2 else ""
        )
        df_info['Type'] = df_info['Split'].apply(lambda x: x[-1])

        # Nettoyage final
        df_info['Ticker'] = df_info.apply(
            lambda row: row['Type'] if row['Ticker'] == "" else row['Ticker'], axis=1
        )
        df_info['Column_Id'] = df_info.index + 1

        # Sélection des colonnes finales
        df_info = df_info[['Column_Id', 'Column', 'Category', 'Ticker', 'Type']]    
        return df_info
    
    def get_preprocess_data(self, df, cond):
        # Fonction pour obtenir les données prétraitées en fonction de la condition
        if cond.Column.size > 0:
            return df[cond.Column.values[0]]
        else:
            return None
        
    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        df_processed = df.copy()
        df_processed = df_processed.drop(columns=['is_scored'], errors='ignore')
        df_info = self.get_train_info(df_processed)


        df_processed.reset_index(inplace=True)
        # On renomme la date_id en date
        df_processed = df_processed.rename({'date_id': 'date'}, axis='columns')
        # Initialisation du DataFrame résultat
        result = pd.DataFrame(columns=['date', 'id', 'close', 'open', 'high', 'low', 'volume', 'sprice', 'interest'])
        
        for  Category  in df_info.groupby('Category').groups.keys():
            txtCategory=Category.replace(' ','_')
            for label in df_info[(df_info.Category==Category)].groupby('Ticker').groups.keys():
                temp_df = pd.DataFrame()
                temp_df['date'] = df_processed['date']
                temp_df['id'] = f'{txtCategory}_{label}'

                if Category in ['FX','LME']:
                    temp_df['close'] = df_processed[df_info[(df_info.Category==Category) & (df_info.Ticker==label)].Column.values[0]]
                    temp_df['open'] = None
                    temp_df['high'] = None
                    temp_df['low'] = None
                    temp_df['volume'] = None
                    temp_df['sprice'] = None
                    temp_df['interest'] = None
                else:
                    temp_df['close'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['Close', 'adjusted close']))])
                    temp_df['open'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['Open','adjusted open']))])
                    temp_df['high'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['High','adjusted high']))])
                    temp_df['low'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['Low','adjusted low']))])
                    temp_df['volume'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['Volume', 'adjusted volume']))])
                    temp_df['sprice'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['settlement price','adjusted settlement price']))])
                    temp_df['interest'] = self.get_preprocess_data(df_processed,df_info[(df_info.Category==Category) & (df_info.Ticker==label) & (df_info.Type.isin(['open interest','adjusted open interest']))])
                result = pd.concat([result, temp_df], ignore_index=True)
        
        # Réinitialiser l'index
        result = result.reset_index(drop=True)
        
        # Trier par date et id 
        result = result.sort_values(['date', 'id']).reset_index(drop=True)
        
        return result  
    
class FeatureEngineer:
    """Creation des fonctions """
    def __init__(self):
        pass

    def add_lag_features(self,
        df: pd.DataFrame, 
        lags: List[int], 
        date_col: str = 'date'
        ) -> pd.DataFrame:
        """
        Add lag features for specified columns and lags.
        """
        df = df.sort_values(date_col)
        cols = set(df.columns)
        cols.remove('id')
        cols.remove('date')
        for col in cols:
            for lag in lags:
                df[f'{col}_lag{lag}'] = df.groupby('id')[col].shift(lag)
        return df
    
    def add_rolling_features(self,
        df: pd.DataFrame,
        windows: List[int],
        date_col: str = 'date') -> pd.DataFrame:
        """ 
        Add rolling mean and std features for specified columns and windows.
        """
        df = df.sort_values(date_col)
        cols = set(df.columns)
        cols.remove('id')
        cols.remove('date')
        for col in cols:
            for window in windows:
                df[f'{col}_rollmean{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).mean())
                df[f'{col}_rollstd{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).std())
                df[f'{col}_rollmin{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).min())
                df[f'{col}_rollmax{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).max())
        return df
    
    def prepare_features(self, df: pd.DataFrame) ->pd.DataFrame:
        """Engineer features for training and testing data"""
        try:
            # add lag
            df_result = df.copy()
            df_result = self.add_lag_features(df_result, lags=[1, 2, 3, 5, 7])  
            # Add rolling features
            df_result = self.add_rolling_features(df_result, windows=[5, 10, 15])
            # Handle missing values
            df_result = df_result.fillna(method='ffill').fillna(method='bfill').fillna(0)

            return df_result
        except Exception as e:
            print(f"Feature preparation failed: {e}")
            raise

In [157]:
class FeatureTarget:
    """Class to handle target feature engineering"""
    def __init__(self):
        pass
    
    def prepare_targets(self, train_labels_df: pd.DataFrame) -> pd.DataFrame:
        """Prepare target information from pairs DataFrame.
        Args:
            pairs (pd.DataFrame): DataFrame containing 'pair' column.   
        Returns:
            pd.DataFrame: DataFrame with target information including price_1, price_2, and is_pair.
        """
        target_cols = [col for col in train_labels_df.columns if col not in ['timestamp', 'id']]
        target_values = train_labels_df[target_cols]
        return target_values, target_cols
    
    def prepare_targets_info(self, pairs: pd.DataFrame) -> pd.DataFrame:
        """Prepare target information from pairs DataFrame.
        Args:
            pairs (pd.DataFrame): DataFrame containing 'pair' column.
        Returns:
            pd.DataFrame: DataFrame with target information including price_1, price_2, and is_pair.
        """

        target_definitions = pairs["pair"].str.split(" - ", expand=True)
        target_info = pairs.copy()

        # Colonnes price_1 et price_2 (équivalent aux colonnes [,1] et [,2])
        target_info["price_1"] = target_definitions[0]
        target_info["price_2"] = target_definitions[1]

        # is.pair = second élément non vide
        target_info['is_pair'] = target_info['price_2'].apply(lambda x:x is not None)

        # Retirer la colonne "pair"
        target_info = target_info.drop(columns=["pair"])
        return target_info

In [158]:
class LSTMModel(nn.Module):
  """LSTM model for time series prediction"""
  def __init__(self,input_size,hidden_size=128,num_layers=2, dropout=0.2):
    super().__init__()

    # store parameters
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # RNN Layer (notation: LSTM \in RNN)
    self.lstm = nn.LSTM(
      input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
            )
    
    self.dropout = nn.Dropout(dropout)
    # linear layer for output
    self.fc = nn.Linear(hidden_size, 1)
    self.name = "LSTMModel"
    
  def forward(self,x):
    lstm_out, _ = self.lstm(x)
    last_output = lstm_out[:, -1, :]
    out = self.dropout(last_output)
    out = self.fc(out)
    return out
  
  def predict(self, dataloader, device=None, return_numpy=True):
    self.eval()
    if device is not None:
        self.to(device)

    preds = []

    with torch.no_grad():
        for X_batch, _ in dataloader:
            if device is not None:
                X_batch = X_batch.to(device)

            output = self.forward(X_batch)
            preds.append(output.cpu())

    preds = torch.cat(preds, dim=0)

    if return_numpy:
        return preds.numpy()
    return preds
  

class GRUModel(nn.Module):
    """GRU model for time series prediction"""
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.2):
        super(GRUModel, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.name = "GRUModel"
        
    def forward(self, x):
        gru_out, _ = self.gru(x)
        last_output = gru_out[:, -1, :]
        out = self.dropout(last_output)
        out = self.fc(out)
        return out
    
    def predict(self, dataloader, device=None, return_numpy=True):
        self.eval()
        if device is not None:
            self.to(device)

        preds = []

        with torch.no_grad():
            for X_batch, _ in dataloader:
                if device is not None:
                    X_batch = X_batch.to(device)

                output = self.forward(X_batch)
                preds.append(output.cpu())

        preds = torch.cat(preds, dim=0)

        if return_numpy:
            return preds.numpy()
        return preds
  
    

class TimeSeriesDataset(Dataset):
    """Custom Dataset for time series data"""
    def __init__(self, X, y, sequence_length=10):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        self.sequence_length = sequence_length
        
    def __len__(self):
        return len(self.X) - self.sequence_length
        
    def __getitem__(self, idx):
        return (
           
            self.X[idx:idx + self.sequence_length],
            self.y[idx + self.sequence_length]

        )
    
def prepare_data_train(X, y, sequence_length=10, batch_size=32):
    """Prepare data for training"""
    # Scale features

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Create dataset and dataloader
    dataset = TimeSeriesDataset(X_scaled, y, sequence_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    return dataloader, scaler

def prepare_test_data(X, y, scaler, sequence_length=10, batch_size=32):
    X_scaled = scaler.transform(X)
    display(X_scaled)
    dataset = TimeSeriesDataset(X_scaled, y, sequence_length)
    display(len(dataset))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    return dataloader

## Reprise de l'entrainement du modèle en ajoutant la prédiction 

In [159]:
def mae(y_pred, y_true):
    return torch.mean(torch.abs(y_pred - y_true))

def rmse(y_pred, y_true):
    return torch.sqrt(torch.mean((y_pred - y_true) ** 2))

def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0

    for X, y in dataloader:
        batch_X, batch_y = X.to(device), y.to(device)
        optimizer.zero_grad()

        y_pred = model(batch_X)
        loss = criterion(y_pred.squeeze(), batch_y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def evaluate(model, dataloader, optimizer, criterion):
    model.eval()
    total_loss = 0
    y_preds, y_trues = [], []
    
    with torch.no_grad():
        for X, y in dataloader:
            batch_X, batch_y = X.to(device), y.to(device)
            optimizer.zero_grad()

            y_pred = model(batch_X)
            total_loss += criterion(y_pred.squeeze(), batch_y).item()
            y_preds.append(y_pred)
            y_trues.append(y)

            
    y_preds = torch.cat(y_preds)
    y_trues = torch.cat(y_trues)

    return (
        total_loss / len(dataloader),
        mae(y_preds, y_trues).item(),
        rmse(y_preds, y_trues).item()
    )

class ParamTest:
    def __init__(self):
        self.hidden_size:int = 128
        self.num_layers:int = 2
        self.dropout:float = 0.2
        self.learning_rate:float=0.001
        self.batch_size:int = 32
        self.sequence_length:int = 10
        self.weight_decay:float = 0.2

def train_model(params:ParamTest, model_type, X_train, y_train, X_val, y_val ,input_size, max_epochs, patience  ):
# Prepare data
    train_loader, scaler = prepare_data_train(
                    X_train, y_train, 
                    params.sequence_length, params.batch_size
                )
    val_loader, _  = prepare_data_train(
                    X_val, y_val, 
                    params.sequence_length, params.batch_size
                )
    
    # Initialize model
   
    if model_type == 'LSTM':
        model = LSTMModel(input_size, params.hidden_size,params.num_layers, params.dropout)
    elif model_type == 'GRU':
        model = GRUModel(input_size, params.hidden_size,params.num_layers, params.dropout)
    else:
        raise ValueError("Invalid model type. Choose 'LSTM' or 'GRU'.")
    
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience/2, factor=0.5)
    best_val_loss = float('inf')
    patience_counter = 0

    train_losses = []
    val_losses = []
    val_maes = []
    val_rmses = []
    val_lrs = []
    # Training loop
    for epoch in range(max_epochs):
       
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        train_losses.append(train_loss)
        # Validation
        if val_loader is not None:

            val_loss, val_mae, val_rmse = evaluate(model, val_loader,optimizer,criterion )
            
            val_losses.append(val_loss)
            val_maes.append(val_mae)
            val_rmses.append(val_rmse)
            current_lr = optimizer.param_groups[0]["lr"]
            val_lrs.append(current_lr)
            # Learning rate scheduling
            scheduler.step(val_loss)
            
            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
            
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break
            
            
            if epoch % 2 == 0:
                print(
                    f"Epoch {epoch+1:02d} | "
                    f"LR: {current_lr:.1e} | "
                    f"Train Loss: {train_loss:.4f} | "
                    f"Val Loss: {val_loss:.4f} | "
                    f"MAE: {val_mae:.4f} | "
                    f"RMSE: {val_rmse:.4f}"
                )
                
        else:
            if epoch % 2 == 0:
                print(f"Epoch {epoch}: Train Loss: {train_loss:.6f}")    

    return model, scaler

# train target values
def convert_to_colname(colname:str)-> str:
    if colname is None:
        return None
    if colname.startswith("FX_"):
        return colname
    colname = colname.replace("_adj","")
    colname = colname.rsplit('_', 1)[0]
    return colname

def getnameprice(indexe_target:int):
    target_name = target_info.loc[indexe_target,'target']
    target_col1 = convert_to_colname(target_info.loc[indexe_target,'price_1'])
    target_col2 = convert_to_colname(target_info.loc[indexe_target,'price_2'])
    return  target_name, target_col1, target_col2

def save_model(model_name, name_target, model, scaler):
    model_dir = Path('../outputs') / 'models'
    model_dir.mkdir(exist_ok=True)
    model_path =  model_dir /  f"{name_target}_{model_name}"
    torch.save({
        "model_state_dict": model.state_dict(),
        "input_size": model.input_size,
        "hidden_size": model.hidden_size,
        "num_layers": model.num_layers
    }, f"{model_path}.ptk")

    from sklearn.preprocessing import StandardScaler
    assert isinstance(scaler, StandardScaler)
    joblib.dump(scaler, f"{model_path}_scaler.ptk")
    
    

def loadmodel(model_name, name_target, device=None):
    model_dir = Path('../outputs') / 'models'
    model_path =  model_dir /  f"{name_target}_{model_name}"
    checkpoint = torch.load(f"{model_path}.ptk", map_location=device)

    if model_name == "GRUModel":
        model = GRUModel(
        input_size=checkpoint["input_size"],
        hidden_size=checkpoint["hidden_size"],
        num_layers=checkpoint["num_layers"]
        )
    else:
        model = LSTMModel(
        input_size=checkpoint["input_size"],
        hidden_size=checkpoint["hidden_size"],
        num_layers=checkpoint["num_layers"]
    )
        
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    scaler = joblib.load(f"{model_path}_scaler.ptk")
    
    # Vérification explicite
    from sklearn.preprocessing import StandardScaler
    assert isinstance(scaler, StandardScaler), f"Scaler chargé = {type(scaler)}"

    if device is not None:
        model.to(device)

    return model, scaler
    
def train_features_build(target_col1,target_col2):
# Filter for the two targets
    if target_col2 is not None:
        df_target2 = train_df_feature[train_df_feature['id'] == target_col2]
        df_target1 = train_df_feature[train_df_feature['id'] == target_col1]
        
        # Concatenate the two DataFrames side by side (axis=1)
        result = df_target2.merge(
    df_target1,
    on='date',
    how='left',   # ou 'left'
    suffixes=('_t2', '_t1'))
        result = result.reset_index()
        result = result.drop(['id_t2', 'date_t2','id_t1', 'date_t1', 'index', 'date'], axis=1, errors='ignore')
        
        return result
    else:
    
        df_target1 = train_df_feature[train_df_feature['id'] == target_col1]
        # Concatenate the two DataFrames side by side (axis=1)
        result = df_target1
        result = result.reset_index()
        result = result.drop(['id', 'date', 'index'], axis=1, errors='ignore')
        
        return result
                
def my_train_models_for_target_save(name,
                p_target_values,
                target_col1:str,
                target_col2:str,
                model_type:str='LSTM',
                num_epochs:int=50,
                patience:int=30):
    
    y = p_target_values
    mask = ~y.isna()
    X_all = train_features_build(target_col1,target_col2)
    
    X_valid = X_all.loc[mask]
    y_valid = y[mask]

    # Simple train/validation split (last 20% for validation)
    split_idx = int(len(X_valid) * 0.8)
    X_train = X_valid[:split_idx]
    y_train = y_valid[:split_idx]
    X_val = X_valid[split_idx:]
    y_val = y_valid[split_idx:]

    params = ParamTest()
    model, scaler  = \
        train_model(params, model_type, X_train, y_train.values, X_val, y_val.values, 
                    X_train.shape[1],num_epochs, patience)
    save_model(model.name, name, model, scaler)

loaderdata = LoaderData()
featureTarget = FeatureTarget()
featureEngineer = FeatureEngineer()
preProcess = PreProcess()
train_df, test_df, train_labels_df, target_pairs_df = loaderdata.load_data()
train_df_process = preProcess.preprocess(train_df)
train_df_feature = featureEngineer.prepare_features(train_df_process)
target_info = featureTarget.prepare_targets_info(target_pairs_df)
target_values,target_cols = featureTarget.prepare_targets(train_labels_df)

In [160]:
for idx in range(2):
    tgt=target_cols[idx]
    y = target_values[tgt]
    name,pric1,pric2 =getnameprice(idx)
    my_train_models_for_target_save(name,y,pric1,pric2, model_type='GRU')
    my_train_models_for_target_save(name,y,pric1,pric2)

Epoch 01 | LR: 1.0e-03 | Train Loss: 0.0340 | Val Loss: 0.0134 | MAE: 0.1038 | RMSE: 0.1166
Epoch 03 | LR: 1.0e-03 | Train Loss: 0.0045 | Val Loss: 0.0023 | MAE: 0.0429 | RMSE: 0.0487
Epoch 05 | LR: 1.0e-03 | Train Loss: 0.0022 | Val Loss: 0.0030 | MAE: 0.0446 | RMSE: 0.0557
Epoch 07 | LR: 1.0e-03 | Train Loss: 0.0020 | Val Loss: 0.0011 | MAE: 0.0282 | RMSE: 0.0335
Epoch 09 | LR: 1.0e-03 | Train Loss: 0.0012 | Val Loss: 0.0003 | MAE: 0.0144 | RMSE: 0.0182
Epoch 11 | LR: 1.0e-03 | Train Loss: 0.0010 | Val Loss: 0.0003 | MAE: 0.0131 | RMSE: 0.0165
Epoch 13 | LR: 1.0e-03 | Train Loss: 0.0007 | Val Loss: 0.0002 | MAE: 0.0110 | RMSE: 0.0147
Epoch 15 | LR: 1.0e-03 | Train Loss: 0.0006 | Val Loss: 0.0002 | MAE: 0.0101 | RMSE: 0.0136
Epoch 17 | LR: 1.0e-03 | Train Loss: 0.0005 | Val Loss: 0.0002 | MAE: 0.0115 | RMSE: 0.0153
Epoch 19 | LR: 1.0e-03 | Train Loss: 0.0005 | Val Loss: 0.0002 | MAE: 0.0093 | RMSE: 0.0126
Epoch 21 | LR: 1.0e-03 | Train Loss: 0.0005 | Val Loss: 0.0001 | MAE: 0.0092 | R

In [216]:
for idx in range(len(target_cols)):
    tgt=target_cols[idx]
    y = target_values[tgt]
    name,pric1,pric2 =getnameprice(idx)
    my_train_models_for_target_save(name,y,pric1,pric2, model_type='GRU')
    my_train_models_for_target_save(name,y,pric1,pric2)

Epoch 01 | LR: 1.0e-03 | Train Loss: 0.0613 | Val Loss: 0.0341 | MAE: 0.1610 | RMSE: 0.1820
Epoch 03 | LR: 1.0e-03 | Train Loss: 0.0071 | Val Loss: 0.0043 | MAE: 0.0490 | RMSE: 0.0660
Epoch 05 | LR: 1.0e-03 | Train Loss: 0.0030 | Val Loss: 0.0006 | MAE: 0.0200 | RMSE: 0.0252
Epoch 07 | LR: 1.0e-03 | Train Loss: 0.0021 | Val Loss: 0.0006 | MAE: 0.0181 | RMSE: 0.0223
Epoch 09 | LR: 1.0e-03 | Train Loss: 0.0018 | Val Loss: 0.0003 | MAE: 0.0143 | RMSE: 0.0181
Epoch 11 | LR: 1.0e-03 | Train Loss: 0.0011 | Val Loss: 0.0002 | MAE: 0.0097 | RMSE: 0.0133
Epoch 13 | LR: 1.0e-03 | Train Loss: 0.0010 | Val Loss: 0.0002 | MAE: 0.0101 | RMSE: 0.0137
Epoch 15 | LR: 1.0e-03 | Train Loss: 0.0009 | Val Loss: 0.0004 | MAE: 0.0144 | RMSE: 0.0185
Epoch 17 | LR: 1.0e-03 | Train Loss: 0.0008 | Val Loss: 0.0002 | MAE: 0.0106 | RMSE: 0.0141
Epoch 19 | LR: 1.0e-03 | Train Loss: 0.0008 | Val Loss: 0.0002 | MAE: 0.0120 | RMSE: 0.0157
Epoch 21 | LR: 1.0e-03 | Train Loss: 0.0007 | Val Loss: 0.0003 | MAE: 0.0129 | R

In [173]:
competition_data_dir = Path("../data")
def generate_data_batches():
        test = pl.read_csv(competition_data_dir / 'test.csv')
        test_df = pd.read_csv(competition_data_dir / 'test.csv', index_col='date_id')
        preProcess = PreProcess()
        
        featureEngineer = FeatureEngineer()     
        test_df_process = preProcess.preprocess(test_df)
        test_df_feature = featureEngineer.prepare_features(test_df_process)
        label_lag_dir = competition_data_dir / 'lagged_test_labels'
        label_lags_1 = pl.read_csv(label_lag_dir / 'test_labels_lag_1.csv')
        label_lags_2 = pl.read_csv(label_lag_dir / 'test_labels_lag_2.csv')
        label_lags_3 = pl.read_csv(label_lag_dir / 'test_labels_lag_3.csv')
        label_lags_4 = pl.read_csv(label_lag_dir / 'test_labels_lag_4.csv')

        date_ids = test['date_id'].unique(maintain_order=True).to_list()
        for date_id in date_ids:
            test_batch = test_df_feature[test_df_feature['date'] == date_id]
            # test_batch = test.filter(pl.col('date_id') == date_id)
            label_lags_1_batch = label_lags_1.filter(pl.col('label_date_id') == date_id)
            label_lags_2_batch = label_lags_2.filter(pl.col('label_date_id') == date_id)
            label_lags_3_batch = label_lags_3.filter(pl.col('label_date_id') == date_id)
            label_lags_4_batch = label_lags_4.filter(pl.col('label_date_id') == date_id)

            yield (
                (test_batch, label_lags_1_batch, label_lags_2_batch, label_lags_3_batch, label_lags_4_batch),
                date_id,
            )
            


In [220]:
#  chargement des models
models_gru = {}
scalers_gru = {}
scalers_lstm = {}
models_lstm = {}
for idx in range(len(target_cols)):
     tgt=target_cols[idx]
     models_gru[tgt], scalers_gru[tgt] = loadmodel('GRUModel',tgt)
     models_lstm[tgt], scalers_lstm[tgt] = loadmodel('LSTMModel',tgt)


In [222]:
def test_features_build(p_test_df_feature,target_col1,target_col2):
# Filter for the two targets
    if target_col2 is not None:
        df_target2 = p_test_df_feature[p_test_df_feature['id'] == target_col2]
        df_target1 = p_test_df_feature[p_test_df_feature['id'] == target_col1]
        
        # Concatenate the two DataFrames side by side (axis=1)
        result = df_target2.merge(
    df_target1,
    on='date',
    how='left',   # ou 'left'
    suffixes=('_t2', '_t1'))
        result = result.reset_index()
        result = result.drop(['id_t2', 'date_t2','id_t1', 'date_t1', 'index', 'date'], axis=1, errors='ignore')
        
        return result
    else:
    
        df_target1 = p_test_df_feature[p_test_df_feature['id'] == target_col1]
        # Concatenate the two DataFrames side by side (axis=1)
        result = df_target1
        result = result.reset_index()
        result = result.drop(['id', 'date', 'index'], axis=1, errors='ignore')
        
        return result
    
def update_and_predict(x_new, model, scaler, device=None):
    """
    param x_new: nouvelle fonction
    param model : model de prédiction
    param scaler: 
    """

    # scaler
    X_scaled = scaler.transform(x_new)
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32).unsqueeze(0)
    
    if device is not None:
        X_tensor = X_tensor.to(device)
        model.to(device)

    model.eval()
    with torch.no_grad():
        pred = model(X_tensor)

    return pred.cpu().numpy()[0, 0]

sequence_buffer_gru={}
sequence_buffer_lstm={}

def update_buffer(key, x_new, buffer_dict, sequence_length):
    """
    key : identifiant de la série
    x_new : pd.DataFrame( (n_features,)
    buffer_dict : dict contenant les buffers
    """
  
    # si le key n'existe pas, on initialise le buffer avec des zéros
    if key not in buffer_dict:
        df_zero = pd.DataFrame(np.zeros((sequence_length, x_new.shape[1])),
                       columns=x_new.columns)
        buffer_dict[key] = df_zero

    # décaler le buffer
    buffer_dict[key] = buffer_dict[key].shift(-1)
    buffer_dict[key].iloc[-1] = x_new.values

    return buffer_dict[key]  # renvoie la séquence mise à jour


In [225]:
def predict(data_batch):
    print("\nGenerating predictions...")
    print("=" * 50)
    test_batch, label_lags_1_batch, label_lags_2_batch, label_lags_3_batch, label_lags_4_batch = data_batch

    # Target values by lag
    lag_1_values = label_lags_1_batch.drop(['date_id', 'label_date_id'])
    lag_2_values = label_lags_2_batch.drop(['date_id', 'label_date_id'])
    lag_3_values = label_lags_3_batch.drop(['date_id', 'label_date_id'])
    lag_4_values = label_lags_4_batch.drop(['date_id', 'label_date_id'])
  
    df_label_lags = pl.concat([lag_1_values, lag_2_values,lag_3_values, lag_4_values], how="horizontal")
    lag_values = df_label_lags.to_pandas()
    target_columns = lag_values.columns.tolist()

    # Generate ensemble predictions
    predictions = pd.DataFrame()
    sequence_length = 10

    # effectuer les prédictions
    index: int = 0
    for target_col in target_columns:
        ensemble_preds = []
        name,pric1,pric2 =getnameprice(index)
        

        X_test  = test_features_build(test_batch, pric1,pric2)

        # Cas du GRU  
        sequence = update_buffer(index, X_test, sequence_buffer_gru, sequence_length)
        pred = update_and_predict(sequence, models_gru[target_col],scalers_gru[target_col],device)
        ensemble_preds.append(pred)

        # Cas du LSTM  
        sequence = update_buffer(index, X_test, sequence_buffer_lstm, sequence_length)
        pred = update_and_predict(sequence, models_lstm[target_col],scalers_lstm[target_col],device)
        ensemble_preds.append(pred)

        index += 1

        if ensemble_preds:
            # Average the predictions
            final_pred = np.mean(ensemble_preds)
            predictions[target_col] = [final_pred]
            
        else:
            # Fallback to lag-based prediction
            lag_val = lag_values[target_col].iloc[0] if len(lag_values) > 0 else 0.0
            predictions[target_col] = [lag_val + np.random.normal(0, 0.001)]
        
    # Ensure all values are finite
    predictions = predictions.fillna(0.0)
    predictions = predictions.replace([np.inf, -np.inf], 0.0)
    
    return predictions, df_label_lags

In [226]:
# Calcul de score entre la prédiction et l'objectif

def score(prediction:pd.DataFrame, target:pd.DataFrame) -> float: 
     return np.mean(np.abs(target.values - prediction.values))


In [235]:
df_test_col = train_labels_df.copy().iloc[:0]
df_test_col["date_id"] = df_test_col.index
display(df_test_col.columns)


Index(['target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5',
       'target_6', 'target_7', 'target_8', 'target_9',
       ...
       'target_415', 'target_416', 'target_417', 'target_418', 'target_419',
       'target_420', 'target_421', 'target_422', 'target_423', 'date_id'],
      dtype='object', length=425)

In [236]:
counter_batch = generate_data_batches()
submissions = []
target = []
for data_batch, date_id in counter_batch:
    print (date_id)
    predictions, df_label_lags = predict(data_batch)
   
    print (df_label_lags.shape)
    predictions = predictions.copy()
    predictions["date_id"] = date_id
   
    predictions = predictions[df_test_col.columns]
    # predictions = predictions[["date_id", "target_0", "target_1"]]
    df_label_lags = df_label_lags.with_columns( pl.lit(date_id).alias("date_id"))
    submissions.append(predictions)
    target.append(df_label_lags.to_pandas())

target_df = pd.concat(target, ignore_index=True)
target_df.to_csv('../outputs/target.csv', index=False)

submission_df = pd.concat(submissions, ignore_index=True)
submission_df.to_csv('../outputs/submission.csv', index=False)

display(score(submission_df,target_df))


1827

Generating predictions...
(1, 424)
1828

Generating predictions...
(1, 424)
1829

Generating predictions...
(1, 424)
1830

Generating predictions...
(1, 424)
1831

Generating predictions...
(1, 424)
1832

Generating predictions...
(1, 424)
1833

Generating predictions...
(1, 424)
1834

Generating predictions...
(1, 424)
1835

Generating predictions...
(1, 424)
1836

Generating predictions...
(1, 424)
1837

Generating predictions...
(1, 424)
1838

Generating predictions...
(1, 424)
1839

Generating predictions...
(1, 424)
1840

Generating predictions...
(1, 424)
1841

Generating predictions...
(1, 424)
1842

Generating predictions...
(1, 424)
1843

Generating predictions...
(1, 424)
1844

Generating predictions...
(1, 424)
1845

Generating predictions...
(1, 424)
1846

Generating predictions...
(1, 424)
1847

Generating predictions...
(1, 424)
1848

Generating predictions...
(1, 424)
1849

Generating predictions...
(1, 424)
1850

Generating predictions...
(1, 424)
1851

Generating

np.float64(nan)

In [245]:

def score2(prediction:pd.DataFrame, target:pd.DataFrame) -> float:
    array_x = np.abs(target.values - prediction.values)
    array_x = array_x[~np.isnan(array_x)]
    return np.mean(array_x)



display("Score %.5f" % (score2(submission_df,target_df)))

'Score 0.02380'