In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
from torch.optim import RMSprop
from datetime import datetime, timedelta

In [165]:
def transformed_time(t,T):
    return 2*(t-T)/T+1

def transformed_inventory_action(q0,q,x):
    q_hat=q/q0-1
    x_hat=x/q0
    r=np.sqrt(q_hat**2+x_hat**2)
    if q_hat==0:
        theta=np.arctan(-np.inf)
    else:
        theta=np.arctan(-x_hat/q_hat)
        chi=-x_hat/q_hat
    if theta<=np.pi/4:
        radial_dist=r*np.sqrt((chi**2+1)*(2*np.cos(np.pi/4-theta)**2))
    else:
        radial_dist=r*np.sqrt((chi**(-2)+1)*(2*np.cos(theta-np.pi/4)**2))
    q_transform=-radial_dist*np.cos(theta)
    x_transform=radial_dist*np.sin(theta)
    return q_transform,x_transform

def transformed_price(midprice_series):
    """
    Computes the transformed price feature (P̃) from a time series of midprices.

    Parameters:
        midprice_series (pd.Series): Series indexed by timestamp (datetime), with midprice per second.

    Returns:
        pd.Series: Transformed price series (P̃), same index as input.
    """
    # Ensure the series is sorted by time
    midprice_series = midprice_series.sort_index()

    # Group by hour
    grouped = midprice_series.groupby(midprice_series.index.floor('H'))

    transformed_series = []

    for hour, group in grouped:
        # Subtract opening price of the hour
        opening_price = group.iloc[0]
        centered = group - opening_price

        # Estimate scale to fit mostly within [-1, 1]
        lower, upper = np.percentile(centered, [1, 99])  # clip only outliers
        if lower!=upper:
            # Affine transformation
            transformed = (2/(upper-lower))*(centered-upper)+1
        else:
            transformed =0*centered
        
        transformed_series.append(transformed)

    return pd.concat(transformed_series)

def QV(midprice_series):
    return np.sum(midprice_series.diff()**2)

def transformed_qv(qv,qv_mean,qv_std):
    return (qv-qv_mean)/(2*qv_std)


In [166]:
transformed_inventory_action(10,10,1)

ZeroDivisionError: float division by zero

Old

In [158]:
class TradingEnv:
    def __init__(self, start_date,T,N, delta_t,price_data, initial_inventory=500,a=0.01):
        """Time is expressed in second"""
        self.start_date=start_date
        self.T=T
        self.delta_t=delta_t #period between succesive trades in min 
        self.Tk_list = np.array([T/N*i for i in range(1,N)])
        self.Mk=T/N/delta_t
        self.price_data = price_data
        self.transormed_price=transformed_price(price_data)
        self.initial_inventory = initial_inventory

        self.current_period_index = 0
        self.inventory = initial_inventory
        self.time = self.Tk_list[0]
        self.done = False
        self.a=a

        self.state=self.get_state(self.time)

    def reset(self):
        self.current_period_index = 0
        self.inventory = self.initial_inventory
        self.time = self.Tk_list[0]
        self.done = False
        return self.get_state(self.time)

    def step(self, action):
            x_Tk = action
            # bornes de temps de la période courante
            t0 = self.start_date + timedelta(seconds=self.Tk_list[self.current_period_index])
            t1 = self.start_date + timedelta(seconds=self.Tk_list[self.current_period_index + 1])
            prices = self.price_data.loc[t0:t1]
            n = len(prices)

            # 1) reward de trade intra-période
            if n > 0:
                trade_reward = np.sum((self.inventory / n) * prices.diff())
            else:
                trade_reward = 0.0

            # 2) pénalité quadratique (on ne divise par Mk que si Mk>0)
            if self.Mk > 0:
                penalty = self.a * (x_Tk / self.Mk) ** 2
            else:
                penalty = 0.0

            reward = trade_reward - penalty

            # mise à jour de l'état
            self.inventory -= x_Tk
            self.current_period_index += 1
            if self.inventory <= 0 or self.current_period_index >= len(self.Tk_list) - 1:
                self.done = True

            next_t = self.Tk_list[min(self.current_period_index, len(self.Tk_list)-1)]
            next_state = self.get_state(next_t)
            return next_state, reward, self.done, {}
        
    def get_state(self, T_i):

        if self.current_period_index>0:
            mask=(self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index-1])<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index]))
        else:
            mask=(self.start_date<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index]))
            
        selected_times=self.price_data.index[mask]
        prices=self.price_data.loc[selected_times]
        qv=QV(prices)

        state = [
            T_i,
            self.inventory,
            self.price_data.loc[self.start_date+timedelta(seconds=T_i):].values[0],
            qv,
        ]
        return np.array(state, dtype=np.float32)

    

New

In [160]:
class TradingEnv:
    def __init__(self, start_date,T,N, delta_t,price_data, initial_inventory=500,a=1):
        """Time is expressed in second"""
        self.start_date=start_date
        self.T=T
        self.delta_t=delta_t #period between succesive trades in min 
        self.Tk_list = np.array([T/N*i for i in range(1,N)])
        self.Mk=T/N/delta_t
        self.price_data = price_data
        self.transormed_price=transformed_price(price_data)
        self.initial_inventory = initial_inventory

        self.current_period_index = 0
        self.inventory = initial_inventory
        self.time = self.Tk_list[0]
        self.done = False
        self.a=a

        self.state=self.get_state(self.time)

        _,self.qv_mean,self.qv_std=self.get_qv_mean_std()

    def reset(self):
        self.current_period_index = 0
        self.inventory = self.initial_inventory
        self.time = self.Tk_list[0]
        self.done = False
        return self.get_state(self.time)

    def step(self, action):
        """
        Applique l'action (ex : quantité à vendre), met à jour l'état, retourne:
        next_state, reward, done, info
        """
        #Quantiti of shares to sell
        x_Tk= action

        #Select prices of the period following the action [T_k,T_k+1[
        if self.current_period_index+1<len(self.Tk_list):

            mask=(self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index])<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index+1]))
        else:

            mask=(self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index])<=self.price_data.index)

        selected_times=self.price_data.index[mask]
        prices=self.price_data.loc[selected_times]
  
        reward =np.sum((self.inventory/prices.shape[0]) * prices.diff()-self.a*(x_Tk/self.Mk)**2)


        self.inventory -= x_Tk

        #Update current period
        if self.current_period_index+1<len(self.Tk_list):
            self.current_period_index += 1
            #An episode ends when all the initial inventory has been sold 
            if self.inventory <= 0:
                self.done = True
        else:
            self.done = True
            
        self.time = self.Tk_list[self.current_period_index]
        next_state = self.get_state(self.time)


        self.state=next_state
        return next_state, reward, self.done, {}
    
    def get_state(self, T_i):

        if self.current_period_index>0:
            mask=(self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index-1])<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index]))
        else:
            mask=(self.start_date<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[self.current_period_index]))
            
        selected_times=self.price_data.index[mask]
        prices=self.price_data.loc[selected_times]
        qv=QV(prices)

        state = [
            T_i,
            self.inventory,
            self.price_data.loc[self.start_date+timedelta(seconds=T_i):].values[0],
            qv,
        ]
        return np.array(state, dtype=np.float32)
    
    def get_transformed_state(self,state,action):

        time,q,price,qv=state

        time_tr=transformed_time(time,self.T)
        q_tr,_=transformed_inventory_action(self.initial_inventory,q,action)
        price_tr=self.transormed_price.loc[self.start_date+timedelta(seconds=int(time)):].values[0]
        qv_tr=transformed_qv(qv,self.qv_mean,self.qv_std)

        
        state = [
            time_tr,
            q_tr,
            price_tr,
            qv_tr,
        ]

        return state
    
    def get_qv_mean_std(self):

        list_qv=[]

        for current_period_index in range(len(self.Tk_list)):
            if current_period_index>0:
                mask=(self.start_date+timedelta(seconds=self.Tk_list[current_period_index-1])<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[current_period_index]))
            else:
                mask=(self.start_date<=self.price_data.index)&(self.price_data.index<self.start_date+timedelta(seconds=self.Tk_list[current_period_index]))
                
            selected_times=self.price_data.index[mask]
            prices=self.price_data.loc[selected_times]
            qv=QV(prices)
            list_qv.append(qv)

        array_qv=np.array(list_qv)
        return array_qv,np.mean(array_qv),np.std(array_qv)


In [78]:
class QNetwork(nn.Module):
    def __init__(self, input_dim):
        super(QNetwork, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.BatchNorm1d(20),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(20, 20),
            nn.BatchNorm1d(20),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(20, 20),
            nn.ReLU(),

            nn.Linear(20, 20),
            nn.ReLU(),

            nn.Linear(20, 20),
            nn.ReLU(),

            nn.Linear(20, 1)  # Q-value output
        )

    def forward(self, x):
        return self.net(x)


In [79]:
class QNetwork(nn.Module):
    def __init__(self, input_dim):
        super(QNetwork, self).__init__()
        
        # Architecture réorganisée selon QNet mais avec vos couches exactes
        self.layers = nn.ModuleList([
            # Couche d'entrée (identique à la vôtre)
            nn.Sequential(
                nn.Linear(input_dim, 20),
                nn.BatchNorm1d(20),
                nn.ReLU(),
                nn.Dropout(0.1)
            ),
            
            # Couches cachées (comme dans votre réseau original)
            nn.Sequential(
                nn.Linear(20, 20),
                nn.BatchNorm1d(20),
                nn.ReLU(),
                nn.Dropout(0.1)
            ),
            
            nn.Sequential(
                nn.Linear(20, 20),
                nn.ReLU()
            ),
            
            nn.Sequential(
                nn.Linear(20, 20),
                nn.ReLU()
            ),
            
            nn.Sequential(
                nn.Linear(20, 20),
                nn.ReLU()
            ),
            
            # Couche de sortie (identique à la vôtre)
            nn.Linear(20, 1)
        ])

    def forward(self, x):
        # Gestion des dimensions pour BatchNorm
        if x.dim() == 1:
            x = x.unsqueeze(0)
            
        # Forward pass à travers toutes les couches
        for layer in self.layers:
            x = layer(x)
            
        return x.squeeze(-1) 

In [153]:
class TradingAgentRL:
    def __init__(self, env,state_dim, epsilon=0.1, tau=0.995, gamma=0.99, batch_size=5, memory_capacity=100, update_target_freq=10, lr=1e-3):
        self.env=env
        self.state_dim = state_dim
        self.epsilon = epsilon
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_freq = update_target_freq

        self.memory = deque(maxlen=memory_capacity)
        self.Q_main = QNetwork(state_dim)
        self.Q_target = QNetwork(state_dim)
        self.Q_target.load_state_dict(self.Q_main.state_dict())
        self.optimizer = RMSprop(self.Q_main.parameters(), lr)
        self.loss_fn = nn.MSELoss()

        self.iteration = 0

    def choose_action(self, state):
        q_i = state[1] 
        T_i=state[0]

        ## I we have reached terminal period [TN-1,T], we sell all the inventory 
        if T_i>=self.env.Tk_list[-1]:
            action=q_i
        else:
            if np.random.rand() < self.epsilon:
                action = np.random.binomial(q_i,1/(self.env.T-T_i))
            else:
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).repeat(int(q_i)+1,1)
                    actions_tensor = torch.arange(0, int(q_i) + 1).float()

                    ## Normalize inputs before using Q-network
                    transformed_states_tensor = [
                        torch.tensor(self.env.get_transformed_state(state, action), dtype=torch.float32)
                        for state, action in zip(state_tensor, actions_tensor)
                    ]
                    transformed_states_tensor=torch.stack(transformed_states_tensor)
                    inputs = torch.cat([transformed_states_tensor, actions_tensor.unsqueeze(1)], dim=1)
                    #inputs = torch.cat([state_tensor, actions_tensor.unsqueeze(1)], dim=1)
                    print(inputs)
                    q_values = self.Q_main(inputs).squeeze()
                    action = torch.argmax(q_values).item()

        return action
    

    def store_transition(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def train_step(self):

        # We can update Q only if we have seen enough experience (state,action,reward,next_state)
        if len(self.memory) < self.batch_size:
            return

        # We sample batch_size trasnitions from memory
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states = zip(*minibatch)

        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        # Target computation
        with torch.no_grad():

            next_q_values=[]
            for j in range(self.batch_size):

                q_range = int(next_states[j][1])  # inventory
                ns_batch = next_states[j].unsqueeze(0).repeat(q_range + 1, 1)
                actions_batch = torch.arange(0, q_range + 1).float()
                
                ### Normalize inputs before using Q-network
                transformed_ns_batch = [
                    torch.tensor(self.env.get_transformed_state(state, action), dtype=torch.float32)
                    for state, action in zip(ns_batch, actions_batch.unsqueeze(1))
                ]
                transformed_ns_batch=torch.stack(transformed_ns_batch)
                inputs = torch.cat([transformed_ns_batch, actions_batch.unsqueeze(1)], dim=1)
                #inputs = torch.cat([ns_batch, actions_batch.unsqueeze(1)], dim=1)

                if next_states[j][0]==self.env.T:
                    next_q_value = 0
                elif next_states[j][0]==self.env.Tk_list[-1]:
                    #R(s,q)=q(p′ −p)−aq2,
                    q=states[j][1]
                    T=self.env.start_date+timedelta(seconds=self.env.T)
                    delta=self.env.delta_t
                    prices=self.env.price_data 
                    p=prices.loc[T:].values[0]
                    p_prim=prices.loc[T+timedelta(seconds=delta):].values[0]
                    next_q_value = q*(p_prim-p)-self.env.a*q**2
                else:
                    

                    #Compute Q-values with main network
                    self.Q_main.eval()
                    ### Normalize inputs before using Q-network
                    print(inputs)
                    q_values = self.Q_main(inputs).squeeze()
                    ns=torch.tensor(next_states[j])
                    # Select action with highest Q-value
                    action = torch.tensor([torch.argmax(q_values).item()])

                    #Compute future Q-value with target network
                    transformed_ns = torch.tensor(self.env.get_transformed_state(ns, action), dtype=torch.float32)
                    input_target = torch.cat([transformed_ns, action]) 
                    #input_target = torch.cat([ns, action])  # shape: [5]
                    self.Q_target.eval()
                    next_q_value = self.Q_target(input_target.unsqueeze(0)).item()

                next_q_values.append(next_q_value)        
            
            next_q_values = torch.FloatTensor(next_q_values)


        targets=rewards+self.gamma*next_q_values

        ### Normalize inputs before using Q-network
        transformed_states = [
            torch.tensor(self.env.get_transformed_state(state, action), dtype=torch.float32)
            for state, action in zip(states, actions.unsqueeze(1))
        ]
        transformed_states=torch.stack(transformed_states)
        inputs = torch.cat([transformed_states ,actions.unsqueeze(1)], dim=1)
        q_preds = self.Q_main(inputs)
        
        loss = self.loss_fn(q_preds.squeeze(), targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.iteration += 1

    def train(self, num_episodes=100, N=100, update_target_every=10, tau=0.995):
        """
        Entraîne l'agent sur l'environnement `env` pendant `num_episodes` épisodes.
        À chaque épisode, effectue N pas d'interaction.
        """
        last_date=self.env.price_data.index[-1]
        dates=self.env.price_data.loc[:last_date-timedelta(seconds=self.env.T)].index
        for episode in range(num_episodes):
            # One episode is defined as an execution period of lenght T, chosen randomly in the dataset (2018 to 2023)
            self.env.start_date=random.choice(dates)
            # State is reset at the beginning of each episode
            state = self.env.reset()  
            for i in range(N-1): # N is the number of period T0<T1..<TN-1 such that an action is taken at each T_i
                
                #choose action according to epsilon-greedy policy
                action =self.choose_action(state)
                # Update current state of the environment 
                next_state, reward, done, _ = self.env.step(action)
                
                # Save transition for experience replay
                self.store_transition(state, action, reward, next_state)

                state=next_state

                # Update Q with experience replay
                self.train_step()

                if done:
                    break

            # Mise à jour du réseau cible
            if episode % update_target_every == 0:
                self.Q_target.load_state_dict(self.Q_main.state_dict())

            # Décroissance de ε
            self.epsilon = max(self.epsilon * tau, 0.01)

            print(f"Episode {episode+1}/{num_episodes} terminé, ε = {self.epsilon:.4f}")

In [95]:
class TradingAgentRL:
    def __init__(self, env,state_dim, epsilon=0.1, tau=0.95, gamma=0.99, batch_size=5, memory_capacity=100, update_target_freq=10, lr=1e-3):
        self.env=env
        self.state_dim = state_dim
        self.epsilon = epsilon
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_freq = update_target_freq

        self.memory = deque(maxlen=memory_capacity)
        self.q_values_memory=[]
        self.Q_main = QNetwork(self.state_dim)
        self.Q_target = QNetwork(self.state_dim)
        self.Q_target.load_state_dict(self.Q_main.state_dict())
        self.optimizer = RMSprop(self.Q_main.parameters(), lr)
        self.loss_fn = nn.MSELoss()

        self.iteration = 0
        
    
    def reset(self):
          
        self.memory = deque(maxlen=self.memory.maxlen)
        self.q_values_memory=[]
        self.Q_main = QNetwork(self.state_dim)
        self.Q_target = QNetwork(self.state_dim)
        self.Q_target.load_state_dict(self.Q_main.state_dict())
        self.iteration = 0

    def choose_action(self, state):
        q_i = state[1] 
        T_i=state[0]

        ## I we have reached terminal period [TN-1,T], we sell all the inventory 
        if T_i>=self.env.Tk_list[-1]:
            action=q_i
        else:
            if np.random.rand() < self.epsilon:
                action = np.random.binomial(q_i,1/(self.env.T-T_i))
            else:
                with torch.no_grad():
                    state_tensor = torch.FloatTensor(state).unsqueeze(0).repeat(int(q_i)+1,1)
                    actions_tensor = torch.arange(0, int(q_i) + 1).float()
                    inputs = torch.cat([state_tensor, actions_tensor.unsqueeze(1)], dim=1)
                    q_values = self.Q_main(inputs).squeeze()
                    self.q_values_memory.append((self.env.current_period_index,state,q_values))
                    action = torch.argmax(q_values).item()

        return action
    

    def store_transition(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def train_step(self):

        # We can update Q only if we have seen enough experience (state,action,reward,next_state)
        if len(self.memory) < self.batch_size:
            return

        # We sample batch_size trasnitions from memory
        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states = zip(*minibatch)

        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)

        # Target computation
        with torch.no_grad():

            next_q_values=[]
            for j in range(self.batch_size):
                
                q_range = int(next_states[j][1])  # inventory
                ns_batch = next_states[j].unsqueeze(0).repeat(q_range + 1, 1)
                actions_batch = torch.arange(0, q_range + 1).float()
                inputs = torch.cat([ns_batch, actions_batch.unsqueeze(1)], dim=1)

                if next_states[j][0]==self.env.T:
                    next_q_value = 0
                elif next_states[j][0]==self.env.Tk_list[-1]:
                    #R(s,q)=q(p′ −p)−aq2,
                    q=states[j][1]
                    T=self.env.start_date+timedelta(seconds=self.env.T)
                    delta=self.env.delta_t
                    prices=self.env.price_data 
                    p=prices.loc[T:].values[0]
                    p_prim=prices.loc[T+timedelta(seconds=delta):].values[0]
                    next_q_value = q*(p_prim-p)-self.env.a*q**2#f(next_states[j],states[j][1])
                else:
                    

                    #Compute Q-values with main network
                    self.Q_main.eval()
                    q_values = self.Q_main(inputs).squeeze()
                    ns=torch.tensor(next_states[j])
                    # Select action with highest Q-value
                    action = torch.tensor([torch.argmax(q_values).item()])
                    #Compute future Q-value with target network
                    input_target = torch.cat([ns, action])  # shape: [5]
                    self.Q_target.eval()
                    next_q_value = self.Q_target(input_target.unsqueeze(0)).item()

                next_q_values.append(next_q_value)        
            
            next_q_values = torch.FloatTensor(next_q_values)


        targets=rewards+self.gamma*next_q_values
        inputs = torch.cat([states ,actions.unsqueeze(1)], dim=1)
        q_preds = self.Q_main(inputs)
        loss = self.loss_fn(q_preds.squeeze(), targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.iteration += 1

    def train(self, num_episodes=100, N=100, update_target_every=10, tau=0.995):
        """
        Entraîne l'agent sur l'environnement `env` pendant `num_episodes` épisodes.
        À chaque épisode, effectue N pas d'interaction.
        """
        last_date=self.env.price_data.index[-1]
        dates=self.env.price_data.loc[:last_date-timedelta(seconds=self.env.T)].index
        for episode in range(num_episodes):
            # One episode is defined as an execution period of lenght T, chosen randomly in the dataset (2018 to 2023)
            self.env.start_date=random.choice(dates)
            # State is reset at the beginning of each episode
            state = self.env.reset()  
            for i in range(N): # N is the number of period T0<T1..<TN-1 such that an action is taken at each T_i
                print(i,self.env.inventory)
                #choose action according to epsilon-greedy policy
                action =self.choose_action(state)

                # Update current state of the environment 
                next_state, reward, done, _ = self.env.step(action)
                
                # Save transition for experience replay
                self.store_transition(state, action, reward, next_state)

                state=next_state

                # Update Q with experience replay
                self.train_step()

                if done:
                    break

            # Mise à jour du réseau cible
            if episode % update_target_every == 0:
                self.Q_target.load_state_dict(self.Q_main.state_dict())

            # Décroissance de ε
            self.epsilon = max(self.epsilon * tau, 0.01)

            print(f"Episode {episode+1}/{num_episodes} terminé, ε = {self.epsilon:.4f}")



In [10]:
data=pd.read_csv("Data/BTC_ETH_15mn.csv")
data.Date=data.Date.apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
data=data.set_index("Date").fillna(method="ffill")

In [11]:
data

Unnamed: 0_level_0,Price,Volume,Price (ETH),Volume (ETH)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-24 00:00:00,6476.540000,3.708780e+09,204.327000,1232950000
2018-10-24 00:15:00,6483.150000,3.709670e+09,204.378000,1236570000
2018-10-24 00:30:00,6474.200000,3.677130e+09,204.439000,1239840000
2018-10-24 00:45:00,6486.190000,3.661460e+09,204.441000,1237220000
2018-10-24 01:00:00,6480.084479,3.641914e+09,204.070635,1227899090
...,...,...,...,...
2023-11-27 08:45:00,37346.223298,1.628826e+10,2046.790071,9322101316
2023-11-27 09:00:00,37401.430405,1.635176e+10,2049.064393,9328586333
2023-11-27 09:15:00,37405.996546,1.648487e+10,2049.523629,9353842431
2023-11-27 09:30:00,37432.627567,1.656480e+10,2051.479080,9362712712


In [108]:
agent.env.inventory

0

In [109]:
def pnl_agent(price_series, start_date, actions, delta_t, a, initial_inventory):
    """
    Eq (3.1)-(3.2) :
      R_{k,i} = q_{t_{k,i}}*(p_{i+1}-p_i) - a*(x_k/M_k)^2
      P&L = sum_{k,i} R_{k,i}  (avec q_{t_{k,0}} = q0 au début de chaque k)
    """
    pnl = 0.0
    q = initial_inventory
    p0 = price_series.loc[start_date]
    pnl -= q * p0  # terme -q0*p0 parfois inclus dans la forme développée (3.4)

    for Tk, xk in actions:
        # récupérer les prix de Tk à Tk+1
        block = price_series.loc[Tk : Tk + timedelta(seconds=delta_t)]
        prices = block.values
        M = len(prices) - 1
        if M <= 0:
            continue

        v = xk / M  # quantité par sub-tick
        for i in range(M):
            dp = prices[i+1] - prices[i]
            pnl += q * dp            # q_{t_{k,i}} * (p_{i+1}-p_i)
            pnl -= a * (v ** 2)      # pénalité quadratique
            q  -= v                  # on diminue q au fil des sous-ticks

    return pnl

In [110]:
def pnl_twap(price_series, start_date, Q0, Tk_list, delta_t, a):
    """
    Eq (7.1): x_k = Q0/N
    Puis on appelle pnl_agent avec ce plan d'actions.
    """
    N = len(Tk_list) - 1
    x_per = Q0 / N
    actions = [
        (start_date + timedelta(seconds=int(Tk_list[k])), x_per)
        for k in range(N)
    ]
    return pnl_agent(price_series, start_date, actions, delta_t, a, Q0)



In [112]:
def evaluate(agent, start_dates):
    """
    Pure greedy evaluation (ε=0) : compare P&L_RL vs P&L_TWAP en bps.
    Renvoie mean, median, glr, prob.
    """
    orig_eps = agent.epsilon
    agent.epsilon = 0.0

    deltas = []
    for sd in start_dates:
        # tronquer la série à [sd, sd+T]
        series = agent.env.price_data.loc[
            sd : sd + timedelta(seconds=agent.env.T)
        ]

        # reset env
        agent.env.start_date = sd
        state = agent.env.reset()

        # collecter actions greedy
        actions = []
        for Tk in agent.env.Tk_list[:-1]:
            x = agent.choose_action(state)
            t_dt = sd + timedelta(seconds=int(Tk))
            actions.append((t_dt, x))
            state, _, done, _ = agent.env.step(x)
            if done: break

        # calcul des P&L
        pnl_rl  = pnl_agent(
            series, sd, actions,
            agent.env.delta_t,
            agent.env.a,
            agent.env.initial_inventory
        )
        pnl_ref = pnl_twap(
            series, sd,
            agent.env.initial_inventory,
            agent.env.Tk_list,
            agent.env.delta_t,
            agent.env.a
        )

        if pnl_ref != 0:
            deltas.append(1e4 * (pnl_rl - pnl_ref) / pnl_ref)

    agent.epsilon = orig_eps

    arr = np.array(deltas, dtype=float)
    return {
        "mean"  : round(arr.mean(),    3),
        "median": round(np.median(arr),3),
        "glr"   : round(arr[arr>0].mean() / (-arr[arr<0].mean() + 1e-8), 3),
        "prob"  : round(float((arr>0).mean() * 100), 3),
    }

In [161]:
env=TradingEnv(data.Price.index[0],15*500*60,100,15*60,data.Price,initial_inventory=100)
agent=TradingAgentRL(env,5)
#agent.reset()
agent.train( num_episodes=1, N=70, update_target_every=10, tau=0.995)

tensor([[ -0.9600,      nan,  -0.1829,   1.3465,   0.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   1.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   2.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   3.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   4.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   5.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   6.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   7.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   8.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,   9.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  10.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  11.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  12.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  13.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  14.0000],
        [ -0.9600,     -inf,  -0.1829,   1.3465,  15.0000],
        [ -0.9600,     -inf,  -0.1829,  

  ns=torch.tensor(next_states[j])


tensor([[ -0.9200,      nan,   0.4027,  27.0012,   0.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   1.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   2.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   3.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   4.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   5.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   6.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   7.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   8.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,   9.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  10.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  11.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  12.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  13.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  14.0000],
        [ -0.9200,     -inf,   0.4027,  27.0012,  15.0000],
        [ -0.9200,     -inf,   0.4027,  

In [152]:
agent.q_values_memory[-1]

(5,
 array([2.700000e+04, 3.000000e+00, 8.811591e+03, 9.911086e+02],
       dtype=float32),
 tensor([-138.7704, -138.7676, -138.7649, -138.7622]))

In [139]:
min_time = data.index.min()
max_time = data.index.max() - timedelta(seconds=env.T)
# horaire plein toutes les T secondes
test_dates = pd.date_range(min_time, max_time, freq=f"{env.T}s")
stats = evaluate(agent, test_dates)
print(stats)

{'mean': 4.679, 'median': -1.134, 'glr': 1.082, 'prob': 49.86}


In [142]:
results={"pnl":[],"q_values_memory":[],"inventory":[],"update_rate":[]}
inventories=[10,50,100,200,500,1000,10000]
update_rates=[1,5,10,50]

In [144]:
for q in inventories:
    for update_rate in update_rates:
        env=TradingEnv(data.Price.index[0],15*500*60,100,15*60,data.Price,initial_inventory=q)
        agent=TradingAgentRL(env,5)
        agent.train( num_episodes=20, N=70, update_target_every=10, tau=0.995)
        stats = evaluate(agent, test_dates)
        results["inventory"].append(q)
        results["update_rate"].append(update_rate)
        results["pnl"].append(stats)
        results["q_values_memory"].append(agent.q_values_memory)

0 10
1 10
2 9
3 8
4 8
5 7
Episode 1/20 terminé, ε = 0.0995
0 10
Episode 2/20 terminé, ε = 0.0990
0 10
Episode 3/20 terminé, ε = 0.0985
0 10
1 10
2 10
Episode 4/20 terminé, ε = 0.0980
0 10
Episode 5/20 terminé, ε = 0.0975
0 10
Episode 6/20 terminé, ε = 0.0970
0 10
1 10
Episode 7/20 terminé, ε = 0.0966
0 10
Episode 8/20 terminé, ε = 0.0961
0 10
Episode 9/20 terminé, ε = 0.0956
0 10
Episode 10/20 terminé, ε = 0.0951
0 10
1 10
Episode 11/20 terminé, ε = 0.0946
0 10
1 10
Episode 12/20 terminé, ε = 0.0942
0 10
Episode 13/20 terminé, ε = 0.0937
0 10
Episode 14/20 terminé, ε = 0.0932
0 10
Episode 15/20 terminé, ε = 0.0928
0 10
Episode 16/20 terminé, ε = 0.0923
0 10
Episode 17/20 terminé, ε = 0.0918
0 10
Episode 18/20 terminé, ε = 0.0914
0 10
Episode 19/20 terminé, ε = 0.0909
0 10
1 10
Episode 20/20 terminé, ε = 0.0905


  ns=torch.tensor(next_states[j])


0 10
1 7
2 3
3 3
4 1
5 1
Episode 1/20 terminé, ε = 0.0995
0 10
1 10
2 10
3 10
Episode 2/20 terminé, ε = 0.0990
0 10
1 10
2 10
3 10
4 10
5 10
6 10
Episode 3/20 terminé, ε = 0.0985
0 10
Episode 4/20 terminé, ε = 0.0980
0 10
1 10
2 10
3 10
4 10
5 10
Episode 5/20 terminé, ε = 0.0975
0 10
1 10
Episode 6/20 terminé, ε = 0.0970
0 10
1 10
2 10
3 10
Episode 7/20 terminé, ε = 0.0966
0 10
1 10
2 10
3 10
4 10
5 10
Episode 8/20 terminé, ε = 0.0961
0 10
1 10
2 10
3 10
4 10
Episode 9/20 terminé, ε = 0.0956
0 10
1 10
2 10
3 10
4 10
5 10
6 10
Episode 10/20 terminé, ε = 0.0951
0 10
1 10
2 10
3 10
Episode 11/20 terminé, ε = 0.0946
0 10
1 10
Episode 12/20 terminé, ε = 0.0942
0 10
Episode 13/20 terminé, ε = 0.0937
0 10
1 10
2 10
3 10
4 10
5 10
6 10
7 10
8 10
Episode 14/20 terminé, ε = 0.0932
0 10
1 10
2 10
3 10
4 10
5 10
6 10
7 10
8 10
9 10
Episode 15/20 terminé, ε = 0.0928
0 10
1 10
2 10


  ns=torch.tensor(next_states[j])


3 10
4 10
5 10
6 10
7 10
8 10
Episode 16/20 terminé, ε = 0.0923
0 10
1 10
2 10
Episode 17/20 terminé, ε = 0.0918
0 10
Episode 18/20 terminé, ε = 0.0914
0 10
1 10
2 10
3 10
4 10
5 10
6 10
Episode 19/20 terminé, ε = 0.0909
0 10
1 10
2 10
3 10
4 10
5 10
6 10
7 10
8 10
9 10
10 10
Episode 20/20 terminé, ε = 0.0905


KeyboardInterrupt: 

In [116]:
agent.q_values_memory

[(0,
  array([ 4500.     ,   100.     , 16530.756  ,   190.53699], dtype=float32),
  tensor([-0.0574, -0.0257, -0.0256, -0.0046,  0.0084, -0.0214, -0.0048,  0.0115,
          -0.0346, -0.0038, -0.0604, -0.0069, -0.0154, -0.0283,  0.0162, -0.0447,
          -0.0046, -0.0234,  0.0309, -0.0116, -0.0280,  0.0165, -0.0114,  0.0245,
           0.0135, -0.0316, -0.0139, -0.0057, -0.0103,  0.0150, -0.0111, -0.0037,
           0.0191,  0.0036, -0.0005,  0.0029,  0.0050, -0.0134, -0.0025, -0.0051,
          -0.0093, -0.0026, -0.0034, -0.0078, -0.0072, -0.0045, -0.0144, -0.0051,
          -0.0083, -0.0069, -0.0053, -0.0049, -0.0009, -0.0072, -0.0068, -0.0043,
          -0.0044, -0.0051, -0.0045, -0.0053, -0.0118, -0.0031, -0.0054, -0.0060,
          -0.0088, -0.0180, -0.0138, -0.0123, -0.0047, -0.0099, -0.0078, -0.0159,
          -0.0176, -0.0082,  0.0017, -0.0145, -0.0016, -0.0079,  0.0043,  0.0043,
          -0.0075, -0.0083, -0.0058, -0.0249, -0.0061, -0.0217, -0.0159,  0.0033,
          -0.01