In [90]:
# For Data
import numpy as np
import pandas as pd

# For Environment
import gymnasium as gym
from gymnasium import spaces

# For RL
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Helpers
import random

### Importing and Preprocessing data

In [91]:
# Read Data
data = pd.read_csv('../dataset/raw_data.csv')

# Removing Duplicates
data.drop_duplicates(inplace=True)

# Sorting
data.sort_values(by=['period', 'sn'], inplace=True, ascending=False)
data.sort_values(by=['project_id'], inplace=True, ascending=False)

# Backup
data.to_csv('../dataset/data.csv', index=False)

data.head()

Unnamed: 0,period,sn,price,project_id,begintime,target_number,target_colour
0,20231205,164,60396,3,1701743940,2,0
24297,20231205,164,60392,3,1701743940,2,0
20,20231205,164,44817,2,1701743940,4,0
72731,20231205,164,44814,2,1701743940,4,0
10,20231205,164,30079,1,1701743940,1,1


In [92]:
# Spiltting period into year, month, day
data['year'] = data['period'].apply(lambda x: int(str(x)[:4])-1)
data['month'] = data['period'].apply(lambda x: int(str(x)[4:6])-1)
data['day'] = data['period'].apply(lambda x: int(str(x)[6:])-1)

# Dropping period
data.drop('period', axis=1, inplace=True)

data.head()

Unnamed: 0,sn,price,project_id,begintime,target_number,target_colour,year,month,day
0,164,60396,3,1701743940,2,0,2022,11,4
24297,164,60392,3,1701743940,2,0,2022,11,4
20,164,44817,2,1701743940,4,0,2022,11,4
72731,164,44814,2,1701743940,4,0,2022,11,4
10,164,30079,1,1701743940,1,1,2022,11,4


In [93]:
print(f"{data['year'].unique()}, {data['month'].unique()}, {data['day'].unique()}, {data['project_id'].unique()}, {data['target_colour'].unique()}, {data['target_number'].unique()}")

[2022], [11 10  9], [ 4  3  2  1  0 29 28 27 26 25 24 23 22 21 18 17 16 15 14 13 12 11 10  9
  8  7  6  5 30 20 19], [3 2 1 0], [0 1], [2 4 1 3 0 6 9 7 8 5]


### Creating a Environment

In [94]:
TRADES_PER_EPISODE = 100

In [135]:
def get_states():
    """
    Returns n continueos states for the environment.
        
    Returns:
        states (list): List of n continueos states
    """
    states = []
    
    # Get a random sample
    first_val = data.sample(1)
    
    # find the index of the first_val in the data
    index = first_val.index[0]
    
    # Append the first_val to states
    states.append(first_val)
    
    for state in range(TRADES_PER_EPISODE-1):
        # Append the next_val to states
        index += 1
        states.append(data.iloc[index])
        
    return states

In [136]:
get_states()

[       sn  price  project_id   begintime  target_number  target_colour  year  \
 44549  75  59768           3  1697926320              8              0  2022   
 
        month  day  
 44549      9   21  ,
 sn                       68
 price                 15031
 project_id                0
 begintime        1699566660
 target_number             1
 target_colour             1
 year                   2022
 month                    10
 day                       9
 Name: 11217, dtype: int64,
 sn                       67
 price                 60212
 project_id                3
 begintime        1699566480
 target_number             2
 target_colour             0
 year                   2022
 month                    10
 day                       9
 Name: 35435, dtype: int64,
 sn                       67
 price                 45004
 project_id                2
 begintime        1699566480
 target_number             4
 target_colour             0
 year                   2022
 month      

In [96]:
# Site Environment
class SiteEnv(gym.Env):
    """
    A custom environment for trading sites based on colour and number.
    
    Attributes:
        trade_count (int): The current trade index in the state dataframe.
        action_space (gym.spaces.MultiDiscrete): The action space, consisting of two discrete values: colour and number.
        state (pandas.DataFrame): The state dataframe, containing the year, month, day, sn, project_id, target_colour, and target_number for each trade.
        observation_space (gym.spaces.Dict): The observation space, consisting of a dictionary of 5 discrete values: year, month, day, sn ( serial number ) and project_id.
        length (int): The maximum number of trades per episode.
    """
    def __init__(self, states):
        """
        Initialize the environment with the given state dataframe.
        
        Args:
            states (pandas.DataFrame): The state year, month, day, sn, project_id, target_colour, and target_number for each trade.
        """
        self.trade_count = 0
        
        # To punish the agent for repeating the same action
        self.last_three_actions = []
        
        # To reward the agent for 3 consecutive correct actions
        self.last_three_rewards = []
        
        # Define the action space as a multi-discrete space of two values: colour and number
        # colour = 0: red, 1: green
        # number = 0 - 9
        self.action_space = spaces.MultiDiscrete([2, 10])
        
        # Store the state dataframe as an attribute
        self.state = states
        
        # Define the observation space as a dictionary of three discrete values: period, count_of_the_day, and project_id
        # year = 2023
        # month = 1 - 12
        # day = 1 - 31
        # sn = 0 - 999
        # project_id = 0: Parity, 1: Sapre, 2: Bcone, 3: Emerd
        self.observation_space = spaces.Dict({
            "year": spaces.Discrete(2023),
            "month": spaces.Discrete(12),
            "day": spaces.Discrete(31),
            "sn": spaces.Discrete(999),
            "project_id": spaces.Discrete(4)
        })
        
        # Define the length as the number of trades per episode
        self.length = TRADES_PER_EPISODE
    
    def step(self, action):
        """
        Run one timestep of the environment's dynamics.
        
        Args:
            action (tuple of int): An action provided by the agent, consisting of two values: colour and number.
            
        Returns:
            observation (dict of int): The next observation, consisting of a dictionary of three values: period, count_of_the_day, and project_id.
            reward (float): The amount of reward returned as a result of taking the action.
            terminated (bool): Whether the episode has ended, either because the trade count has reached the length, or the action is invalid.
            truncated (bool): Whether the episode was truncated. ( For now no truncation )
            info (dict): An empty dictionary, for compatibility with the gym interface.
        """
        # Check if the action is valid
        if not self.action_space.contains(action):
            # Raise an exception if the action is invalid
            raise ValueError(f"Invalid action: {action}")
        
        # Get the current state from the state dataframe
        state = self.state.iloc[self.trade_count]
        
        
        # Calculate the reward based on the target colour and number
        reward = 0
        reward += 1 if state['target_colour'] == action[0] else -1
        reward += 3 if state['target_number'] == action[1] else -1
        
        # Reward for 3 consecutive correct actions
        # Append the current reward to the last three rewards
        self.last_three_rewards.append(reward)
        
        # Punishment for repeating the same action
        # Append the current action to the last three actions
        self.last_three_actions.append(action)
        
        # If more than three actions have been appended, remove the first action
        if len(self.last_three_actions) > 3:
            self.last_three_actions.pop(0)
            
        # Check if the last three actions are the same
        # If so, punish the agent
        if len(self.last_three_actions) == 3 and len(set(tuple(a) for a in self.last_three_actions)) == 1:
            # print("Punishment -500")
            reward -= 500
        
        # If more than three rewards have been appended, remove the first reward
        if len(self.last_three_rewards) > 3:
            self.last_three_rewards.pop(0)
            
        # Check if the last three rewards total = 12
        # If so, reward the agent
        if len(self.last_three_rewards) == 3 and sum(self.last_three_rewards) == 12:
            reward += 10
            self.last_three_actions = []
            # print("Reward +10")
        elif len(self.last_three_rewards) == 3 and sum(self.last_three_rewards) == -6:
            reward -= 20
            # print("Punishment -20")
        
        # Increment the trade count
        self.trade_count += 1
        
        # Get the next observation from the state dataframe
        observation = self.state.iloc[self.trade_count][['year', 'month', 'day', 'sn', 'project_id']].to_dict()
        
        # Set the terminated flag to False, unless the trade count has reached the length
        terminated = self.trade_count >= self.length
        
        # No truncation
        truncated = False
        
        # Return an empty info dictionary, for compatibility with the gym interface
        info = {}
            
        return observation, reward, terminated, truncated, info 
    
    def render(self):
        # This method is not implemented, as this environment does not have a visual representation
        pass
    
    def reset(self, states=get_states(), seed=None):
        """
        Reset the environment to an initial state and return the initial observation.
        
        Args:
            states (pandas.DataFrame): The state dataframe, containing the period, count_of_the_day, project_id, target_colour, and target_number for each trade.
            seed (int): The seed for the random number generator.
            
        Returns:
            observation (dict of int): The initial observation, consisting of a dictionary of three values: period, count_of_the_day, and project_id.
        """
        if seed is not None:
            random.seed(seed)
        
        # Reset the trade count to zero
        self.trade_count = 0
        
        # Reset the last three actions to an empty list
        self.last_three_actions = []
        
        # Reset the last three rewards to an empty list
        self.last_three_rewards = []
        
        # Reset the state dataframe with the given states
        self.state = states
        
        # Get the initial observation from the state dataframe
        observation = self.state.iloc[self.trade_count][['year', 'month', 'day', 'sn', 'project_id']].to_dict()
        
        # Reset the length to the number of trades per episode
        self.length = TRADES_PER_EPISODE
        
        # Create an empty info dictionary
        info = {}
        
        return observation, info
    
    def close(self):
        # This method is not implemented, as this environment does not have any resources to release
        pass


### Testing the Environment

In [97]:
# Testing the Environment

test_episodes = 3
for episode in range(test_episodes):
    env = SiteEnv(get_states())
    state = env.reset(get_states())
    terminated = False
    score = 0
    
    while not terminated:
        env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
        print(f"Reward: {reward}, Done: {terminated}")
        
    print('--------------------------')  
    print(f"Episode: {episode}, Score: {score}")
    print('--------------------------')
    env.close()

Reward: -2, Done: False
Reward: -2, Done: False
Reward: 4, Done: False
Reward: 0, Done: False
Reward: -2, Done: False
Reward: -2, Done: False
Punishment -20
Reward: -22, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: -2, Done: False
Reward: -2, Done: False
Reward: 4, Done: False
Reward: -2, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: -2, Done: False
Reward: -2, Done: False
Punishment -20
Reward: -22, Done: False
Reward: 4, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: -2, Done: False
Reward: -2, Done: False
Reward: 0, Done: False
Reward: 2, Done: False
Reward: -2, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 2, Done: False
Reward: -2, Done: False
Reward: 4, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: 0, Done: False
Reward: -2, Done: False
Reward: -2, Done: False
Punishment -20
Reward: -22, Don

### Train Model

In [98]:
# Training the Environment

log_path = '../logs/'
model = PPO('MultiInputPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [99]:
model.learn(total_timesteps=500000)

Logging to ../logs/PPO_6
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20
Punishment -20


<stable_baselines3.ppo.ppo.PPO at 0x1516be88990>

### Evaluation and Saving

In [100]:
# Saving the Model

save_path = '../models/'
model.save(save_path + 'PPO_Mlp_RL_v0.1.1-beta')

del model

In [101]:
model = PPO.load(save_path + 'PPO_Mlp_RL_v0.1.1-beta')

In [102]:
# Evaluating the Model

evaluate_policy(model, env, n_eval_episodes=10)



Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10
Reward +10

(1318.0, 0.0)

### Predicting

In [103]:
def make_observation(period, project_id):
    """
    Make an observation from the given period and project_id.
    
    Args:
        period (int): The period of the trade.
        project_id (int): The project_id of the trade.
        
    Returns:
        observation (dict of int): The observation, consisting of a dictionary of three values: period, count_of_the_day, and project_id.
    """
    return {
        "year": int(str(period)[:4])-1,
        "month": int(str(period)[4:6])-1,
        "day": int(str(period)[6:8])-1,
        "sn": int(str(period)[8:]),
        "project_id": project_id - 1
    }

In [111]:
# Making Predictions
model.predict(make_observation('20231205055', 3))

(array([0, 6], dtype=int64), None)