In [1]:
# For Data
import numpy as np
import pandas as pd

# For Environment
import gymnasium as gym
from gymnasium import spaces

# For RL
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

# Helpers
import random




### Importing and Preprocessing data

In [8]:
# Read Data
data = pd.read_csv('../dataset/raw_data.csv')

# Sorting
data.sort_values(by='project_id', ascending=False, inplace=True)
data.sort_values(by=['project_id', 'period', 'sn'], inplace=True, ascending=False)

# Removing Duplicates according to project_id, period, and sn
data.drop_duplicates(subset=['project_id', 'period', 'sn'], inplace=True)

# Backup
data.to_csv('../dataset/data.csv', index=False)

data.head()

Unnamed: 0,period,sn,price,project_id,begintime,target_number,target_colour
125033,20231213,191,59814,3,1702440000,3,1
125034,20231213,190,59818,3,1702439820,8,0
125035,20231213,189,59823,3,1702439640,3,1
125036,20231213,188,59846,3,1702439460,6,0
125037,20231213,187,59858,3,1702439280,8,0


In [9]:
# Spilt period into year, month, day
data['year'] = data['period'].apply(lambda x: int(str(x)[:4])-1)
data['month'] = data['period'].apply(lambda x: int(str(x)[4:6])-1)
data['day'] = data['period'].apply(lambda x: int(str(x)[6:])-1)

# Replace year with unique values from 0 to INF
data['year'] = data['year'].astype('category').cat.codes

# Dropping period
data.drop('period', axis=1, inplace=True)

data.head()

Unnamed: 0,sn,price,project_id,begintime,target_number,target_colour,year,month,day
125033,191,59814,3,1702440000,3,1,0,11,12
125034,190,59818,3,1702439820,8,0,0,11,12
125035,189,59823,3,1702439640,3,1,0,11,12
125036,188,59846,3,1702439460,6,0,0,11,12
125037,187,59858,3,1702439280,8,0,0,11,12


In [10]:
print(f"{data['year'].unique()}, {data['month'].unique()}, {data['day'].unique()}, {data['project_id'].unique()}, {data['target_colour'].unique()}, {data['target_number'].unique()}")

[0], [11 10  9], [12 11 10  9  8  7  6  5  4  3  2  1  0 29 28 27 26 25 24 23 22 21 18 17
 16 15 14 13 30 20 19], [3 2 1 0], [1 0], [3 8 6 1 4 9 7 2 5 0]


### Feature Engineering

In [11]:
# Creating a new column showing previous n trade options
# Column: list( 4: list( 25: list(project_id, price, target_number, target_colour) ) )
# Define a function to create the previous trades list

NO_OF_PREVIOUS_TRADES = 25
data['previous_trade'] = ''

def get_previous_trades(indices: list[int]) -> list[int]:
    """
    Return a list of list containing previous trades
    project_id, price, target_number, target_colour

    Args:
        indices (list[int]): list of indices

    Returns:
        list[int]: list of list containing previous trades
    """
    indices_previous_trades = []
    
    for index in indices:
        if index in data.index:
            prev_indices = [i for i in range(max(0, index - NO_OF_PREVIOUS_TRADES), index) if i in data.index]
            prev_trades = data.loc[prev_indices, ['project_id', 'price', 'target_number', 'target_colour']].values.tolist()
            indices_previous_trades.append(prev_trades)
        
    if len(indices_previous_trades) == 4:
        for lis in indices_previous_trades:
            if len(lis) != 25:
                print('Error')
                return []
    else:
        print('Error')
        return []
            
    return indices_previous_trades

groups = data.groupby(['year', 'month', 'day', 'sn']).groups
for key in groups:
    indices = groups[key].tolist()
    # print(indices)
    
    for index in indices:
        if data.at[index, 'previous_trade'] == '':
            previous_trades = get_previous_trades(indices)
            break
    
    for index, previous_trade in zip(indices, previous_trades):
        data.at[index, 'previous_trade'] = previous_trade
        # print(data.at[index, 'previous_trade'])

Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Erro

In [12]:
# Adding Moving Average for target_number and target_colour
data["target_number_ma"] = (
    data["target_number"].rolling(window=NO_OF_PREVIOUS_TRADES).mean()
)
data["target_colour_ma"] = (
    data["target_colour"].rolling(window=NO_OF_PREVIOUS_TRADES).mean()
)

In [13]:
# Remove rows with empty previous_trade
print(f"Before: {data.shape}")
data = data[data['previous_trade'] != '']
print(f"After: {data.shape}")

Before: (112324, 12)
After: (109528, 12)


In [14]:
data.head()

Unnamed: 0,sn,price,project_id,begintime,target_number,target_colour,year,month,day,previous_trade,target_number_ma,target_colour_ma
125058,166,59936,3,1702435500,6,0,0,11,12,"[[3, 59814, 3, 1], [3, 59818, 8, 0], [3, 59823...",5.0,0.52
125059,165,59916,3,1702435320,6,0,0,11,12,"[[3, 59818, 8, 0], [3, 59823, 3, 1], [3, 59846...",4.92,0.52
125060,164,59938,3,1702435140,8,0,0,11,12,"[[3, 59823, 3, 1], [3, 59846, 6, 0], [3, 59858...",5.12,0.48
125061,163,59916,3,1702434960,6,0,0,11,12,"[[3, 59846, 6, 0], [3, 59858, 8, 0], [3, 59866...",5.12,0.48
125062,162,59948,3,1702434780,8,0,0,11,12,"[[3, 59858, 8, 0], [3, 59866, 6, 0], [3, 59841...",5.12,0.48


### Creating an Environment

In [15]:
TRADES_PER_EPISODE = 20
NO_OF_STEPS = int(2e6)

In [16]:
def get_states() -> pd.DataFrame:
    """
    Returns n continues states for the environment.
        
    Returns:
        states (list): List of n continues states
    """
    states = []
    
    # Get a random sample
    first_val = data.sample()
    
    # find the index of the first_val in the data
    index = first_val.index[0]
    
    # Convert it into a list and Append the first_val to states
    states.append(first_val.values.tolist()[0])
    
    for state in range(TRADES_PER_EPISODE+1):
        # Append the next_val to states
        index += 1
        
        # Check whether the project_id is the same as the first_val and the index is less than the length of the data
        if index < len(data) and data.iloc[index]['project_id'] == first_val['project_id'].values[0]:
            states.append(data.iloc[index].values.tolist())
    
    states.pop(0)
    # Check whether the length of the states is equal to the TRADES_PER_EPISODE
    if len(states) != TRADES_PER_EPISODE+1:
        # If not, recursively call the function
        return get_states()
        
    return pd.DataFrame(states, columns=data.columns)

In [17]:
# Site Environment
class SiteEnv(gym.Env):
    """
    A custom environment for trading sites based on colour and number.

    Attributes:
        trade_count (int): The current trade index in the state dataframe.
        action_space (gym.spaces.MultiDiscrete): The action space, consisting of 4 discrete values: multiplier, bet_on, colour and number.
        state (pandas.DataFrame): The state dataframe, containing the year, month, day, sn, project_id, previous_trades, target_colour, and target_number for each trade.
        observation_space (gym.spaces.Dict): The observation space, consisting of a Box of 8 discrete values: year, month, day, sn ( serial number ), project_id, target_number_ma, target_colour_ma and previous_trades.
        length (int): The maximum number of trades per episode.
    """

    def __init__(self, states: pd.DataFrame):
        """
        Initialize the environment with the given state dataframe.

        Args:
            states (pandas.DataFrame): The state year, month, day, sn, project_id, target_colour, and target_number for each trade.
        """
        self.trade_count = 0

        # To punish the agent for repeating the same action
        self.last_three_actions = []

        # To reward the agent for 3 consecutive correct actions
        self.last_three_rewards = []

        # Define the action space as a MultiDiscrete space of four values: multiplier, bet_on, colour and number.
        # multiplier = 0 - 4 ( Show how strongly the agent feels about the trade )
        # bet_on = 0 - 2 ( 0: colour, 1: number, 2: both )
        # colour = 0: red, 1: green
        # number = if colour is 0: 0, 2, 4, 6, 8 | if colour is 1: 1, 3, 5, 7, 9
        self.action_space = spaces.MultiDiscrete([4, 3, 2, 10])

        # Store the state dataframe as an attribute
        self.state = states

        # Define the observation space as a Dict of 8 discrete values: year, month, day, sn ( serial number ), project_id, target_number_ma, target_colour_ma and previous_trades.
        # year = 1
        # month = 1 - 12
        # day = 1 - 31
        # sn = 0 - 999
        # project_id = 0: Parity, 1: Sapre, 2: Bcone, 3: Emerd
        # target_number_ma = 0 - 9 ( float )
        # target_colour_ma = 0 - 1 ( float )
        # previous_trades = list( 25: list(project_id, price, target_number, target_colour) )
        self.observation_space = spaces.Dict(
            {
                "year": spaces.Discrete(1),
                "month": spaces.Discrete(12),
                "day": spaces.Discrete(31),
                "sn": spaces.Discrete(1000),
                "project_id": spaces.Discrete(4),
                "target_number_ma": spaces.Box(low=0, high=9, shape=(1,), dtype=np.float32),
                "target_colour_ma": spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32),
                "previous_trades": spaces.Box(
                    low=0,
                    high=np.full((25, 4), [4, 65000, 9, 1]),
                    shape=(NO_OF_PREVIOUS_TRADES, 4),
                    dtype=np.int32,
                ),
            }
        )

        # Define the length as the number of trades per episode
        self.length = TRADES_PER_EPISODE

    def step(self, action: tuple[int, int]) -> tuple:
        """
        Run one timestep of the environment's dynamics.

        Args:
            action (gym.spaces.MultiDiscrete): The action space, consisting of 4 discrete values: multiplier, bet_on, colour and number.

        Returns:
            observation (gym.spaces.Dict): The next observation, consisting of a dictionary of 8 values: year, month, day, sn ( serial number ), project_id, target_number_ma, target_colour_ma and previous_trades.
            reward (int): The amount of reward returned as a result of taking the action.
            terminated (bool): Whether the episode has ended, either because the trade count has reached the length, or the action is invalid.
            truncated (bool): Whether the episode was truncated. ( For now no truncation )
            info (dict): An empty dictionary, for compatibility with the gym interface.
        """
        
        # Initialise the action
        reward = 0

        # Get the current state from the state dataframe
        state = self.state.iloc[self.trade_count]

        reward = 0
        # Check for Colour, Number rule
        # If the colour is red, the number must be even and vice versa
        if (state["target_colour"] == 0 and state["target_number"] % 2 != 0) or (
            state["target_colour"] == 1 and state["target_number"] % 2 == 0
        ):
            # Punish the agent for betting with multiplier
            reward -= 10 * (action[0] + 1) * (action[1] + 1)
            # print(f"Punishment {reward}")

        # Check on_bet
        if action[1] == 0:
            # Bet on colour
            if action[2] == state["target_colour"]:
                reward += 1 * (action[0] + 1)
            else:
                reward -= 1 * (action[0] + 1)
        elif action[1] == 1:
            # Bet on number
            if action[3] == state["target_number"]:
                reward += 3 * (action[0] + 1)
            else:
                reward -= 1 * (action[0] + 1)
        else:
            # Bet on both
            if (
                action[2] == state["target_colour"]
                and action[3] == state["target_number"]
            ):
                reward += 4 * (action[0] + 1)
            else:
                reward -= 2 * (action[0] + 1)

        # Reward for 3 consecutive correct actions on bet on both
        # Append the current reward to the last three rewards
        self.last_three_rewards.append([action[1], reward])

        # If more than three rewards have been appended, remove the first reward
        if len(self.last_three_rewards) > 3:
            self.last_three_rewards.pop(0)

        # Check if the last three rewards total >= 12
        # If so, reward the agent
        if (
            len(self.last_three_rewards) == 3
            and (
                self.last_three_rewards[0][1]
                + self.last_three_rewards[1][1]
                + self.last_three_rewards[2][1]
            )
            >= 12
        ):
            reward += 5
            # print("Reward +10")
        elif (
            len(self.last_three_rewards) == 3
            and (
                self.last_three_rewards[0][1]
                + self.last_three_rewards[1][1]
                + self.last_three_rewards[2][1]
            )
            <= -6
        ):
            reward -= 5
            # print("Punishment -20")

        # Punishment for repeating the same action
        # Append the current action's colour and number to the last_three_actions
        self.last_three_actions.append(action[2:])

        # If more than three actions have been appended, remove the first action
        if len(self.last_three_actions) > 3:
            self.last_three_actions.pop(0)

        # Check if the last three actions are the same
        # If so, punish the agent
        if (
            len(self.last_three_actions) == 3
            and len(set(tuple(a) for a in self.last_three_actions)) == 1
        ):
            # print("Punishment -500")
            reward -= 50

        # Increment the trade count
        self.trade_count += 1

        # Get the next observation from the state dataframe
        next_state = self.state.iloc[self.trade_count]
        observation = {
            "year": next_state["year"],
            "month": next_state["month"],
            "day": next_state["day"],
            "sn": next_state["sn"],
            "project_id": next_state["project_id"],
            "target_number_ma": next_state["target_number_ma"],
            "target_colour_ma": next_state["target_colour_ma"],
            "previous_trades": next_state["previous_trade"],
        }

        # Set the terminated flag to False, unless the trade count has reached the length or the reward is less than 1
        terminated = False if self.trade_count >= self.length or reward < 1 else True

        # No truncation
        truncated = False

        # Return an empty info dictionary, for compatibility with the gym interface
        info = {}

        return observation, reward, terminated, truncated, info

    def render(self):
        # This method is not implemented, as this environment does not have a visual representation
        pass

    def reset(self, states=get_states(), seed=None):
        """
        Reset the environment to an initial state and return the initial observation.

        Args:
            states (pandas.DataFrame): The state dataframe, containing the period, count_of_the_day, project_id, target_colour, and target_number for each trade.
            seed (int): The seed for the random number generator.

        Returns:
            observation (gym.spaces.Box): The initial observation, consisting of a dictionary of 5 values: year, month, day, sn ( serial number ) and project_id.
        """
        if seed is not None:
            random.seed(seed)

        # Reset the trade count to zero
        self.trade_count = 0

        # Reset the last three actions to an empty list
        self.last_three_actions = []

        # Reset the last three rewards to an empty list
        self.last_three_rewards = []

        # Reset the state dataframe with the given states
        self.state = states

        # Get the initial observation from the state dataframe
        state = self.state.iloc[self.trade_count]
        observation = {
            "year": state["year"],
            "month": state["month"],
            "day": state["day"],
            "sn": state["sn"],
            "project_id": state["project_id"],
            "target_number_ma": state["target_number_ma"],
            "target_colour_ma": state["target_colour_ma"],
            "previous_trades": state["previous_trade"],
        }

        # Reset the length to the number of trades per episode
        self.length = TRADES_PER_EPISODE

        # Create an empty info dictionary
        info = {}

        return observation, info

    def close(self):
        # This method is not implemented, as this environment does not have any resources to release
        pass

### Testing the Environment

In [18]:
# Testing the Environment

test_episodes = 3
for episode in range(test_episodes):
    env = SiteEnv(get_states())
    state = env.reset(get_states())
    terminated = False
    score = 0
    
    while not terminated:
        env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
        print(f"Reward: {reward}, Done: {terminated}")
        
    print('--------------------------')  
    print(f"Episode: {episode}, Score: {score}")
    print('--------------------------')
    env.close()

Reward: -1, Done: False
Reward: -1, Done: False
Reward: -3, Done: False
Reward: 3, Done: False
Reward: -58, Done: False
Reward: 1, Done: False
Reward: -51, Done: False
Reward: -4, Done: False
Reward: 3, Done: False
Reward: -1, Done: False
Reward: -4, Done: False
Reward: 4, Done: False
Reward: -56, Done: False
Reward: -2, Done: False
Reward: 6, Done: False
Reward: -2, Done: False
Reward: 59, Done: False
Reward: -2, Done: False
Reward: -1, Done: False
Reward: 2, Done: True
--------------------------
Episode: 0, Score: -108
--------------------------
Reward: 1, Done: False
Reward: -2, Done: False
Reward: 3, Done: False
Reward: -4, Done: False
Reward: -4, Done: False
Reward: -58, Done: False
Reward: -54, Done: False
Reward: -53, Done: False
Reward: -53, Done: False
Reward: -58, Done: False
Reward: 12, Done: False
Reward: 1, Done: False
Reward: -8, Done: False
Reward: -552, Done: False
Reward: -58, Done: False
Reward: 6, Done: False
Reward: -58, Done: False
Reward: -1, Done: False
Reward: -

### Train Model

In [19]:
# Training the Environment

log_path = '../logs/'
model = RecurrentPPO('MultiInputLstmPolicy', env, verbose=0, ent_coef=0.1, tensorboard_log=log_path)

In [20]:
# Setting up the Callbacks
stop_training_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=10, min_evals=30, verbose=1)
eval_callback = EvalCallback(env, eval_freq=10000, callback_after_eval=stop_training_callback, best_model_save_path='../models/', verbose=1)

In [21]:
model.learn(total_timesteps=NO_OF_STEPS, callback=eval_callback)
# model.learn(total_timesteps=NO_OF_STEPS)



Eval num_timesteps=10000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=70000, episode_reward=-9004.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=90000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=100000, episode_reward=-8996.00 +/- 0.00
Episode length: 20.00 +/- 0.00
Eval num_timesteps=110000, episode_reward=-8996.00 +/- 0.00
Episode length: 2

KeyboardInterrupt: 

In [None]:
# Loading the logs
# !tensorboard --logdir ../logs/

### Evaluation and Saving

In [None]:
# Saving the Model

save_path = '../models/'
model.save(save_path + 'PPO_MIP_RL_v0.1.3-beta')

del model

In [None]:
save_path = '../models/'
model = PPO.load(save_path + 'PPO_best_model.zip')

In [None]:
# Evaluating the Model

evaluate_policy(model, env, n_eval_episodes=10)

### Predicting

In [None]:
def make_observation(period: str, project_id: int) -> list:
    """
    Make an observation from the given period and project_id.
    
    Args:
        period (int): The period of the trade.
        project_id (int): The project_id of the trade.
        
    Returns:
        observation (gym.spaces.Box): The observation, consisting of a dictionary of 5 values: year, month, day, sn ( serial number ) and project_id.
    """
    # Get the year, month, day, and sn from the period
    year = int(str(period)[:4])-1
    month = int(str(period)[4:6])-1
    day = int(str(period)[6:8])-1
    sn = int(str(period)[8:])
    
    # Convert the year into a category
    year = pd.Series(year).astype('category').cat.codes[0]
    
    # Return the observation
    return [year, month, day, sn, project_id]

In [None]:
# Making Predictions
model.predict(make_observation('20231207274', 2))