In [None]:
# For Data
import numpy as np
import pandas as pd

# For Environment
import gymnasium as gym
from gymnasium import spaces

# For RL
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement

# Helpers
import random

### Importing and Preprocessing data

In [None]:
# Read the raw data, drop exact duplicate rows and order the trades.
# A single multi-key sort is sufficient (a preceding sort on 'project_id' alone
# would simply be overridden). The index is rebuilt afterwards so that index
# labels and row positions agree for any later label/position based lookup.
data = (
    pd.read_csv('../dataset/raw_data.csv')
    .drop_duplicates()
    .sort_values(by=['project_id', 'period', 'sn'], ascending=False)
    .reset_index(drop=True)
)

# Backup
data.to_csv('../dataset/data.csv', index=False)

data.head()

In [None]:
# Split period into zero-based year, month and day parts
period_text = data['period'].astype(str)
data['year'] = period_text.str[:4].astype('int64') - 1
data['month'] = period_text.str[4:6].astype('int64') - 1
data['day'] = period_text.str[6:].astype('int64') - 1

# Re-encode year as consecutive category codes (0, 1, 2, ...)
data['year'] = data['year'].astype('category').cat.codes

# The raw period column is no longer needed once it has been split
data.drop('period', axis=1, inplace=True)

data.head()

In [None]:
# Show the distinct values of each feature/target column on a single line
inspected_columns = ['year', 'month', 'day', 'project_id', 'target_colour', 'target_number']
print(', '.join(str(data[column].unique()) for column in inspected_columns))

### Creating an Environment

In [None]:
# Number of trades the agent plays in one episode
TRADES_PER_EPISODE = 20
# Total number of environment steps used for training
NO_OF_STEPS = 2_000_000

In [None]:
def get_states(source: "pd.DataFrame | None" = None,
               n_trades: "int | None" = None,
               max_attempts: int = 1000) -> pd.DataFrame:
    """
    Return a random window of consecutive rows that all share one project_id.

    The window is located purely by row position, so it is correct even when
    the frame's index labels are not 0..n-1 (e.g. after sorting or dropping
    duplicates without resetting the index).

    Args:
        source (pandas.DataFrame | None): Frame to draw from; must have a
            'project_id' column. Defaults to the module-level `data`.
        n_trades (int | None): Trades per episode. The window holds
            n_trades + 1 rows, because the environment reads one extra row as
            the observation that follows the final trade. Defaults to
            TRADES_PER_EPISODE.
        max_attempts (int): How many random windows to try before giving up.

    Returns:
        states (pandas.DataFrame): n_trades + 1 consecutive rows with the same
            columns and dtypes as `source` and a fresh 0..n index.

    Raises:
        ValueError: If `source` has fewer than n_trades + 1 rows.
        RuntimeError: If no single-project window was found in `max_attempts` draws.
    """
    frame = data if source is None else source
    if n_trades is None:
        n_trades = TRADES_PER_EPISODE
    window = n_trades + 1

    last_start = len(frame) - window
    if last_start < 0:
        raise ValueError(
            f"Need at least {window} rows to build an episode, got {len(frame)}"
        )

    project_ids = frame['project_id'].to_numpy()

    # Rejection sampling in a bounded loop (instead of unbounded recursion):
    # draw a start position and keep it only if the whole window belongs to
    # the same project as its first row.
    for _ in range(max_attempts):
        start = int(np.random.randint(0, last_start + 1))
        stop = start + window
        if (project_ids[start:stop] == project_ids[start]).all():
            return frame.iloc[start:stop].reset_index(drop=True)

    raise RuntimeError(
        f"No window of {window} consecutive rows with a single project_id "
        f"was found in {max_attempts} attempts"
    )

In [None]:
# Site Environment
class SiteEnv(gym.Env):
    """
    A custom environment for trading sites based on colour and number.

    Attributes:
        trade_count (int): The current trade index in the state dataframe.
        action_space (gym.spaces.MultiDiscrete): Four discrete values: multiplier, bet_on, colour and number.
        state (pandas.DataFrame): The rows of the current episode, containing year, month, day, sn, project_id, target_colour and target_number for each trade.
        observation_space (gym.spaces.Box): Five integer features: year, month, day, sn ( serial number ) and project_id.
        length (int): The maximum number of trades per episode.
        last_three_actions (list): The (colour, number) pairs of the most recent actions (at most three).
        last_three_rewards (list): The [bet_on, base reward] pairs of the most recent steps (at most three).
    """

    # Columns of the state dataframe that make up an observation, in order
    OBSERVATION_COLUMNS = ['year', 'month', 'day', 'sn', 'project_id']

    # Reward shaping constants
    STREAK_LENGTH = 3          # number of recent steps inspected for streaks / repeats
    WIN_STREAK_TOTAL = 12      # summed base reward at or above which the bonus is paid
    LOSS_STREAK_TOTAL = -6     # summed base reward at or below which the penalty applies
    STREAK_REWARD = 50         # size of the streak bonus / penalty
    REPEAT_PENALTY = 500       # penalty for repeating the same (colour, number) three times

    def __init__(self, states: pd.DataFrame):
        """
        Initialize the environment with the given state dataframe.

        Args:
            states (pandas.DataFrame): The year, month, day, sn, project_id, target_colour and target_number for each trade.

        Raises:
            ValueError: If `states` has too few rows for one full episode.
        """
        super().__init__()

        self.trade_count = 0

        # To punish the agent for repeating the same action
        self.last_three_actions = []

        # To reward / punish the agent based on its most recent results
        self.last_three_rewards = []

        # Define the action space as a multi-discrete space of four values
        # multiplier = 0 - 4 ( Show how strongly the agent feels about the trade )
        # bet_on = 0 - 2 ( 0: colour, 1: number, 2: both )
        # colour = 0: red, 1: green
        # number = 0 - 9
        self.action_space = spaces.MultiDiscrete([5, 3, 2, 10])

        # Define the observation space as a Box of 5 integer values.
        # The bounds follow the zero-based encoding produced by the preprocessing:
        # year = category code starting at 0
        #   NOTE(review): the upper bound of 1 is kept from the original definition;
        #   widen it if the data spans more than two distinct years.
        # month = 0 - 11
        # day = 0 - 30
        # sn = 0 - 999
        # project_id = 0: Parity, 1: Sapre, 2: Bcone, 3: Emerd
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0, 0], dtype=np.int32),
            high=np.array([1, 11, 30, 999, 3], dtype=np.int32),
            dtype=np.int32,
        )

        # Define the length as the number of trades per episode
        self.length = TRADES_PER_EPISODE

        # Store the state dataframe as an attribute
        self.state = self._checked_states(states)

    def _checked_states(self, states: pd.DataFrame) -> pd.DataFrame:
        """Return `states` unchanged after checking it can serve a full episode."""
        # One extra row is needed: the observation returned after the last trade
        # is read from row index `length`.
        if len(states) <= self.length:
            raise ValueError(
                f"states must contain at least {self.length + 1} rows, got {len(states)}"
            )
        return states

    def _observation(self) -> np.ndarray:
        """Build the observation for the row at the current trade index."""
        row = self.state.iloc[self.trade_count]
        return row[self.OBSERVATION_COLUMNS].to_numpy(dtype=np.int32)

    @staticmethod
    def _bet_reward(multiplier: int, bet_on: int, colour: int, number: int, row) -> int:
        """Return the base reward of one bet, before any streak or repeat shaping."""
        stake = multiplier + 1
        colour_hit = colour == row['target_colour']
        number_hit = number == row['target_number']

        if bet_on == 0:
            # Bet on colour
            return stake if colour_hit else -stake
        if bet_on == 1:
            # Bet on number
            return 3 * stake if number_hit else -stake
        # Bet on both
        return 4 * stake if (colour_hit and number_hit) else -2 * stake

    def step(self, action: np.ndarray) -> tuple:
        """
        Run one timestep of the environment's dynamics.

        Args:
            action (numpy.ndarray): A sample of the action space, holding 4 discrete values: multiplier, bet_on, colour and number.

        Returns:
            observation (numpy.ndarray): The next observation, an int32 array of 5 values: year, month, day, sn ( serial number ) and project_id.
            reward (int): The amount of reward returned as a result of taking the action.
            terminated (bool): Whether the episode has ended because the trade count has reached the length.
            truncated (bool): Whether the episode was truncated. ( For now no truncation )
            info (dict): An empty dictionary, for compatibility with the gym interface.

        Raises:
            ValueError: If the action is not contained in the action space.
        """
        # Check if the action is valid
        if not self.action_space.contains(action):
            # Raise an exception if the action is invalid
            raise ValueError(f"Invalid action: {action}")

        multiplier, bet_on, colour, number = (int(value) for value in action)

        # Get the current trade from the state dataframe
        row = self.state.iloc[self.trade_count]
        reward = self._bet_reward(multiplier, bet_on, colour, number, row)

        # Streak shaping is based on the sum of the last three *base* rewards,
        # so the value is recorded before any bonus / penalty is added.
        self.last_three_rewards.append([bet_on, reward])
        del self.last_three_rewards[:-self.STREAK_LENGTH]

        if len(self.last_three_rewards) == self.STREAK_LENGTH:
            recent_total = sum(base for _, base in self.last_three_rewards)
            if recent_total >= self.WIN_STREAK_TOTAL:
                reward += self.STREAK_REWARD
            elif recent_total <= self.LOSS_STREAK_TOTAL:
                reward -= self.STREAK_REWARD

        # Punishment for repeating the same colour / number choice.
        # Plain tuples are stored so the history never aliases the caller's action array.
        self.last_three_actions.append((colour, number))
        del self.last_three_actions[:-self.STREAK_LENGTH]

        if (len(self.last_three_actions) == self.STREAK_LENGTH
                and len(set(self.last_three_actions)) == 1):
            reward -= self.REPEAT_PENALTY

        # Increment the trade count
        self.trade_count += 1

        # Get the next observation from the state dataframe
        observation = self._observation()

        # The episode terminates once the trade count has reached the length
        terminated = self.trade_count >= self.length

        # No truncation
        truncated = False

        # Return an empty info dictionary, for compatibility with the gym interface
        info = {}

        return observation, reward, terminated, truncated, info

    def render(self):
        # This method is not implemented, as this environment does not have a visual representation
        pass

    def reset(self, states=None, seed=None, options=None):
        """
        Reset the environment to an initial state and return the initial observation.

        Args:
            states (pandas.DataFrame | None): The rows to use for the new episode.
                When omitted, a fresh window is drawn with get_states() on every
                call (a default evaluated in the signature would be computed only
                once, making every episode identical).
            seed (int | None): The seed for the random number generators.
            options (dict | None): Accepted for compatibility with the gymnasium reset signature; unused.

        Returns:
            observation (numpy.ndarray): The initial observation, an int32 array of 5 values: year, month, day, sn ( serial number ) and project_id.
            info (dict): An empty dictionary.

        Raises:
            ValueError: If `states` has too few rows for one full episode.
        """
        # Seed gymnasium's own generator (self.np_random)
        super().reset(seed=seed)

        if seed is not None:
            random.seed(seed)
            # pandas' DataFrame.sample and np.random.* draw from NumPy's global
            # state, so seed it too; the modulo keeps the value in the accepted range.
            np.random.seed(seed % (2 ** 32))

        # Reset the trade count to zero
        self.trade_count = 0

        # Reset the action / reward histories
        self.last_three_actions = []
        self.last_three_rewards = []

        # Reset the length to the number of trades per episode
        self.length = TRADES_PER_EPISODE

        # Draw a new episode at call time unless the caller supplied one
        if states is None:
            states = get_states()
        self.state = self._checked_states(states)

        # Get the initial observation from the state dataframe
        observation = self._observation()

        # Create an empty info dictionary
        info = {}

        return observation, info

    def close(self):
        # This method is not implemented, as this environment does not have any resources to release
        pass


### Testing the Environment

In [None]:
# Testing the Environment with random actions

test_episodes = 3

# One environment is enough: every episode gets a fresh window through reset()
env = SiteEnv(get_states())

for episode in range(test_episodes):
    # reset() returns an (observation, info) pair
    observation, info = env.reset(get_states())
    terminated = truncated = False
    score = 0

    # Stop on either end-of-episode signal
    while not (terminated or truncated):
        env.render()
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
        print(f"Reward: {reward}, Done: {terminated}")

    print('--------------------------')
    print(f"Episode: {episode}, Score: {score}")
    print('--------------------------')

env.close()

### Train Model

In [None]:
# Training the Environment

# Build a dedicated training environment rather than relying on whatever `env`
# happens to be left over (and already closed) from the random-action test cell.
env = SiteEnv(get_states())

log_path = '../logs/'
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# Setting up the Callbacks

# Evaluate on a separate environment instance: sharing the training env object
# would let evaluation resets and steps disturb the episode the trainer is in
# the middle of collecting.
eval_env = SiteEnv(get_states())

stop_training_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=10, min_evals=30, verbose=1)
eval_callback = EvalCallback(eval_env, eval_freq=10000, callback_after_eval=stop_training_callback, best_model_save_path='../models/', verbose=1)

In [None]:
# Train for the configured step budget; the evaluation callback may stop earlier
model.learn(
    total_timesteps=NO_OF_STEPS,
    callback=eval_callback,
)

In [None]:
# Loading the logs
# !tensorboard --logdir ../logs/

### Evaluation and Saving

In [None]:
# Saving the Model

save_path = '../models/'
model_file = f"{save_path}PPO_Mlp_RL_v0.1.2-beta"
model.save(model_file)

# Drop the in-memory model so that later cells must work from the saved copy
del model

In [None]:
# Restore the trained model from disk
save_path = '../models/'
model = PPO.load(f"{save_path}PPO_Mlp_RL_v0.1.2-beta")

In [None]:
# Evaluating the Model

# Displays the (mean reward, reward standard deviation) over the evaluation episodes
evaluate_policy(
    model=model,
    env=env,
    n_eval_episodes=10,
)

### Predicting

In [None]:
def make_observation(period: "str | int", project_id: int, known_years=None) -> list:
    """
    Make an observation from the given period and project_id.

    Args:
        period (str | int): The period of the trade, laid out as YYYYMMDD
            followed by the serial number digits (e.g. '20231205164').
        project_id (int): The project_id of the trade.
        known_years (iterable of int | None): The calendar years present in the
            training data. When given, the year is encoded as its rank among
            the sorted distinct years, matching the category codes used during
            preprocessing. When omitted the year code is 0, which is what the
            previous implementation always produced (the category code of a
            one-element Series is always 0).

    Returns:
        observation (list): Five values: year code, zero-based month,
            zero-based day, sn ( serial number ) and project_id.

    Raises:
        ValueError: If the period is too short or not numeric, or if the year
            is not among `known_years`.
    """
    text = str(period)
    if len(text) < 9:
        raise ValueError(
            f"period must be YYYYMMDD followed by a serial number, got {period!r}"
        )

    # Get the year, month, day, and sn from the period
    calendar_year = int(text[:4])
    month = int(text[4:6]) - 1
    day = int(text[6:8]) - 1
    sn = int(text[8:])

    # Convert the year into its category code
    if known_years is None:
        year = 0
    else:
        ordered_years = sorted({int(known) for known in known_years})
        if calendar_year not in ordered_years:
            raise ValueError(
                f"year {calendar_year} is not one of the known years {ordered_years}"
            )
        year = ordered_years.index(calendar_year)

    # Return the observation
    return [year, month, day, sn, project_id]

In [None]:
# Making Predictions
# NOTE(review): project_id 4 lies outside the 0 - 3 range documented for the
# environment's observation space -- confirm this is the intended project.
sample_observation = make_observation('20231205164', 4)
model.predict(sample_observation)