In [18]:
import gym
import tensorflow as tf
import numpy as np
from gym import spaces
import pandas as pd
from tf_agents.environments import py_environment
from tf_agents import specs
from tf_agents.specs import array_spec
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.utils import common
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies import random_tf_policy
from tf_agents.trajectories import time_step as ts

btc_data = pd.read_csv('C:/Users/user/Desktop/btc-usd-max.csv', parse_dates=['snapped_at'], index_col='snapped_at')
btc_data = btc_data.iloc[250:]

class BitcoinTradingEnv(py_environment.PyEnvironment):
    def __init__(self, df, lookback_window_size=50):
        super().__init__()
        self.df = df.dropna().reset_index(drop=True)
        self.lookback_window_size = lookback_window_size

        # Initialize the variables for trading
        self.initial_balance = 10000
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.amount_held = 0
        self.spent_gains = 0
        self.current_step = 0

        # Define max values for normalization
        self.max_price = np.max(df['price'])
        self.max_volume = np.max(df['total_volume'])
        self.max_amount = 100  # This is a chosen maximum value
        self.max_spent_gains = 10000  # This is a chosen maximum value

        self.action_spec_ = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')

            # Modify your observation spec to align with your implementation of next_observation
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(4, self.lookback_window_size + 1), dtype=np.float32, minimum=0, name='observation')

        self._state = self._next_observation()

    def action_spec(self):
        return self.action_spec_

    def observation_spec(self):
        return self._observation_spec

    def _next_observation(self):
        # Get the data for the next observation
        frame = np.array([
            self.df.loc[self.current_step: self.current_step +
                        self.lookback_window_size, 'price'].values / self.max_price,
            self.df.loc[self.current_step: self.current_step +
                        self.lookback_window_size, 'total_volume'].values / self.max_volume,
        ])

        # append some other factors to frame like amount_held and such
        obs = np.append(frame, [
        np.full((self.lookback_window_size+1,), self.amount_held / self.max_amount),
        np.full((self.lookback_window_size+1,), self.spent_gains / self.max_spent_gains),
    ], axis=0)


        return obs[np.newaxis, :]  # Add an extra dimension

# Modify your reset method to account for the changes in next_observation
    def _reset(self):
            # Reset the trading environment
        self.current_step = self.lookback_window_size
        self.amount_held = 0
        self.spent_gains = 0
        self.current_price = self.df.loc[self.current_step, 'price']
        self.net_worths = [self.initial_balance]

        return ts.restart(self._next_observation())

    def _step(self, action):
            # Execute one time step within the environment
        if self._episode_ended:
                # The last action ended the episode. Ignore the current action and start a new episode.
            return self.reset()

            # Make sure episodes don't go on forever.
            if action is not None:
                if tf.is_tensor(action):
                    action = action.numpy()
                # Set the current price to a random price within the time step
                current_price = random.uniform(
                    self.df.loc[self.current_step, 'low'],
                    self.df.loc[self.current_step, 'high'])

                action_type = action[0]
                amount = action[1] / 10

                if action_type < 1:
                    # Buy amount % of balance in shares
                    total_possible = self.balance / current_price
                    shares_bought = total_possible * amount
                    recent_cost = shares_bought * current_price * (1 + self.commission)
                    self.balance -= recent_cost
                    self.shares_held += shares_bought
                    self.net_worths.append(self.balance + self.shares_held * current_price)

                elif action_type < 2:
                    # Sell amount % of shares held
                    shares_sold = self.shares_held * amount
                    self.balance += shares_sold * current_price * (1 - self.commission)
                    self.shares_held -= shares_sold
                    self.net_worths.append(self.balance + self.shares_held * current_price)

                self.current_step += 1

                if self.current_step >= len(self.df.loc[:, 'open'].values) - 1:
                    # End the episode if we've run out of data
                    self._episode_ended = True

            if self._episode_ended:
                return ts.termination(self._next_observation(), self.balance)
            else:
                return ts.transition(self._next_observation(), self.balance, discount=1.0)
    # Now using the custom environment in the learning process

env = BitcoinTradingEnv(btc_data)
env = tf_py_environment.TFPyEnvironment(env)

fc_layer_params = (100,)

q_net = q_network.QNetwork(
    env.observation_spec(),
    env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=50000)

random_policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                                env.action_spec())

def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        collect_step(env, policy, buffer)

collect_data(env, random_policy, replay_buffer, steps=100)

# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=env.batch_size, 
    num_steps=2).prefetch(3)

iterator = iter(dataset)

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
# Evaluate the agent's policy once before training.
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

num_iterations = 20000 
collect_steps_per_iteration = 1
log_interval = 200
eval_interval = 1000

returns = []

for _ in range(num_iterations):
    # Collect a few steps using collect_policy and save to the replay buffer.
    for _ in range(collect_steps_per_iteration):
        collect_step(env, agent.collect_policy, replay_buffer)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(env, agent.policy, num_episodes=10)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

ValueError: Received a mix of batched and unbatched Tensors, or Tensors are not compatible with Specs.  num_outer_dims: 1.
Saw tensor_shapes:
   TimeStep(
{'discount': TensorShape([1]),
 'observation': TensorShape([1, 1, 4, 51]),
 'reward': TensorShape([1]),
 'step_type': TensorShape([1])})
And spec_shapes:
   TimeStep(
{'discount': TensorShape([]),
 'observation': TensorShape([4, 51]),
 'reward': TensorShape([]),
 'step_type': TensorShape([])})