# In [None]:
import tensorflow as tf
from collections import deque
import random
import numpy as np

class Agent:
    """Deep Q-learning agent that trades one unit at a time over a price series.

    Actions are encoded as ``0`` = hold, ``1`` = buy one unit, ``2`` = sell the
    oldest unit still held. The state fed to the network is the vector of
    consecutive price differences over the last ``window_size`` steps.

    Args:
        state_size: Number of network inputs (expected to equal ``window_size``).
        window_size: Number of price differences that make up one state.
        trend: Sequence of prices (list, NumPy array, ...), indexed by time step.
        skip: Step between consecutive time indices visited while trading.
        batch_size: Maximum number of experiences used per replay update.
        memory_size: Capacity of the replay buffer; oldest entries are dropped.
    """

    def __init__(self, state_size, window_size, trend, skip, batch_size,
                 memory_size=1000):
        self.state_size = state_size
        self.window_size = window_size
        self.half_window = window_size // 2
        self.trend = trend
        self.skip = skip
        self.action_size = 3
        self.batch_size = batch_size
        # Bounded FIFO buffer: holds only real experiences (no empty slots to
        # sample by accident) and evicts the oldest one in O(1).
        self.memory = deque(maxlen=memory_size)
        # Kept for backward compatibility; buy()/train() track the units they
        # hold in a per-run local inventory instead.
        self.inventory = np.empty((0,))

        self.gamma = 0.95
        self.epsilon = 0.5
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (state -> one Q-value per action)."""
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            # Linear output: Q-values must be able to go negative because the
            # reward (relative change in balance) can be negative.
            tf.keras.layers.Dense(self.action_size, activation='linear'),
        ])
        # NOTE(review): a learning rate of 0.1 is very high for Adam and may
        # make training diverge; kept unchanged to preserve existing behavior.
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))
        return model

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Returns:
            A plain Python ``int`` in ``range(self.action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            # Exploration does not need the network at all.
            return int(np.random.randint(self.action_size))
        batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
        q_values = self.model(batch, training=False)
        return int(tf.argmax(q_values[0]))

    def get_state(self, t):
        """Return the price differences over the window ending at step ``t``.

        When fewer than ``window_size + 1`` prices are available, the window is
        left-padded with the first price, so the leading differences are zero.

        Returns:
            ``np.ndarray`` of shape ``(1, window_size)``.
        """
        span = self.window_size + 1
        start = t - span + 1
        if start >= 0:
            block = list(self.trend[start:t + 1])
        else:
            # Convert to a list so the padding concatenates rather than
            # broadcasting element-wise when ``trend`` is a NumPy array.
            block = [self.trend[0]] * -start + list(self.trend[0:t + 1])
        return np.array([np.diff(block)])

    def replay(self, batch_size):
        """Fit the network on a random mini-batch drawn from the replay memory.

        Args:
            batch_size: Requested batch size; clamped to the number of stored
                experiences. Nothing happens if no experience is available.
        """
        batch_size = min(batch_size, len(self.memory))
        if batch_size <= 0:
            return

        picks = np.random.choice(len(self.memory), size=batch_size, replace=False)
        mini_batch = [self.memory[int(i)] for i in picks]

        states = np.vstack(
            [np.reshape(exp[0], (1, -1)) for exp in mini_batch]
        ).astype(np.float32)
        next_states = np.vstack(
            [np.reshape(exp[3], (1, -1)) for exp in mini_batch]
        ).astype(np.float32)

        # Two batched forward passes instead of two model calls per sample.
        targets = np.array(self.model.predict_on_batch(states))
        next_q = np.asarray(self.model.predict_on_batch(next_states))

        for row, (_, action, reward, _, done) in enumerate(mini_batch):
            if done:
                targets[row, action] = reward
            else:
                targets[row, action] = reward + self.gamma * np.amax(next_q[row])

        self.model.train_on_batch(states, targets)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def buy(self, initial_money):
        """Run the current policy once over ``self.trend`` and log every trade.

        Args:
            initial_money: Starting cash balance.

        Returns:
            Tuple ``(states_buy, states_sell, total_gains, invest)``: the time
            indices of buys and sells, the absolute change in balance, and the
            change as a percentage of ``initial_money``.
        """
        starting_money = initial_money
        states_sell = []
        states_buy = []
        inventory = deque()
        last_buy_step = len(self.trend) - self.half_window
        state = self.get_state(0)
        for t in range(0, len(self.trend) - 1, self.skip):
            action = self.act(state)
            next_state = self.get_state(t + 1)
            price = self.trend[t]

            if action == 1 and initial_money >= price and t < last_buy_step:
                inventory.append(price)
                initial_money -= price
                states_buy.append(t)
                print('day %d: buy 1 unit at price %f, total balance %f'% (t, price, initial_money))

            elif action == 2 and inventory:
                # Sell the oldest unit still held (FIFO).
                bought_price = inventory.popleft()
                initial_money += price
                states_sell.append(t)
                try:
                    invest = ((price - bought_price) / bought_price) * 100
                except ZeroDivisionError:
                    invest = 0
                print(
                    'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,'
                    % (t, price, invest, initial_money)
                )

            state = next_state
        invest = ((initial_money - starting_money) / starting_money) * 100
        total_gains = initial_money - starting_money
        return states_buy, states_sell, total_gains, invest

    def train(self, iterations, checkpoint, initial_money):
        """Train the Q-network by repeatedly trading through ``self.trend``.

        Args:
            iterations: Number of passes (epochs) over the price series.
            checkpoint: Print a progress line every ``checkpoint`` epochs.
            initial_money: Cash balance at the start of every epoch.
        """
        last_buy_step = len(self.trend) - self.half_window
        for i in range(iterations):
            total_profit = 0.0
            inventory = deque()
            state = self.get_state(0)
            # Plain float keeps rewards and done flags as Python scalars
            # rather than TensorFlow tensors.
            starting_money = float(initial_money)
            for t in range(0, len(self.trend) - 1, self.skip):
                action = self.act(state)
                next_state = self.get_state(t + 1)
                price = self.trend[t]

                if action == 1 and starting_money >= price and t < last_buy_step:
                    inventory.append(price)
                    starting_money -= price

                elif action == 2 and inventory:
                    # Sell the oldest unit still held (FIFO).
                    bought_price = inventory.popleft()
                    total_profit += price - bought_price
                    starting_money += price

                # Reward: relative change of the balance versus the start.
                invest = (starting_money - initial_money) / initial_money
                self.memory.append(
                    (state, action, invest, next_state,
                     bool(starting_money < initial_money))
                )
                state = next_state
                self.replay(min(self.batch_size, len(self.memory)))
            if (i+1) % checkpoint == 0:
                print('epoch: %d, total rewards: %.3f, total money: %f,epsilon%f'%(i + 1, total_profit, starting_money,self.epsilon))