<a href="https://colab.research.google.com/github/KordelFranceTech/GoldiloxFund/blob/main/SuperSimpleGoldiloxRLAgent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup: install the third-party packages imported below.
# NOTE(review): gym is pinned to 0.9.4, presumably because TradingEnv below
# implements the legacy underscore hooks (_seed/_reset/_step) while the
# training loop calls env.reset()/env.step() -- confirm before upgrading gym.
!pip install gym==0.9.4
!pip3 install yfinance
!pip3 install keras

from collections import deque
import random
import numpy as np
# from model import mlp
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import itertools
import datetime

import pickle
import time
import numpy as np
import argparse
import re

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import yfinance as yf

# Mount Google Drive so the relative paths used further down (rldata/,
# weights/, portfolio_val/) can live on persistent storage.
from google.colab import drive
drive.mount('/content/drive')
# Show the current directory and its contents before changing directory.
%pwd
%ls
# NOTE(review): this path is relative, so it presumably only resolves when the
# current directory is /content. The recorded output at the end of this
# notebook shows it failing with "[Errno 2] No such file or directory" --
# confirm the intended working directory.
%cd 'drive/My Drive/Colab Notebooks/data'
%pwd

# Selects the data source and weight-file name used by the training script:
# False -> daily prices for three tickers, True -> hourly prices for one ticker.
DAY_TRADING_ENABLED: bool = False

# Download window for yfinance: the last 55 calendar days, both bounds at
# midnight. The clock is read exactly once so the two bounds cannot disagree
# when the cell happens to run across midnight.
endDate = datetime.datetime.combine(datetime.date.today(), datetime.time.min)
startDate = endDate - datetime.timedelta(days=55)

#############################################################################
#############################################################################
# AGENT
#############################################################################
#############################################################################

class DQNAgent(object):
  """ A simple Deep Q agent.

  Picks actions epsilon-greedily from a Q-network built by `mlp`, stores
  transitions in a bounded replay buffer and fits the network on random
  minibatches drawn from that buffer.
  """
  def __init__(self, state_size, action_size):
    """ Create the agent.

    state_size:  number of features in one observation (network input width)
    action_size: number of discrete actions (network output width)
    """
    self.state_size = state_size
    self.action_size = action_size
    self.memory = deque(maxlen=2000)  # replay buffer; oldest transitions fall off first
    self.gamma = 0.95  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01  # floor for the exploration rate
    self.epsilon_decay = 0.995  # multiplicative decay applied after every replay
    self.model = mlp(state_size, action_size)


  def remember(self, state, action, reward, next_state, done):
    """ Append one transition to the replay buffer. """
    self.memory.append((state, action, reward, next_state, done))


  def act(self, state):
    """ Return an action index for `state` (a batch holding one observation).

    With probability epsilon a uniformly random action is returned, otherwise
    the action with the highest predicted Q-value.
    """
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # returns action


  def replay(self, batch_size=32):
    """ Fit the model on one random minibatch of remembered transitions.

    Vectorized over the minibatch. Does nothing when fewer than `batch_size`
    transitions have been remembered so far. Decays epsilon towards
    epsilon_min after a fit.
    """
    if len(self.memory) < batch_size:
      # random.sample would raise ValueError; there is nothing to learn from yet
      return
    minibatch = random.sample(self.memory, batch_size)

    # states are stored as (1, n_features) batches; [0] unwraps each one
    states = np.array([tup[0][0] for tup in minibatch])
    actions = np.array([tup[1] for tup in minibatch])
    rewards = np.array([tup[2] for tup in minibatch])
    next_states = np.array([tup[3][0] for tup in minibatch])
    # force a boolean mask: integer 0/1 flags would otherwise be treated as
    # positions by the fancy indexing below
    done = np.array([tup[4] for tup in minibatch], dtype=bool)

    # Q(s', a)
    target = rewards + self.gamma * np.amax(self.model.predict(next_states), axis=1)
    # end state target is reward itself (no lookahead)
    target[done] = rewards[done]

    # Q(s, a)
    target_f = self.model.predict(states)
    # only the taken action's value is replaced, the other outputs keep their
    # current prediction and therefore contribute no error
    target_f[range(batch_size), actions] = target

    self.model.fit(states, target_f, epochs=1, verbose=0)

    if self.epsilon > self.epsilon_min:
      # clamp so the last decay step cannot undershoot the floor
      self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


  def load(self, name):
    """ Load model weights from the file `name`. """
    self.model.load_weights(name)


  def save(self, name):
    """ Save model weights to the file `name`. """
    self.model.save_weights(name)

#############################################################################
#############################################################################
# ENVIRONMENT
#############################################################################
#############################################################################


class TradingEnv(gym.Env):
  """
  A multi-stock trading environment: one row of price history per stock.

  State: [# of stock owned, current stock prices, cash in hand]
    - list of length n_stock * 2 + 1
    - price is discretized (rounded to integer) to reduce state space
    - cash in hand is evaluated at each step based on action performed

  Action: an integer in [0, 3**n_stock) encoding sell (0), hold (1) or
  buy (2) for every stock
    - when selling, sell all the shares
    - when buying, buy as many as cash in hand allows
    - if buying multiple stock, buy one share of each in turn until one of
      them can no longer be afforded

  The gym hooks are implemented under their underscore names
  (_seed/_reset/_step).
  NOTE(review): presumably the public reset()/step() of the pinned gym
  release forward to these -- confirm before changing the gym version.
  """
  def __init__(self, train_data, init_invest=20000):
    """
    train_data:  2-D array of prices, shape (n_stock, n_step)
    init_invest: cash available at the start of every episode
    """
    # data
    self.stock_price_history = np.around(train_data) # round to integer to reduce state space
    self.n_stock, self.n_step = self.stock_price_history.shape

    # instance attributes
    self.init_invest = init_invest
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    # action space
    self.action_space = spaces.Discrete(3**self.n_stock)
    # every per-stock combination of sell(0)/hold(1)/buy(2), indexed by action
    # id; built once here instead of on every trade
    self._action_combo = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))

    # observation space: give estimates in order to sample and build scaler
    stock_max_price = self.stock_price_history.max(axis=1)
    stock_range = [[0, init_invest * 2 // mx] for mx in stock_max_price]
    price_range = [[0, mx] for mx in stock_max_price]
    cash_in_hand_range = [[0, init_invest * 2]]
    self.observation_space = spaces.MultiDiscrete(stock_range + price_range + cash_in_hand_range)

    # seed and start
    self._seed()
    self._reset()


  def _seed(self, seed=None):
    """ Seed the environment's random generator and return the seed in a list. """
    self.np_random, seed = seeding.np_random(seed)
    return [seed]


  def _reset(self):
    """ Go back to step 0 with no shares and the full initial cash; return the observation. """
    self.cur_step = 0
    self.stock_owned = [0] * self.n_stock
    self.stock_price = self.stock_price_history[:, self.cur_step]
    self.cash_in_hand = self.init_invest
    print('reset environment so:')
    print(f'\n\t[0] - stock owned = {self.stock_owned}')
    print(f'\n\t[1] - stock price = {list(self.stock_price)}')
    print(f'\n\t[2] - cash in hand: {self.cash_in_hand}')
    return self._get_obs()


  def _step(self, action):
    """ Advance one time step, trade at the new prices and return
    (observation, reward, done, info).

    The reward is the change in total portfolio value; `done` becomes True on
    the last column of the price history; info['cur_val'] is the new value.
    """
    assert self.action_space.contains(action)
    prev_val = self._get_val()
    self.cur_step += 1
    self.stock_price = self.stock_price_history[:, self.cur_step] # update price
    self._trade(action)
    cur_val = self._get_val()
    reward = cur_val - prev_val
    done = self.cur_step == self.n_step - 1
    info = {'cur_val': cur_val}
    return self._get_obs(), reward, done, info


  def _get_obs(self):
    """ Return the observation as a flat list: owned..., prices..., cash. """
    obs = []
    obs.extend(self.stock_owned)
    obs.extend(list(self.stock_price))
    obs.append(self.cash_in_hand)
    return obs


  def _get_val(self):
    """ Return the total portfolio value: shares at current prices plus cash. """
    # stock_owned is a plain list; convert explicitly for the element-wise product
    return np.sum(np.asarray(self.stock_owned) * self.stock_price) + self.cash_in_hand


  def _trade(self, action):
    """ Apply the per-stock sell/hold/buy decisions encoded by `action`. """
    action_vec = self._action_combo[action]

    # one pass each to get sell/buy index
    sell_index = [i for i, a in enumerate(action_vec) if a == 0]
    # a price that rounded down to 0 is not purchasable: leaving it in would
    # make the buy loop below spin forever when every stock to buy costs 0
    buy_index = [i for i, a in enumerate(action_vec)
                 if a == 2 and self.stock_price[i] > 0]

    # two passes: sell first, then buy; might be naive in real-world settings
    for i in sell_index:
      self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
      self.stock_owned[i] = 0

    can_buy = bool(buy_index)
    while can_buy:
      # the current pass is always finished, so cheaper stocks later in the
      # list still get their share after a more expensive one failed
      for i in buy_index:
        if self.cash_in_hand > self.stock_price[i]:
          self.stock_owned[i] += 1 # buy one share
          self.cash_in_hand -= self.stock_price[i]
        else:
          can_buy = False

#############################################################################
#############################################################################
# MODEL
#############################################################################
#############################################################################

# #kkf version 1  - BEST RETURNS
#run at 2000 epochs
#returns very consistent profits on legacy stock csvs after 1500 train episodes
#profits on modern day prices of same company stocks show mixed results
def mlp(n_obs, n_action, n_hidden_layer=1, n_neuron_per_layer=32,
        activation='relu', loss='mse'):
  """ Build and compile a multi-layer perceptron with one linear output per action.

  n_obs:              number of input features
  n_action:           number of outputs
  n_hidden_layer:     number of configurable hidden layers; three more hidden
                      layers of the same width are always added after them
  n_neuron_per_layer: width of every hidden layer
  activation:         activation used by every hidden layer
  loss:               loss the model is compiled with (optimizer is Adam with defaults)

  Prints the model summary and returns the compiled model.
  """
  model = Sequential()
  model.add(Dense(n_neuron_per_layer, input_dim=n_obs, activation=activation))
  for _ in range(n_hidden_layer):
    model.add(Dense(n_neuron_per_layer, activation=activation))
  # three fixed extra hidden layers on top of the configurable ones
  for _ in range(3):
    model.add(Dense(n_neuron_per_layer, activation=activation))
  model.add(Dense(n_action, activation='linear'))
  model.compile(loss=loss, optimizer=Adam())
  # summary() prints by itself and returns None; wrapping it in print() only
  # added a stray "None" line to the output
  model.summary()
  return model

#############################################################################
#############################################################################
# UTILS
#############################################################################
#############################################################################

def get_data(col='close'):
  """ Load column `col` of the MSFT, IBM and QCOM daily CSVs under rldata/.

  Returns a 3 x n_step array, one row per ticker in that order, with each
  file's rows reversed.
  """
  paths = ('rldata/daily_MSFT.csv', 'rldata/daily_IBM.csv', 'rldata/daily_QCOM.csv')
  rows = []
  for path in paths:
    # the with-block closes the handle that was previously left open
    with open(path) as fh:
      frame = pd.read_csv(fh, usecols=[col])
    # recent price are at top; reverse it
    rows.append(frame[col].values[::-1])
  return np.array(rows)

def get_data_test_legacy(col='close'):
  """ Load column `col` of the MSFT, IBM and QCOM *_test CSVs under rldata/.

  Returns a 3 x n_step array, one row per ticker in that order, with each
  file's rows reversed.
  """
  paths = ('rldata/daily_MSFT_test.csv',
           'rldata/daily_IBM_test.csv',
           'rldata/daily_QCOM_test.csv')
  rows = []
  for path in paths:
    # the with-block closes the handle that was previously left open
    with open(path) as fh:
      frame = pd.read_csv(fh, usecols=[col])
    # recent price are at top; reverse it
    rows.append(frame[col].values[::-1])
  return np.array(rows)


def get_data_test_industry(col='close'):
  """ Load column `col` of the TXN, NVDA and AMD *_test CSVs under rldata/.

  Returns a 3 x n_step array, one row per ticker in that order, with each
  file's rows reversed.
  """
  paths = ('rldata/daily_TXN_test.csv',
           'rldata/daily_NVDA_test.csv',
           'rldata/daily_AMD_test.csv')
  rows = []
  for path in paths:
    # the with-block closes the handle that was previously left open
    with open(path) as fh:
      frame = pd.read_csv(fh, usecols=[col])
    # recent price are at top; reverse it
    rows.append(frame[col].values[::-1])
  return np.array(rows)


def get_data_test_industry_single_long_term(col='Close'):
  """ Download AMD prices for startDate..endDate at yfinance's default interval.

  Prints the downloaded frame and returns a 1 x n_step array (a single row
  for AMD) holding column `col`, oldest price first.
  """
  df = yf.download('AMD', start=startDate, end=endDate)
  print(df)
  # Unlike the CSV exports, yfinance hands rows back in ascending date order,
  # so the series is already oldest-first; reversing it made the environment
  # step backwards in time.
  # NOTE(review): confirm the row order against the installed yfinance version.
  return np.array([df[col].values])


def get_data_test_industry_single_short_term(col='Close'):
  """ Download hourly AMD prices for startDate..endDate.

  Prints the downloaded frame and returns a 1 x n_step array (a single row
  for AMD) holding column `col`, oldest price first.
  """
  df = yf.download(tickers='AMD', start=startDate, end=endDate, interval='1h')
  print(df)
  # Unlike the CSV exports, yfinance hands rows back in ascending date order,
  # so the series is already oldest-first; reversing it made the environment
  # step backwards in time.
  # NOTE(review): confirm the row order against the installed yfinance version.
  return np.array([df[col].values])


def get_data_test_industry_branch(col='Close'):
  """ Download AMD, INTC and SMTC prices for startDate..endDate.

  Returns a 3 x n_step array, one row per ticker in that order, holding
  column `col`, oldest price first.
  """
  tickers = ('AMD', 'INTC', 'SMTC')
  # Unlike the CSV exports, yfinance hands rows back in ascending date order,
  # so each series is already oldest-first; reversing it made the environment
  # step backwards in time.
  # NOTE(review): confirm the row order against the installed yfinance version.
  return np.array([yf.download(ticker, start=startDate, end=endDate)[col].values
                   for ticker in tickers])

def get_scaler(env):
  """ Build a StandardScaler for env's observations.

  The scaler is fitted on just two synthetic observations: an all-zero one
  and one made of per-feature upper estimates (shares owned, prices, cash).
  """
  price_history = env.stock_price_history
  price_ceiling = price_history.max(axis=1)
  price_floor = price_history.min(axis=1)

  cash_ceiling = env.init_invest * 3 # 3 is a magic number...
  # most shares affordable per stock if all that cash went in at its lowest price
  shares_ceiling = cash_ceiling // price_floor

  n_features = env.n_stock * 2 + 1
  lower = [0] * n_features
  upper = [*shares_ceiling, *price_ceiling, cash_ceiling]

  fitted = StandardScaler()
  fitted.fit([lower, upper])
  return fitted


def maybe_make_dir(directory):
  """ Create `directory` (including missing parents) unless it already exists.

  Raises FileExistsError when the path exists but is not a directory.
  """
  # exist_ok avoids the check-then-create race of testing os.path.exists first
  os.makedirs(directory, exist_ok=True)


def print_state(state, next_state, action, reward, done, info):
    """ Pretty-print one environment transition for debugging.

    state:      the (scaled) state the action was chosen from
    next_state: raw observation laid out as [owned..., prices..., cash]
    action:     action index; decoded into one ACTION_CODES label per stock
    reward, done, info: remaining values of the step result

    With a single stock the output is the same as before; with several
    stocks the per-stock labels, share counts and prices are listed instead
    of indexing ACTION_CODES with the combined action id (which raised
    IndexError for ids above 2).
    """
    # an observation holds n_stock share counts, n_stock prices and one cash value
    n_stock = (len(next_state) - 1) // 2

    # the action id is a base-3 number with one digit per stock, most
    # significant digit first (same order as itertools.product([0, 1, 2], ...))
    digits = []
    remaining = int(action)
    for _ in range(n_stock):
        remaining, digit = divmod(remaining, 3)
        digits.append(digit)
    digits.reverse()
    labels = [ACTION_CODES[d] for d in digits]

    if n_stock == 1:
        action_text = labels[0]
        owned = next_state[0]
        price = next_state[1]
    else:
        action_text = ', '.join(labels)
        owned = list(next_state[:n_stock])
        price = list(next_state[n_stock:2 * n_stock])
    cash = next_state[2 * n_stock]

    print(f'\tcurrent state after transform: {state}')
    print(f'\taction: {action_text}')
    print(f'\tnext state:')
    print(f'\t\tshares owned: {owned}, share price: {price}, cash: {cash}')
    print(f'\treward: {str(reward)}')
    print(f'\tdone: {str(done)}')
    print(f'\tinfo: {str(info)}\n\n')


#############################################################################
#############################################################################
# TRAIN MODEL
#############################################################################
#############################################################################

# Number of passes over the price history made by the main loop.
EPISODES = 2_000
# Minibatch size handed to DQNAgent.replay; replay starts once the memory
# holds more transitions than this.
BATCH_SIZE = 8
# 'train' fits the agent and checkpoints weights; 'test' loads WEIGHTS and
# replays the held-out slice with verbose printing.
MODE = 'train'
# Weights file loaded in test mode; its 12-digit timestamp is reused to name
# the portfolio-value pickle.
WEIGHTS = 'weights/202105160751-dqn.h5'
# Starting cash of every episode.
INITIAL_INVEST = 20_000
# Labels for the per-stock decisions 0, 1 and 2.
ACTION_CODES = ['SELL', 'HOLD', 'BUY']

if __name__ == '__main__':
  # Train (MODE == 'train') or evaluate (MODE == 'test') the DQN agent on the
  # selected price data, then pickle the per-episode end-of-episode portfolio
  # values under portfolio_val/.

  maybe_make_dir('weights')
  maybe_make_dir('portfolio_val')

  timestamp = time.strftime('%Y%m%d%H%M')

  if not DAY_TRADING_ENABLED:
    # v1 long-term: daily prices, first 50 steps for training, rest for testing
    # alternative data sources:
    # data = np.around(get_data())
    # data = np.around(get_data_test_legacy())
    data = np.around(get_data_test_industry_branch())
    # data = np.around(get_data_test_industry_single_long_term())
    train_data = data[:, :50]
    test_data = data[:, 50:]
  else:
    # v2 short-term: hourly prices, first 200 steps for training, rest for testing
    data = np.around(get_data_test_industry_single_short_term())
    train_data = data[:, :200]
    test_data = data[:, 200:]
    print(f'train data: {train_data}')

  # the agent's network size and the scaler are always derived from the
  # training environment, also when the test environment is used afterwards
  env = TradingEnv(train_data, INITIAL_INVEST)
  state_size = env.observation_space.shape
  action_size = env.action_space.n
  agent = DQNAgent(state_size, action_size)
  scaler = get_scaler(env)

  portfolio_value = []

  if MODE == 'test':
    # remake the env with test data
    env = TradingEnv(test_data, INITIAL_INVEST)
    # load trained weights
    agent.load(WEIGHTS)
    # a fresh agent starts with epsilon = 1.0, i.e. purely random actions;
    # drop to the exploration floor so the loaded weights actually drive the
    # evaluation
    agent.epsilon = agent.epsilon_min
    # when test, the timestamp is same as time when weights was trained
    timestamp = re.findall(r'\d{12}', WEIGHTS)[0]

  for e in range(EPISODES):
    state = env.reset()
    if MODE != 'train':
      print(f'state before scaler transform: {state}')
    # scaler transform normalizes stock according to its observation space
    state = scaler.transform([state])
    if MODE != 'train':
      print(f'state after scaler transform: {state}')
    # the counter is unused; it is no longer called `time` because that
    # rebound the `time` module to an int
    for _ in range(env.n_step):
      action = agent.act(state)
      next_state, reward, done, info = env.step(action)
      if MODE != 'train':
        print_state(state, next_state, action, reward, done, info)
      next_state = scaler.transform([next_state])
      if MODE == 'train':
        agent.remember(state, action, reward, next_state, done)
      state = next_state
      if done:
        print("episode: {}/{}, episode end value: {}".format(
          e + 1, EPISODES, info['cur_val']))
        portfolio_value.append(info['cur_val']) # append episode end portfolio value
        break
      if MODE == 'train' and len(agent.memory) > BATCH_SIZE:
        agent.replay(BATCH_SIZE)
    if MODE == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
      if DAY_TRADING_ENABLED:
        agent.save('weights/{}-DAY-dqn.h5'.format(timestamp))
      else:
        agent.save('weights/{}-dqn.h5'.format(timestamp))

  # save portfolio value history to disk
  with open('portfolio_val/{}-{}.p'.format(timestamp, MODE), 'wb') as fp:
    pickle.dump(portfolio_value, fp)

  # One-step sanity check on a hand-written single-stock observation
  # (2 shares, price 200, cash 15000). It has three features, so it only fits
  # the scaler when the environment tracks exactly one stock; with more
  # stocks scaler.transform would reject it. It now runs after the pickle
  # file has been closed instead of inside the `with` block.
  if env.n_stock == 1:
    env.reset()
    state = scaler.transform([[2, 200, 15000]])
    action = agent.act(state)
    next_state, reward, done, info = env.step(action)
    if MODE != 'train':
      print_state(state, next_state, action, reward, done, info)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'daily_NVDA_short_test (1).csv'   [0m[01;34mrldata[0m/
'daily_NVDA_short_test (2).csv'   [01;34mrobin_stocks[0m/
'daily_NVDA_short_test (3).csv'   [01;34mschool[0m/
'daily_NVDA_short_test (4).csv'   sec_sentiment_512.h5
'daily_NVDA_short_test (5).csv'   sec_sentiment_512.model
 daily_NVDA_short_test.csv        sec_sentiment_LSTM.h5
 did.bin                          sec_sentiment_LSTM.model
 [01;34mFish[0m/                            sentiment.csv
 glove.6B.100d.txt                [01;34mSocialDistanceDetector[0m/
'IMDB Dataset.csv'                [01;34mstock_cnn_data[0m/
 [01;34mImportData[0m/                      [01;34mstock_spectogram_data[0m/
 [01;34mportfolio_val[0m/                   [01;34mweights[0m/
 requirements.txt
[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks/data'
/content/drive/My Drive/Colab Notebook