In [2]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
import random
import numpy as np
import copy

In [18]:
class DQN_network(nn.Module):
    def __init__(self,lr,input_dims,output_dims):
        super(DQN_network,self).__init__()
        self.fc_1=nn.Linear(*input_dims,128)
        self.fc_2=nn.Linear(128,256)
        self.fc_3=nn.Linear(256,256)
        self.fc_4=nn.Linear(256,128)
        self.fc_5=nn.Linear(128,output_dims)
        
        self.lr=lr
        self.loss=nn.HuberLoss()
        self.optimizer=optim.Adam(self.parameters(),lr=lr)
        
        self.device=T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
        
    def forward(self,state):
        x=F.relu(self.fc_1(state))
        x=F.relu(self.fc_2(x))
        x=F.relu(self.fc_3(x))
        x=F.relu(self.fc_4(x))
        actions=self.fc_5(x)
        
        return actions
    
    
        
        

In [76]:
np.argmax(b)

tensor(3)

In [124]:
class Agent:
    def __init__(self,state_size,strategy="t-dqn",reset_every=1000,pretrained=False,model_name=None,epsilon_decay=0.995):
        self.strategy=strategy
        
        self.state_size=state_size
        self.action_size=3
        self.model_name=model_name
        self.inventory=[]
        self.memory=deque(maxlen=10000)
        self.first_iter=True
        self.loss_val=0
        
        
        self.model_name=model_name
        self.gamma=0.95
        self.epsilon=1
        self.epsilon_decay=epsilon_decay
        self.epsilon_min=0.01
        self.learning_rate=0.001
        self.loss=nn.HuberLoss()
        self.custom_objects={"huber_loss":nn.HuberLoss()}
        
        
        if pretrained and self.model_name is not None:
            self.model=self.load()
        else:
            self.model=DQN_network(lr=self.learning_rate,input_dims=self.state_size,output_dims=self.action_size)
            
        if self.strategy in ["t-dqn","double-dqn"]:
            self.n_iter=1
            self.reset_every=reset_every
            
            self.target_model=copy.deepcopy(self.model)
    
    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self,state,is_eval=False):
        if not is_eval and random.random() <=self.epsilon:
            return random.randrange(self.action_size)
        
        if self.first_iter:
            self.first_iter=False
            return 1
        device=T.device("cuda:0" if T.cuda.is_available() else "cpu")
        state_tensor=T.FloatTensor(state).to(device)
        action_probs=self.model.forward(state_tensor)
        return T.argmax(action_probs).item()
    
    def train_exp_relay(self,batch_size):
        batch=random.sample(self.memory,batch_size)
        X_train,y_train = [],[]
        states,actions,rewards,next_states,dones=zip(*batch)
        
        batch_index=np.arange(batch_size,dtype=np.int32)
        device=T.device("cuda:0" if T.cuda.is_available() else "cpu")
        states = T.FloatTensor(states).to(device)
        next_states = T.FloatTensor(next_states).to(device)
        rewards = T.FloatTensor(rewards).to(device)
        dones = T.FloatTensor(dones).to(device)
        
        if self.strategy=="dqn":
            with T.no_grad():
                max_next_q_values=self.model.forward(next_states).max(dim=1)[0]
            max_next_q_values[dones==1]=0.0
            target_values=rewards+self.gamma*max_next_q_values
            
            
        elif self.strategy=="t-dqn":
            if self.n_iter%self.reset_every==0:
                self.target_model.load_state_dict(self.model.state_dict())
            with T.no_grad():
                max_next_q_values=self.target_model.forward(next_states).max(dim=1)[0]
            max_next_q_values[dones==1]=0.0
            target_values=rewards+self.gamma*max_next_q_values
            
        elif self.strategy=="double-dqn":
            if self.n_iter%self.reset_every==0:
                self.target_model.load_state_dict(self.model.state_dict())
            with T.no_grad():
                action=T.argmax(self.model.forward(next_states),dim=1)
                max_next_q_values=self.target_model.forward(next_states)[batch_index,action]
            max_next_q_values[dones==1]=0.0
            target_values=rewards+self.gamma*max_next_q_values
            
        else:
            raise NotImplementedError()
        

        predicted_vals=self.model.forward(states)[batch_index,actions]
        
        self.model.optimizer.zero_grad()
    
        loss=self.loss(predicted_vals,target_values)
        self.loss_val=loss
        loss.backward()
        self.model.optimizer.step()
        
        if self.epsilon>self.epsilon_min:
            self.epsilon*=self.epsilon_decay
            
        return loss
                
            
        
            
            
                
                
            
                
        
        

In [52]:
print(random.randrange(4))

2


In [48]:
batch_index=np.arange(32,dtype=np.int32)
tensor = T.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]])
a=2

In [49]:
tensor[:,a]

tensor([3, 6, 9])

In [9]:
print(T.cuda.is_available())

True


In [149]:
class StockTradingEnv:
    def __init__(self,data,initial_balance=10000):
        self.data=data
        self.initial_balance=initial_balance
        self.current_step=0
        self.balance=initial_balance
        self.holdings=0
        self.net_worth=initial_balance
        self.max_steps=len(data)
    
    def reset(self):
        self.current_step=0
        self.balance=self.initial_balance
        self.holdings=0
        self.net_worth=self.initial_balance
        state=self.data.iloc[self.current_step].values[1:]
        return list(state)
    
    def get_state(self):
        state=self.data.iloc[self.current_step].values[1:]
        return list(state)
    
    def step(self,action):
        
        current_price = self.data.iloc[self.current_step]['Close']
        self.current_price=current_price
        reward=0
        if action==1:#buy
            if self.balance>current_price:
                self.previous_balance=self.balance
                self.holdings+=self.balance/current_price
                self.balance=0
                
        elif action==2:#sell
            if self.holdings>0:
                self.balance+=self.holdings*current_price
                self.holdings=0
                
                self.net_worth=self.balance+self.holdings*current_price
                reward=self.net_worth-self.initial_balance
        
        self.current_step+=1
        done=self.current_step>=self.max_steps-1
        
        return self.get_state(),reward,done,{}
        

In [150]:
import pandas as pd

data=pd.read_csv('MSFT.csv')
data=data[-1000:]
episodes=100
batch_size=32
env=StockTradingEnv(data)
agent=Agent(state_size=[6],epsilon_decay=0.9995,strategy="double-dqn")
reward_history=[]
for episode in range(episodes):
    state=env.reset()
    total_reward=0
    while True:
        action=agent.act(state)
        next_state,reward,done,info=env.step(action)
        agent.remember(state,action,reward,next_state,done)
        
        state=next_state
        total_reward+=reward
        
        if done:
            break
        
        if len(agent.memory)>batch_size:
            agent.train_exp_relay(batch_size)
    reward_history.append(total_reward)
    if episode>100:
        avg_reward=sum(reward_history[-100:])/100
    else:
        avg_reward=sum(reward_history[-100:])/(episode+1)
    print(f'Episode: {episode+1}, Total Reward: {total_reward},avg_reward: {avg_reward}, Epsilon:{agent.epsilon},Loss:{agent.loss_val},Money earned:{env.balance+env.holdings*env.current_price}')
        

Episode: 1, Total Reward: 85090.97444939101,avg_reward: 85090.97444939101, Epsilon:0.6168553087570796,Loss:302167.125,Money earned:16665.08961023402
Episode: 2, Total Reward: 220038.503922344,avg_reward: 152564.7391858675, Epsilon:0.37446925262555913,Loss:275048.4375,Money earned:18929.92473126836
Episode: 3, Total Reward: 160457.13632224407,avg_reward: 155195.53823132635, Epsilon:0.22732595338198971,Loss:296950.375,Money earned:20321.91637342433
Episode: 4, Total Reward: 49802.836210972135,avg_reward: 128847.3627262378, Epsilon:0.13800088717218023,Loss:227507.6875,Money earned:16731.39750207785
Episode: 5, Total Reward: 46727.46154194088,avg_reward: 112423.38248937842, Epsilon:0.08377505769571193,Loss:159605.046875,Money earned:15124.928354663909
Episode: 6, Total Reward: 32746.583322885926,avg_reward: 99143.91596162967, Epsilon:0.05085663169080453,Loss:167078.9375,Money earned:18221.0442022839
Episode: 7, Total Reward: -1766.6617585586646,avg_reward: 84728.11914445991, Epsilon:0.0308

KeyboardInterrupt: 

In [63]:
import pandas as pd

data=pd.read_csv('MSFT.csv')

24.49

In [105]:
env=StockTradingEnv(data)

In [78]:
print(env.holdings)

408.32993058391185


In [81]:
env.data.iloc[0].shape

(7,)

In [109]:
state=env.step(0)


In [114]:
print(list(state[0]))

[25.709999, 26.08, 25.610001, 25.84, 21.241688, 59514000]
