# Reinforcement Learning Based Trading Agent

Follow the instructions step by step and fill in the TODOs


## 1. Install and Import Libraries

In [21]:

# Uncomment only if needed
!pip install yfinance numpy pandas matplotlib

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt




## 2. Download Market Data

In [33]:

# TODO: choose a stock symbol (e.g., "AAPL", "MSFT", "GOOG")
symbol = "AAPL"

# TODO: download historical stock price data using yfinance
# Hint: use yf.download with a start and end date
data = yf.download(symbol, start="2018-01-01", end="2020-01-01")

# TODO: extract ONLY the closing prices
# IMPORTANT: flatten the array so each price is a scalar (fixes NumPy state error)
# Train-test split
prices = data["Close"].dropna()

train_size = int(0.7 * len(prices))
train_prices = prices[:train_size]
test_prices = prices[train_size:]

print("Training samples:", len(train_prices))
print("Testing samples:", len(test_prices))


  data = yf.download(symbol, start="2018-01-01", end="2020-01-01")
[*********************100%***********************]  1 of 1 completed

Training samples: 352
Testing samples: 151





## 3. Trading Environment

In [34]:
# TODO: Define a custom trading environment for Reinforcement Learning

class TradingEnv:
    def __init__(self, prices):
        # TODO: store historical prices
        self.prices = prices.values

        # TODO: reset environment to initial state
        self.reset()

    def reset(self):
        # TODO: initialize time step
        self.t = 0

        # TODO: initialize starting cash
        self.cash = 500

        # TODO: initialize stock holding
        # 0 = no stock, 1 = holding stock
        self.stock = 0
        self.hold_time = 0

        # TODO: initialize done flag
        self.done = False

        # TODO: return initial state
        return self._get_state()

    def _get_state(self):
        # TODO: define state as a NumPy array
        # State should contain:
        # 1. current price
        # 2. stock holding (0 or 1)
        if self.t < 5:
            return (0, 0, self.stock)

        short_ma = np.mean(self.prices[self.t-3:self.t])
        long_ma = np.mean(self.prices[self.t-10:self.t])

        ma_signal = int(short_ma > long_ma)
        momentum = int(self.prices[self.t] > self.prices[self.t-1])

        hold_bucket = min(self.hold_time, 5)
        return (ma_signal, momentum, hold_bucket, self.stock)
    def step(self, action):
        # TODO: get current stock price
        price = self.prices[self.t]

        # TODO: define action logic
        # Action 0 → Hold (do nothing)
        # Action 1 → Buy (only if enough cash)
        # Action 2 → Sell (only if holding stock)
        prev_value = self.cash + self.stock * price
        if self.stock == 1:
            self.hold_time += 1
        else:
            self.hold_time = 0
        if action == 0:
          pass

        elif action == 1 and self.cash >= price and self.stock == 0:
            self.stock = 1
            self.cash -= price

        elif action == 2 and self.stock == 1:
            self.stock = 0
            self.cash += price
        if self.stock == 1:
           self.hold_time += 1
        else:
           self.hold_time = 0
        # TODO: move to next time step
        self.t += 1
        new_price = self.prices[self.t]
        # TODO: check termination condition
        if self.t >= len(self.prices) - 1:
            done = True
            new_price = price
        else:
            done = False
            new_price = self.prices[self.t]

        # TODO: define reward (portfolio value)
        reward = 0

        if action == 1 and self.stock == 1:
            reward += 0.001   # small incentive to enter market

        if self.stock == 1:
            reward += np.log(new_price / price)

        if done and self.stock == 1:
            self.cash += new_price
            self.stock = 0
        # TODO: return next_state, reward, done
        return self._get_state(), reward, done


## 4. Q-Learning Setup

In [35]:

# TODO: Initialize the Q-table
# Hint: number of states = number of time steps
# Hint: number of actions = 3 (Hold, Buy, Sell)

# State space: (price_trend, holding)
states = [(a,b,c,d)
          for a in [0,1]
          for b in [0,1]
          for c in range(6)
          for d in [0,1]]

# Actions: 0 = Hold, 1 = Buy, 2 = Sell
actions = [0, 1, 2]

from collections import defaultdict

Q = defaultdict(lambda: {a: 0.0 for a in actions})

# TODO: set learning rate (alpha)
alpha = 0.1

# TODO: set discount factor (gamma)
gamma = 0.95

# TODO: set exploration rate (epsilon)
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995



## 5. Train the Agent

In [36]:


# TODO: create trading environment
env = TradingEnv(train_prices)

# TODO: set number of training episodes
episodes = 1000

# TODO: training loop
for episode in range(episodes):

    # TODO: reset environment at start of each episode
    state = tuple(env.reset())
    done = False
    # TODO: loop until episode ends
    while not done:

        # TODO: get current state index (time step)
        t = env.t

        # TODO: epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)
        else:
            action = max(Q[state], key=Q[state].get)

        # TODO: take action in environment
        next_state, reward, done = env.step(action)
        next_state = tuple(next_state)

        # TODO: update Q-value using Bellman equation
        Q[state][action] += alpha * (
            reward + gamma * max(Q[next_state].values()) - Q[state][action]
        )

        state = next_state
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# TODO: indicate training completion
print("Training completed.")



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  momentum = int(self.prices[self.t] > self.prices[self.t-1])


Training completed.


## 6. Evaluate Trained Agent

In [37]:

# TODO: create a new environment for evaluation
env = TradingEnv(test_prices)
state = env.reset()
done = False

portfolio_values = []

# TODO: run the trained agent without exploration
while not done:

    # TODO: get current state index (time step)
    t = env.t

    # TODO: select best action from Q-table
    action = max(Q[state], key=Q[state].get)

    # TODO: apply action in environment
    state, _, done = env.step(action)
    portfolio_values.append(env.cash + env.stock * env.prices[min(env.t, len(env.prices)-1)])

# TODO: compute final portfolio value
final_value = portfolio_values[-1]

# TODO: print final result
print("RL Agent Final Portfolio Value:", final_value)



RL Agent Final Portfolio Value: [517.18130493]


  momentum = int(self.prices[self.t] > self.prices[self.t-1])


## 7. Buy and Hold Baseline

In [19]:
# TODO: implement Buy-and-Hold baseline strategy
# Instructions:
# - Buy one stock on the first day
# - Hold it until the last day
# - Start with initial cash of 10000


initial_cash = 500

shares = 1

buy_and_hold_value = shares * test_prices.iloc[-1] + initial_cash - shares * test_prices.iloc[0]

# TODO: print Buy-and-Hold portfolio value
print("Buy-and-Hold Final Portfolio Value:", buy_and_hold_value)



Buy-and-Hold Final Portfolio Value: Ticker
AAPL    491.319199
dtype: float64
