### Import the required packages

In [32]:
import os
from pathlib import Path

from dotenv import load_dotenv
from IPython.display import Markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cerebras import ChatCerebras
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders.notebook import NotebookLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

load_dotenv()


True

### Initialize loaders


In [3]:
# Directory loader pointed at the PDFs under ./PDF_DOC/PDF; the documents
# themselves are produced by the .load() call in the next cell.
pdf_loader = PyPDFDirectoryLoader(path="./PDF_DOC/PDF")


In [6]:
# Parse the PDFs into Document objects (the metadata carries source, page and
# total_pages). The "Ignoring wrong pointing object ..." lines printed below are
# parser warnings about the PDF files; the load still returns documents.
pdf_documents = pdf_loader.load()


Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 124 0 (offset 0)
Ignoring wrong pointing object 151 0 (offset 0)
Ignoring wrong pointing object 193 0 (offset 0)
Ignoring wrong pointing object 200 0 (offset 0)
Ignoring wrong pointing object 400 0 (offset 0)
Ignoring wrong pointing object 411 0 (offset 0)
Ignoring wrong pointing object 506 0 (offset 0)
Ignoring wrong pointing object 655 0 (offset 0)
Ignoring wrong pointing object 683 0 (offset 0)
Ignoring wrong pointing object 271 0 (offset 0)
Ignoring wrong pointing object 334 0 (offset 0)
Ignoring wrong pointing object 355 0 (offset 0)
Ignoring wrong pointing object 463 0 (offset 0)
Ignoring wrong pointing object 549 0 (offset 0)
Ignoring wrong pointing object 553 0 (offset 0)
Ignoring wrong pointing object 557 0 (offset 0)
Ignoring wrong pointing object 561 0 (offset

In [9]:
def load_notebooks_from_folder(folder_path, include_outputs=False, max_output_length=10, remove_newline=False, traceback=False):
    """Load every ``*.ipynb`` file directly inside ``folder_path`` into documents.

    Args:
        folder_path: Directory to scan for notebooks (not recursive).
        include_outputs: Forwarded to ``NotebookLoader``.
        max_output_length: Forwarded to ``NotebookLoader``.
        remove_newline: Forwarded to ``NotebookLoader``.
        traceback: Forwarded to ``NotebookLoader``.

    Returns:
        A flat list holding the documents of all notebooks, concatenated in
        sorted path order. Empty when the folder contains no notebooks.
    """
    docs = []
    # Path.glob yields entries in arbitrary, filesystem-dependent order.
    # Sorting keeps the document order (and therefore the chunk / index order
    # built from it later) reproducible across runs and machines.
    for ipynb_path in sorted(Path(folder_path).glob("*.ipynb")):
        loader = NotebookLoader(
            str(ipynb_path),
            include_outputs=include_outputs,
            max_output_length=max_output_length,
            remove_newline=remove_newline,
            traceback=traceback,
        )
        docs.extend(loader.load())
    return docs


In [10]:
# Load the notebooks under ./PDF_DOC/Notebook/. Cell outputs are kept
# (include_outputs=True) and max_output_length is raised from the helper's
# default of 10 to 1000 so longer outputs are not cut short.
documents = load_notebooks_from_folder(
    "./PDF_DOC/Notebook/",
    include_outputs=True,
    max_output_length=1000,
    remove_newline=False,
)


In [None]:
# Preview only the first couple of notebook documents; echoing the whole list
# floods the cell output and bloats the saved notebook.
documents[:2]


[Document(metadata={'source': 'PDF_DOC\\Notebook\\Ch_3_MDP.ipynb'}, page_content='\'markdown\' cell: \'[\'# Finite Markov Decision Processes\\n\', \'\\n\', \'In this chapter,\\n\', \'- You will learn about the core components of reinforcement learning.\\n\', \'- You will learn to represent sequential decision-making problems as reinforcement learning environments using a mathematical framework known as Markov decision processes.\\n\', \'- You will build from scratch environments that reinforcement learning agents learn to solve in later chapters.\']\'\n\n\'markdown\' cell: \'[\'## The Agent–Environment Interface\\n\', \'\\n\', \'![image.png](attachment:de944f75-b6e9-452a-ab51-5c8f57a981df.png)\\n\', \'\\n\', \'To better understand the interactions between the agent and the environment, we can unroll the interaction loop as follows, as shown in the following figure.\\n\', \'\\n\', \'![image.png](attachment:937ed64a-077d-4921-b6ff-f3cea47d9b5d.png)\\n\', \'\\n\', \'- The agent observes t

In [None]:
# Preview only the first couple of PDF pages; echoing the whole list floods
# the cell output and bloats the saved notebook.
pdf_documents[:2]


[Document(metadata={'producer': 'iOS Version 14.7.1 (Build 18G82) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20210907021931Z00'00'", 'moddate': '2025-06-16T20:08:03+05:30', 'source': 'PDF_DOC\\PDF\\RL Chapter 02.pdf', 'total_pages': 28, 'page': 0, 'page_label': '1'}, page_content='2.\nT-nmedia-teReinforcementl-earn.mg/Leycharacteristicsof an RL problem:-\n-\n① Learningto act in manysituations.\n②Delayedrewardsandcreditassignment.\n③ Exploration/Exploitationdilemma.\nExploration/Exploitationdilemmais a uniquechallenge\nin RL that doesnot exist in other learning\nparadigms.\nThis is due to the natureof feedback\nto the RL agent.\nTwotypesof feedback/ Instructivefeedback\n⊥Evaluativefeedback.\no→a-→ At t'),
 Document(metadata={'producer': 'iOS Version 14.7.1 (Build 18G82) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20210907021931Z00'00'", 'moddate': '2025-06-16T20:08:03+05:30', 'source': 'PDF_DOC\\PDF\\RL Chapter 02.pdf', 'total_pages': 28, 'page': 1, 'page_

In [None]:
# Merge both corpora into one list (PDF documents first, then the notebook
# documents) so they are split and indexed together.
final_documents = pdf_documents + documents


In [23]:
# Split documents into small overlapping chunks: chunk_size=300 with
# chunk_overlap=50, so text cut at a chunk boundary also appears at the start
# of the following chunk.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm if a token-based limit was intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
texts = text_splitter.split_documents(final_documents)


In [None]:
# Embedding client. The API key is taken from the NVIDIA_API_KEY environment
# variable (load_dotenv() ran in the import cell) instead of being hard-coded.
embeddings = NVIDIAEmbeddings(
    nvidia_api_key=os.getenv("NVIDIA_API_KEY"),
)


In [27]:
# Create FAISS vector store: every chunk in `texts` is embedded with
# `embeddings` and added to an index held in memory as `db`.
db = FAISS.from_documents(texts, embeddings)


In [None]:
# Persist the index to the local folder "faiss_index_rl_learn" so it can be
# loaded again with FAISS.load_local.
db.save_local("faiss_index_rl_learn")


In [29]:
# Load the index back from the "faiss_index_rl_learn" folder.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# stored docstore, and unpickling can execute arbitrary code. Only load index
# folders you created yourself (this one is written by db.save_local above).
# NOTE(review): rl_vectorstore is not referenced again in the visible cells;
# the retriever further down is built from the in-memory `db`.
rl_vectorstore = FAISS.load_local("faiss_index_rl_learn", embeddings, allow_dangerous_deserialization=True)


In [None]:
# Chat model used by the non-streaming chain; the key is read from the
# CEREBRAS_API_KEY environment variable.
llm = ChatCerebras(
    model="gpt-oss-120b",
    api_key=os.getenv("CEREBRAS_API_KEY"),
)


In [66]:
# RAG prompt template with two placeholders:
#   {context} - retrieved chunk text, wrapped in <context> tags
#   {query}   - the user's question, wrapped in <query> tags
# NOTE(review): the instruction sentences are ungrammatical and ambiguous
# ("outof domain query not allowed here domain is context") and do not say what
# to reply when the context lacks the answer. The text is left untouched here
# because it is runtime behaviour; consider rewording it as a separate change.
prompt = ChatPromptTemplate.from_template(
"""

Use Given Context to assist user query.

Ensure response must be from context and domain specific outof domain query not allowed here domain is context.

Brainstorm User with your context based responses.

INPUTS:
<context>
{context}
</context>

<query>
{query}
</query>
"""
)


In [67]:
# Retriever over the in-memory index, created with no search arguments.
# NOTE(review): this is built from `db`, so the index must have been built in
# this kernel session; rl_vectorstore.as_retriever() would serve from the copy
# loaded from disk instead.
retriever = db.as_retriever()


In [68]:
# Pipeline composed with `|`: fill the prompt, call the chat model, then pass
# the reply through StrOutputParser.
chain = prompt | llm | StrOutputParser()


In [69]:
# Retrieval + context injection
def answer_with_context(query):
    """Answer ``query`` using chunks retrieved from the vector store as context.

    Args:
        query: The user's question as plain text. It is used both to retrieve
            chunks and to fill the ``{query}`` slot of the prompt.

    Returns:
        The result of ``chain.invoke`` for the filled prompt.
    """
    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` method is deprecated.
    docs = retriever.invoke(query)
    # Blank lines between chunks keep them visually separate inside <context>.
    context = "\n\n".join(d.page_content for d in docs)
    return chain.invoke({"context": context, "query": query})


In [72]:
# Render the answer with Markdown (imported in the notebook's import cell) so
# the headings, code fences and tables in the reply display as rich output.
Markdown(answer_with_context("Explain bandit algorithm use code examples"))


Below is a short “play‑by‑play’’ of the bandit ideas that appear in the excerpt you gave (Figure 2.1, Figure 2.6 and the code fragment at the end).  
I’ll walk through the most common bandit strategies that are mentioned in the text – **ε‑greedy**, **optimistic‑initialization**, **UCB** and **gradient‑bandit** – and give a tiny Python example for each.  
All of the examples reuse the same simple **Bernoulli‑bandit** environment that the snippet shows:

```python
import numpy as np
import random

# ----- 3‑armed Bernoulli bandit (the “true” click‑through‑rates) -----
true_ctr = [0.05, 0.10, 0.20]                     # q* (a) for A, B, C
articles = ['A', 'B', 'C']

def bandit_step(action):
    """Return 1 (reward) with probability true_ctr[action],
       otherwise 0. This is the stochastic reward generator."""
    return 1 if np.random.rand() < true_ctr[action] else 0
```

The **action‑value estimate** for each arm, \(Q_t(a)\), is the running average of the rewards observed for that arm.  
In the textbook notation \(q_*(a)\) is the unknown true value (the numbers above), while \(Q_t(a)\) is the learner’s current guess.

---

## 1. ε‑greedy (the “greedy‑with‑exploration” rule)

*Idea from the text*: “greedy with optimistic initialization” or “ε‑greedy” is the baseline method plotted in Figure 2.6.  
With probability \(ε\) we explore (pick a random arm); otherwise we exploit the arm with the highest estimated value.

```python
def epsilon_greedy(eps=0.1, steps=1000):
    n_actions = len(true_ctr)
    Q = np.zeros(n_actions)          # Q_0(a) = 0 for all a
    N = np.zeros(n_actions)          # count of pulls per arm
    total_reward = 0

    for t in range(steps):
        # --- choose action ---
        if random.random() < eps:                 # explore
            a = random.randint(0, n_actions-1)
        else:                                      # exploit
            a = np.argmax(Q)

        # --- take step and observe reward ---
        r = bandit_step(a)
        total_reward += r

        # --- update estimates (sample‑average) ---
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]                 # incremental average

    print(f'ε‑greedy (ε={eps}) avg. reward: {total_reward/steps:.3f}')
    print('Final Q values:', Q)
```

Running `epsilon_greedy()` will converge toward the best arm (C, with true CTR 0.20) while still occasionally trying the poorer arms.

---

## 2. Optimistic‑initialization (a variant of ε‑greedy)

*Idea from the text*: “greedy with optimistic initialization” starts each \(Q_0(a)\) at a high value (e.g., 5).  
Because all arms look attractive initially, the algorithm is forced to **explore** until the estimates drop down to realistic levels.

```python
def optimistic_greedy(initial=5.0, steps=1000):
    n_actions = len(true_ctr)
    Q = np.full(n_actions, initial)   # all Q_0(a) = 5.0 (optimistic)
    N = np.zeros(n_actions)
    total_reward = 0

    for t in range(steps):
        a = np.argmax(Q)                     # always greedy
        r = bandit_step(a)
        total_reward += r

        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]

    print(f'optimistic‑greedy avg. reward: {total_reward/steps:.3f}')
    print('Final Q values:', Q)
```

Because the true rewards are at most 1, the optimistic start quickly pushes the estimate of each arm down after a few pulls, automatically generating exploration without an explicit ε.

---

## 3. Upper‑Confidence‑Bound (UCB)

*Idea from the text*: Figure 2.6 includes a “UCB” curve.  
UCB selects the arm with the highest **upper confidence bound**:

\[
a_t = \arg\max_a\Bigl(Q_t(a) + c\sqrt{\frac{\ln t}{N_t(a)}}\Bigr)
\]

The second term is large for rarely‑chosen arms, guaranteeing systematic exploration.

```python
def ucb(c=2.0, steps=1000):
    n_actions = len(true_ctr)
    Q = np.zeros(n_actions)
    N = np.zeros(n_actions)
    total_reward = 0

    # Pull each arm once to avoid division‑by‑zero
    for a in range(n_actions):
        r = bandit_step(a)
        Q[a] = r
        N[a] = 1
        total_reward += r

    for t in range(n_actions, steps):
        ucb_values = Q + c * np.sqrt(np.log(t) / N)
        a = np.argmax(ucb_values)

        r = bandit_step(a)
        total_reward += r

        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]

    print(f'UCB (c={c}) avg. reward: {total_reward/steps:.3f}')
    print('Final Q values:', Q)
```

UCB usually attains a higher average reward than plain ε‑greedy because it balances exploration **optimally** based on statistical confidence.

---

## 4. Gradient Bandit (policy‑gradient style)

*Idea from the text*: “gradient‑bandit” appears as another curve in Figure 2.6.  
Instead of estimating values, we keep a **preference** \(H_t(a)\) for each arm and turn them into a stochastic policy via a soft‑max:

\[
\pi_t(a) = \frac{e^{H_t(a)}}{\sum_b e^{H_t(b)}} .
\]

After each reward we update the preferences toward actions that performed better than the *average* reward.

```python
def gradient_bandit(alpha=0.1, steps=1000):
    n_actions = len(true_ctr)
    H = np.zeros(n_actions)          # preferences H_0(a) = 0
    avg_reward = 0.0
    total_reward = 0

    for t in range(1, steps+1):
        # compute soft‑max policy
        exp_H = np.exp(H - np.max(H))      # numeric stability
        pi = exp_H / np.sum(exp_H)

        # sample action according to pi
        a = np.random.choice(n_actions, p=pi)

        # observe reward
        r = bandit_step(a)
        total_reward += r

        # incremental average reward
        avg_reward += (r - avg_reward) / t

        # update preferences
        for i in range(n_actions):
            indicator = 1.0 if i == a else 0.0
            H[i] += alpha * (r - avg_reward) * (indicator - pi[i])

    print(f'gradient‑bandit (α={alpha}) avg. reward: {total_reward/steps:.3f}')
    print('Final preferences H:', H)
    print('Resulting policy π:', np.exp(H)/np.sum(np.exp(H)))
```

The gradient method directly learns a **probability distribution** over actions.  
When a pull yields a reward larger than the running average, the probability of that arm is increased; otherwise it is decreased.

---

## 5. Putting it together – a quick comparison

```python
if __name__ == "__main__":
    steps = 2000
    epsilon_greedy(eps=0.1, steps=steps)
    optimistic_greedy(initial=5.0, steps=steps)
    ucb(c=2.0, steps=steps)
    gradient_bandit(alpha=0.1, steps=steps)
```

Running the block will output something like:

```
ε‑greedy (ε=0.1) avg. reward: 0.156
optimistic‑greedy avg. reward: 0.162
UCB (c=2.0) avg. reward: 0.167
gradient‑bandit (α=0.1) avg. reward: 0.155
```

(The exact numbers vary because the environment is stochastic, just as the Figure 2.6 plot shows a spread of results for the same algorithms.)

---

### What the figures in the text illustrate

* **Figure 2.1** – a concrete 10‑armed testbed (the true values \(q_*(a)\) are plotted).  
  Our three‑armed example is a miniature version of that same set‑up.

* **Figure 2.6** – a **parameter study**: each curve (ε‑greedy, UCB, gradient, optimistic) is the average reward over the first 1 000 steps for many runs.  
  The code snippets above reproduce the same idea on a much smaller scale (you can change `steps` to 1000 to match the figure).

* **Index entries** – the list mentions “k‑armed bandits”, “action preferences”, “gradient bandit”, etc.; the functions `gradient_bandit` and `ucb` directly correspond to those entries.

---

## TL;DR

| Algorithm | Core Idea | Exploration Mechanism | Typical Code Hook |
|-----------|-----------|----------------------|-------------------|
| ε‑greedy | Keep sample‑average \(Q_t(a)\). Choose best arm most of the time, random arm with prob ε. | Random jumps (ε). | `if random()<ε: random action else: argmax(Q)` |
| Optimistic‑init. | Same as greedy, but start \(Q_0(a)\) high → forced exploration. | Implicit via high initial values. | `Q = np.full(k, high_value)` |
| UCB | Upper‑confidence bound adds bonus \(\propto \sqrt{\frac{\ln t}{N_t(a)}}\). | Systematic, theoretically efficient. | `a = argmax(Q + c*sqrt(log(t)/N))` |
| Gradient | Maintain preferences \(H_t(a)\); derive policy via soft‑max. | Policy‑gradient update toward better‑than‑average actions. | `H += α*(r−avg)*(I[a]==1−π)` |

All four methods operate on the same **bandit** framework that the excerpt describes: a set of actions (arms), stochastic rewards, and a learning rule that updates estimates or preferences after each pull. Feel free to experiment with the `steps`, `ε`, `c`, or `α` parameters to see how the curves in Figure 2.6 would move. Happy pulling!

### Streaming tab


In [None]:
# Client for the streaming demo. It is constructed with the same arguments as
# `llm`; the streaming comes from calling .stream() on the chain in
# stream_answer, not from a constructor flag.
llm_streaming = ChatCerebras(
    model="gpt-oss-120b",
    api_key=os.getenv("CEREBRAS_API_KEY"),
)


In [73]:
# Chain: same prompt -> model -> StrOutputParser pipeline as `chain`, built on
# `llm_streaming` and consumed through .stream() in stream_answer.
chain_streaming = prompt | llm_streaming | StrOutputParser()


In [79]:
# Streaming function
def stream_answer(query):
    """Print the answer to ``query`` piece by piece as the chain produces it.

    Args:
        query: The user's question as plain text. It is used both to retrieve
            chunks and to fill the ``{query}`` slot of the prompt.

    Returns:
        None. The answer is written to stdout only.
    """
    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` method is deprecated.
    docs = retriever.invoke(query)
    context = "\n\n".join(d.page_content for d in docs)

    for chunk in chain_streaming.stream({"context": context, "query": query}):
        # end="" keeps the pieces contiguous; flush=True shows each one immediately.
        print(chunk, end="", flush=True)


In [80]:
# Demo: the answer is printed incrementally below; stream_answer returns
# nothing, so the cell has no Out[] value.
stream_answer("Generate python code for  SARSA problem from scratch")


Below is a **stand‑alone Python script** that implements the SARSA(λ) algorithm (plain SARSA is a special case with λ = 0) from scratch for the classic *FrozenLake* environment.  
All of the pieces shown in the provided context are used:

* the `sarsa` function signature (`env, num_episodes=5000, alpha=0.1, gamma=0.95, epsilon=0.1, log_interval=100`)  
* creation of a random 4 × 4 lake (`CustomFrozenLakeEnv` with `generate_random_map`)  
* optional recording of episode statistics (`gym.wrappers.RecordEpisodeStatistics`)  
* a progress bar (`tqdm`)  

You can copy‑paste the whole block into a file (e.g. `sarsa_frozenlake.py`) and run it with a standard Python 3 interpreter.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
SARSA implementation for a (random) FrozenLake environment.
The code follows the template that appears in the supplied context,
but is written from scratch so that it can be executed as a
stand‑alone script.

Requirements
------------
- numpy
- gym (>=0.21