In [1]:
from pathlib import Path
import PyPDF2
import re
import json
from typing import Dict, List, Optional
import spacy

In [2]:
PDF_PATH = Path(r"data\raw\BartoSutton.pdf") # add your own path
PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
class BookParser:
    def __init__(self):
        # Pattern for main chapter starts
        self.chapter_pattern = re.compile(r'^Chapter\s+(\d+)\s*$\s*([^\n]+)', re.MULTILINE)
        
        # Pattern for sections 
        self.section_pattern = re.compile(r'^(\d+\.\d+)\s+([^\n]+)')
        
        # Pattern to remove headers/footers (matches format from Image 3)
        self.header_pattern = re.compile(r'^\d+\s+Chapter \d+:.+$', re.MULTILINE)
        
        # Pattern to identify part headers (like "I Tabular Solution Methods")
        self.part_pattern = re.compile(r'^(I{1,3}|IV)\s+([^\n]+)$', re.MULTILINE)
        
    def clean_text(self, text: str) -> str:
        """Remove headers, footers, and extra whitespace"""
        # Remove headers/footers
        text = self.header_pattern.sub('', text)
        # Remove multiple spaces and normalize newlines
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    def parse_chapter(self, text: str, chapter_num: int) -> Dict:
        """Parse a single chapter's content and structure"""
        sections = {}
        current_section = None
        section_content = []
        
        # Split into lines and process
        lines = text.split('\n')
        for line in lines:
            section_match = self.section_pattern.match(line)
            if section_match:
                # Save previous section
                if current_section:
                    sections[current_section[0]] = {
                        'title': current_section[1],
                        'content': '\n'.join(section_content)
                    }
                # Start new section
                current_section = (section_match.group(1), section_match.group(2))
                section_content = [line]
            elif current_section:
                section_content.append(line)
        
        # Don't forget last section
        if current_section:
            sections[current_section[0]] = {
                'title': current_section[1],
                'content': '\n'.join(section_content)
            }
            
        return sections

    def process_book(self, text: str) -> Dict[int, Dict]:
        """Process entire book"""
        chapters = {}
        
        # Find all chapter starts
        for match in self.chapter_pattern.finditer(text):
            chapter_num = int(match.group(1))
            chapter_title = match.group(2).strip()
            
            # Get chapter content
            start_pos = match.start()
            end_pos = len(text)
            # Find next chapter start if exists
            next_match = self.chapter_pattern.search(text, pos=match.end())
            if next_match:
                end_pos = next_match.start()
                
            chapter_content = text[start_pos:end_pos]
            
            # Parse chapter sections
            sections = self.parse_chapter(chapter_content, chapter_num)
            
            chapters[chapter_num] = {
                'title': chapter_title,
                'content': self.clean_text(chapter_content),
                'sections': sections
            }
        
        return chapters

In [4]:
def process_book():
    # Read PDF
    print("Reading PDF...")
    with open(PDF_PATH, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
    
    # Process book
    parser = BookParser()
    chapters = parser.process_book(text)
    
    # Save results
    for chapter_num, chapter_data in sorted(chapters.items()):
        # Save chapter content
        chapter_file = PROCESSED_DIR / f"chapter_{chapter_num:02d}.txt"
        with open(chapter_file, 'w', encoding='utf-8') as f:
            f.write(chapter_data['content'])
        
        # Save metadata
        metadata_file = PROCESSED_DIR / f"chapter_{chapter_num:02d}_meta.json"
        metadata = {
            'title': chapter_data['title'],
            'sections': {num: data['title'] for num, data in chapter_data['sections'].items()}
        }
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
        
        # Print summary
        print(f"\nChapter {chapter_num}: {chapter_data['title']}")
        print("Sections:")
        for section_num, section_data in sorted(chapter_data['sections'].items()):
            print(f"  {section_num} {section_data['title']}")
            
    return chapters

In [5]:
if __name__ == "__main__":
    chapters = process_book()

Reading PDF...

Chapter 1: Introduction
Sections:
  1.1 Reinforcement Learning
  1.2 The example of Phil’s breakfast in this chapter was inspired by Agre (1988).
  1.3 Elements of Reinforcement Learning
  1.4 Limitations and Scope
  1.5 The temporal-di↵erence method used in the tic-tac-toe example is developed in
  1.6 Summary
  1.7 Early History of Reinforcement Learning

Chapter 2: Multi-armed Bandits
Sections:
  2.1 Bandit problems have been studied in statistics, engineering, and psychology. In
  2.10 Bellman (1956) was the ﬁrst to show how dynamic programming could be used
  2.2 Action-value methods for our k-armed bandit problem were ﬁrst proposed by
  2.3 The 10-armed Testbed
  2.4 Incremental Implementation
  2.5 Tracking a Nonstationary Problem
  2.6 Optimistic initialization was used in reinforcement learning by Sutton (1996).
  2.7 Early work on using estimates of the upper conﬁdence bound to select actions
  2.8 Gradient bandit algorithms are a special case of the gradient-

the chapters are separated correctly but the sections titles are not, we will fix that problem in the upcoming cells

In [6]:
class TextCleaner:
    def __init__(self):
        # Patterns for cleaning
        self.equation_pattern = re.compile(r'NULL?:?[A-Za-z0-9\(\)\[\]\{\}\+\-\*\/\=\>\<\!\~\#\$\%\^\&\_\|\.\,\;\:]+')
        self.null_pattern = re.compile(r'NULL\s*')
        self.multiple_spaces = re.compile(r'\s+')
        self.header_footer = re.compile(r'^\d+\s*Chapter \d+:.+$', re.MULTILINE)
        
    def clean_equations(self, text: str) -> str:
        """Clean equations and mathematical expressions"""
        # Replace equation-like patterns with [EQUATION]
        text = self.equation_pattern.sub(' [EQUATION] ', text)
        return text
        
    def clean_text(self, text: str) -> str:
        """Main cleaning function"""
        # Remove NULL markers
        text = self.null_pattern.sub('', text)
        
        # Clean equations
        text = self.clean_equations(text)
        
        # Remove headers and footers
        text = self.header_footer.sub('', text)
        
        # Clean up whitespace
        text = self.multiple_spaces.sub(' ', text)
        
        # Clean up line breaks
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            line = line.strip()
            if line:
                cleaned_lines.append(line)
        
        return '\n'.join(cleaned_lines)

    def clean_file(self, input_file: Path, output_file: Path):
        """Clean a single chapter file"""
        with open(input_file, 'r', encoding='utf-8') as f:
            text = f.read()
        
        cleaned_text = self.clean_text(text)
        
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

In [7]:
def clean_chapters():
    cleaner = TextCleaner()
    
    # Process each chapter file
    for chapter_file in sorted(PROCESSED_DIR.glob("chapter_*.txt")):
        print(f"Cleaning {chapter_file.name}...")
        
        # Create cleaned version
        cleaned_file = PROCESSED_DIR / f"cleaned_{chapter_file.name}"
        cleaner.clean_file(chapter_file, cleaned_file)
        
        # Verify the cleaning
        with open(cleaned_file, 'r', encoding='utf-8') as f:
            cleaned_text = f.read()
            print(f"  Original size: {chapter_file.stat().st_size}")
            print(f"  Cleaned size: {cleaned_file.stat().st_size}")
            
            # Show a sample of the cleaned text
            print("\nSample of cleaned text:")
            print(cleaned_text[:200] + "...\n")

In [8]:
if __name__ == "__main__":
    clean_chapters()

Cleaning chapter_01.txt...
  Original size: 76372
  Cleaned size: 76372

Sample of cleaned text:
Chapter 1 Introduction The idea that we learn by interacting with our environment is probably the ﬁrst to occur to us when we think about the nature of learning. When an infant plays, waves its arms, ...

Cleaning chapter_02.txt...
  Original size: 52647
  Cleaned size: 52647

Sample of cleaned text:
Chapter 2 Multi-armed Bandits The most important feature distinguishing reinforcement learning from other types of learning is that it uses training information that evaluates the actions taken rather...

Cleaning chapter_03.txt...
  Original size: 73503
  Cleaned size: 73503

Sample of cleaned text:
Chapter 3 Finite Markov Decision Processes In this chapter we introduce the formal problem of ﬁnite Markov decision processes, or ﬁnite MDPs, which we try to solve in the rest of the book. This proble...

Cleaning chapter_04.txt...
  Original size: 46430
  Cleaned size: 46430

Sample of cleaned tex

For section detection, we need to handle:


- Full sections (e.g., "** 1.1 Reinforcement Learning**")
- Referenced sections (e.g., "Section 17.3")


For NULL markers, we need to:


 - Replace NULL markers in equations context
 - Handle the special case where NULL represents "ffi"

In [30]:
import json
import re
from pathlib import Path
from typing import Dict, List

class RLTextProcessor:
    def __init__(self):
        self.replacements = {
            'NUL': 'ffi',
            '↵': 'ff',
            '  ': ' ',
            'ﬁ': 'fi',
            'ﬂ': 'fl'
        }

    def clean_text(self, text: str) -> str:
        """Basic text cleaning with specific replacements"""
        cleaned = text
        for old, new in self.replacements.items():
            cleaned = cleaned.replace(old, new)
        return re.sub(r'\s+', ' ', cleaned).strip()

    def find_section_content(self, text: str, section_num: str, next_section_num: str = None) -> str:
        """Find content between current section and next section"""
        # Create pattern for current section
        section_pattern = f"{section_num}\\s+"
        section_start = re.search(section_pattern, text)
        
        if not section_start:
            return ""
            
        start_pos = section_start.end()
        
        # If we have a next section, find its position
        if next_section_num:
            next_pattern = f"{next_section_num}\\s+"
            next_match = re.search(next_pattern, text[start_pos:])
            if next_match:
                end_pos = start_pos + next_match.start()
            else:
                end_pos = len(text)
        else:
            end_pos = len(text)
            
        content = text[start_pos:end_pos].strip()
        return self.clean_text(content)

    def process_chapter(self, text: str, metadata: Dict) -> Dict:
        """Process chapter using metadata sections"""
        # Get ordered list of sections
        sections = metadata["sections"]
        section_nums = sorted(sections.keys())
        
        processed_sections = {}
        
        # Process each section
        for i, section_num in enumerate(section_nums):
            next_section = section_nums[i + 1] if i < len(section_nums) - 1 else None
            content = self.find_section_content(text, section_num, next_section)
            processed_sections[section_num] = content
            
        return {
            "title": metadata["title"],
            "sections": processed_sections
        }

def process_all_chapters(base_dir: str = "./") -> None:
    """Process all chapters using metadata files"""
    processor = RLTextProcessor()
    
    # Process chapters 1 through 17
    for chapter_num in range(1, 18):
        chapter_num_str = f"{chapter_num:02d}"
        try:
            # Read metadata
            meta_path = Path(base_dir) / f"chapter_{chapter_num_str}_meta.json"
            with open(meta_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
                
            # Read chapter text
            text_path = Path(base_dir) / f"chapter_{chapter_num_str}.txt"
            with open(text_path, 'r', encoding='utf-8') as f:
                text = f.read()
                
            # Process chapter
            result = processor.process_chapter(text, metadata)
            
            # Save processed content
            output_path = Path(base_dir) / f"chapter_{chapter_num_str}_processed.json"
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
                
            print(f"Processed Chapter {chapter_num_str}")
            # Debug output
            print(f"Title: {result['title']}")
            print("Sections found:")
            for section_num, content in result['sections'].items():
                print(f"  {section_num}: {len(content)} characters")
                
        except Exception as e:
            print(f"Error processing Chapter {chapter_num_str}: {str(e)}")
            raise e

if __name__ == "__main__":
    process_all_chapters("data/processed")

Processed Chapter 01
Title: Introduction
Sections found:
  1.1: 10761 characters
  1.2: 4220 characters
  1.3: 4874 characters
  1.4: 3451 characters
  1.5: 15546 characters
  1.6: 1410 characters
  1.7: 33870 characters
Processed Chapter 02
Title: Multi-armed Bandits
Sections found:
  2.1: 37829 characters
  2.10: 8382 characters
  2.2: 2737 characters
  2.3: 5535 characters
  2.4: 2799 characters
  2.5: 4359 characters
  2.6: 4639 characters
  2.7: 3313 characters
  2.8: 1023 characters
  2.9: 21126 characters
Processed Chapter 03
Title: Finite Markov Decision
Sections found:
  3.1: 11689 characters
  3.2: 134 characters
  3.3: 5097 characters
  3.4: 7672 characters
  3.5: 1242 characters
  3.6: 280 characters
  3.7: 578 characters
  3.8: 44244 characters
Processed Chapter 04
Title: Dynamic Programming
Sections found:
  4.1: 6026 characters
  4.2: 518 characters
  4.3: 11471 characters
  4.4: 293 characters
  4.5: 278 characters
  4.6: 336 characters
  4.7: 6093 characters
  4.8: 184

In [33]:
import json
import re
from pathlib import Path
from typing import Dict, List

class TextCleaner:
    def __init__(self):
        # Unicode replacements
        self.unicode_map = {
            '\ufb01': 'fi',  # fi ligature
            '\ufb02': 'fl',  # fl ligature
            '\u21b5': 'ff',  # ↵ to ff
            '\u0000': '',    # null character
            '\u21b5': 'ff',  # another variant of ↵
            '\u0003': '',    # end of text character
            '\u0002': '',    # start of text character
            '\u0001': '',    # start of heading
            '\u0004': '',    # end of transmission
            '\u2019': "'",   # right single quotation mark
            '\u201c': '"',   # left double quotation mark
            '\u201d': '"',   # right double quotation mark
            '\u2013': '-',   # en dash
            '\u2014': '--',  # em dash
            '\u00ac': '-',   # not sign
            '\u2022': '*',   # bullet
            '\u00b5': 'μ',   # micro sign
            '\u03c0': 'π',   # pi
            '\u03b5': 'ε',   # epsilon
            '\u03b1': 'α',   # alpha
            '\u03b2': 'β',   # beta
            '\u03b3': 'γ',   # gamma
            '\u03b4': 'δ',   # delta
            '\u03bb': 'λ',   # lambda
            '\u03c3': 'σ',   # sigma
            '\u03c4': 'τ',   # tau
            '\u03c9': 'ω',   # omega
            '\u2192': '->',  # rightward arrow
            '\u2190': '<-',  # leftward arrow
            '\u2194': '<->',  # bidirectional arrow
            '\u21d2': '=>',  # rightward double arrow
            '\u21d0': '<=',  # leftward double arrow
            '\u21d4': '<=>',  # bidirectional double arrow
            '\u2208': 'in',  # element of
            '\u2209': 'not in',  # not an element of
            '\u220b': 'ni',  # contains as member
            '\u2229': 'intersection',  # intersection
            '\u222a': 'union',  # union
            '\u2264': '<=',  # less-than or equal to
            '\u2265': '>=',  # greater-than or equal to
            '\u221e': 'infinity',  # infinity
            '\u2248': '≈',  # almost equal to
            '\u2260': '!=',  # not equal to
            '\u00d7': 'x',  # multiplication sign
            '\u2217': '*',  # asterisk operator
            '\u221a': 'sqrt',  # square root
            '\u223c': '~',  # tilde operator
            '\u2026': '...',  # horizontal ellipsis
            '\u00a0': ' ',  # non-breaking space
            '\u02c6': '^',  # circumflex
            '\u02dc': '~',  # small tilde
            '\u02d8': 'u',  # breve
            '\u02d9': '.',  # dot above
            '\u02da': 'o',  # ring above
            '\u02db': ',',  # ogonek
            '\u02dc': '~',  # small tilde
            '\u02dd': '"',  # double acute accent
        }
        
        # Common text patterns to fix
        self.text_replacements = {
            'NUL': 'ffi',  # Common NUL replacement
            '  ': ' ',     # Double spaces
            ' ,': ',',     # Space before comma
            ' .': '.',     # Space before period
            '( ': '(',     # Space after opening parenthesis
            ' )': ')',     # Space before closing parenthesis
            '\n\n\n': '\n\n',  # Multiple newlines
            '--': '-',     # Double hyphens
        }

    def clean_unicode(self, text: str) -> str:
        """Replace Unicode characters with their ASCII equivalents"""
        cleaned = text
        for unicode_char, replacement in self.unicode_map.items():
            cleaned = cleaned.replace(unicode_char, replacement)
        return cleaned

    def clean_text_patterns(self, text: str) -> str:
        """Clean common text patterns"""
        cleaned = text
        for pattern, replacement in self.text_replacements.items():
            cleaned = cleaned.replace(pattern, replacement)
        # Remove excessive whitespace
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned

    def clean(self, text: str) -> str:
        """Full text cleaning pipeline"""
        cleaned = self.clean_unicode(text)
        cleaned = self.clean_text_patterns(cleaned)
        return cleaned

def process_raw_chapters(base_dir: str = "./") -> None:
    """Extract and clean chapter content"""
    cleaner = TextCleaner()
    base_path = Path(base_dir)
    raw_output_dir = base_path / "raw_json"
    raw_output_dir.mkdir(exist_ok=True)
    
    for chapter_num in range(1, 17):
        chapter_num_str = f"{chapter_num:02d}"
        try:
            # Read chapter text
            input_path = base_path / f"chapter_{chapter_num_str}.txt"
            with open(input_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            # Clean the text
            cleaned_text = cleaner.clean(text)
            
            # Create structure
            raw_chapter = {
                "chapter": chapter_num_str,
                "content": cleaned_text
            }
            
            # Save to JSON
            output_path = raw_output_dir / f"chapter_{chapter_num_str}_raw.json"
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(raw_chapter, f, indent=2, ensure_ascii=False)
                
            print(f"Processed Chapter {chapter_num_str}")
            print(f"Original length: {len(text)} characters")
            print(f"Cleaned length: {len(cleaned_text)} characters")
            
        except Exception as e:
            print(f"Error processing Chapter {chapter_num_str}: {str(e)}")
            raise e

if __name__ == "__main__":
    process_raw_chapters("data/processed")

Processed Chapter 01
Original length: 75562 characters
Cleaned length: 75762 characters
Processed Chapter 02
Original length: 52092 characters
Cleaned length: 52067 characters
Processed Chapter 03
Original length: 72439 characters
Cleaned length: 72390 characters
Processed Chapter 04
Original length: 45724 characters
Cleaned length: 45650 characters
Processed Chapter 05
Original length: 70263 characters
Cleaned length: 70160 characters
Processed Chapter 06
Original length: 58676 characters
Cleaned length: 58687 characters
Processed Chapter 07
Original length: 38040 characters
Cleaned length: 37856 characters
Processed Chapter 08
Original length: 104392 characters
Cleaned length: 104501 characters
Processed Chapter 09
Original length: 125415 characters
Cleaned length: 125447 characters
Processed Chapter 10
Original length: 30514 characters
Cleaned length: 30433 characters
Processed Chapter 11
Original length: 102574 characters
Cleaned length: 102328 characters
Processed Chapter 12
Origi

In [3]:
import json
from pathlib import Path
import re
from typing import Dict, Tuple

def find_section_boundaries(content: str, sections: Dict[str, str]) -> Dict[str, Tuple[int, int]]:
    boundaries = {}
    ordered_sections = sorted(sections.keys())
    
    for i, section_num in enumerate(ordered_sections):
        section_title = sections[section_num]
        # Escape special characters in title and use raw string for pattern
        safe_title = re.escape(section_title)
        pattern = rf"{re.escape(section_num)}\s*{safe_title}"
        match = re.search(pattern, content)
        
        if match:
            start_pos = match.start()
            end_pos = len(content)
            
            if i < len(ordered_sections) - 1:
                next_section = ordered_sections[i + 1]
                next_title = sections[next_section]
                next_pattern = rf"{re.escape(next_section)}\s*{re.escape(next_title)}"
                next_match = re.search(next_pattern, content)
                if next_match:
                    end_pos = next_match.start()
            
            boundaries[section_num] = (start_pos, end_pos)
    
    return boundaries

def process_sections(base_dir: str = "./") -> None:
    base_path = Path(base_dir)
    output_dir = base_path / "final_json"
    output_dir.mkdir(exist_ok=True)
    
    for chapter_num in range(1, 17):
        chapter_num_str = f"{chapter_num:02d}"
        try:
            chapter_file = base_path / f"chapter_{chapter_num_str}.txt"
            with open(chapter_file, 'r', encoding='utf-8') as f:
                content = f.read()
            
            meta_file = base_path / f"chapter_{chapter_num_str}_meta.json"
            with open(meta_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            
            section_boundaries = find_section_boundaries(content, metadata["sections"])
            
            sections = {}
            for section_num, (start, end) in section_boundaries.items():
                sections[section_num] = content[start:end].strip()
            
            chapter_data = {
                "title": metadata["title"],
                "sections": sections
            }
            
            output_file = output_dir / f"chapter_{chapter_num_str}_sections.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(chapter_data, f, indent=2, ensure_ascii=False)
            
            print(f"Processed Chapter {chapter_num_str}: {metadata['title']}")
            print(f"Found {len(sections)} sections")
            
        except Exception as e:
            print(f"Error processing Chapter {chapter_num_str}: {str(e)}")

if __name__ == "__main__":
    process_sections("data/processed")

Processed Chapter 01: Introduction
Found 7 sections
Processed Chapter 02: Multi-armed Bandits
Found 9 sections
Processed Chapter 03: Finite Markov Decision Processes
Found 6 sections
Processed Chapter 04: Dynamic Programming
Found 7 sections
Processed Chapter 05: Monte Carlo Methods
Found 7 sections
Processed Chapter 06: Temporal-Difference Learning
Found 8 sections
Processed Chapter 07: n-step Bootstrapping
Found 5 sections
Processed Chapter 08: Planning and Learning with Tabular Methods
Found 13 sections
Processed Chapter 09: On-policy Prediction with Approximation
Found 11 sections
Processed Chapter 10: On-policy Control with Approximation
Found 5 sections
Processed Chapter 11: *Off-policy Methods with Approximation
Found 9 sections
Processed Chapter 12: Policy Gradient Methods
Found 7 sections
Processed Chapter 13: Psychology
Found 7 sections
Processed Chapter 14: Neuroscience
Found 12 sections
Processed Chapter 15: Applications and Case Studies
Found 6 sections
Processed Chapter 1