## 0. Setup & Configuration

- Fill the `CONFIG` paths for your two books (plain text).
- Toggle stopwords and thresholds as needed.


In [1]:
# ===== Imports & Config =====
import re, os, math, json, collections
from pathlib import Path
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (9, 4.5)
plt.rcParams["axes.grid"] = True

CONFIG = {
    "book1_path": "../data/Crime-punishment.txt",  # <-- change
    "book2_path": "../data/The-Brotherskaramazov.txt",  # <-- change
    "language": "en",                # e.g. 'en','de','ru','el'
    "use_stopwords": False,          # toggle
    "min_ngram_count": 5,            # threshold (where applicable)
    "top_k": 20                      # top items to show
}

# Unicode-aware token regex: words with optional internal ' or -
WORD_RE = re.compile(r"[^\W\d_]+(?:[-'][^\W\d_]+)*", flags=re.UNICODE)

# Optional: supply your own stopwords set per language
STOPWORDS = set()


ModuleNotFoundError: No module named 'pandas'

## 1. Load & Normalize Text

- Fix hyphenated line breaks (e.g., end-of-line hyphens).
- Normalize whitespace.
- Lowercase consistently.

Our books are a part of Project Gutenberg, which means there are some extra texts in each txt file to be cleaned.

In [None]:
# --- Robust Project Gutenberg boilerplate stripper --------------------------
_GB_START_MARKERS = [
    r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK",   # modern
    r"START OF (THIS|THE) PROJECT GUTENBERG EBOOK",             # fallback
    r"End of the Project Gutenberg(?:'s)? Etext",               # very old variants sometimes inverted
]
_GB_END_MARKERS = [
    r"\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK",      # modern
    r"END OF (THIS|THE) PROJECT GUTENBERG EBOOK",                # fallback
    r"End of Project Gutenberg(?:'s)? (?:Etext|eBook)",          # older variants
    r"\*\*\*\s*END: FULL LICENSE\s*\*\*\*",                      # license block end (older)
]

# Chapters (heuristic fallback if markers missing; English-centric but works often)
_CHAPTER_HINTS = [
    r"^\s*chapter\s+[ivxlcdm0-9]+[\.\: ]",   # CHAPTER I / Chapter 1
    r"^\s*book\s+[ivxlcdm0-9]+[\.\: ]",      # BOOK I etc.
    r"^\s*part\s+[ivxlcdm0-9]+[\.\: ]",
]

def strip_gutenberg(text: str) -> str:
    """
    Returns text between Gutenberg START and END markers (case-insensitive).
    If markers aren't found, heuristically trims to first chapter-like heading.
    Works for most EN/DE/RU/EL releases since headers are in English.
    """
    t = text.replace("\ufeff", "")  # strip BOM if present

    # Find START
    start_idx = None
    for pat in _GB_START_MARKERS:
        m = re.search(pat, t, flags=re.IGNORECASE)
        if m:
            # start AFTER the matched line
            start_idx = t.find("\n", m.end())
            if start_idx == -1:
                start_idx = m.end()
            break

    # Find END
    end_idx = None
    for pat in _GB_END_MARKERS:
        m = re.search(pat, t, flags=re.IGNORECASE)
        if m:
            # end BEFORE the matched line
            end_idx = m.start()
            break

    if start_idx is not None and end_idx is not None and end_idx > start_idx:
        core = t[start_idx:end_idx]
    else:
        # Fallback: try to start at first chapter-like heading
        core = t
        for pat in _CHAPTER_HINTS:
            m = re.search(pat, core, flags=re.IGNORECASE | re.MULTILINE)
            if m:
                core = core[m.start():]
                break
        # And trim off the standard license tail if present
        for pat in _GB_END_MARKERS:
            m = re.search(pat, core, flags=re.IGNORECASE)
            if m:
                core = core[:m.start()]
                break

    # Remove license/contact blocks that sometimes sneak inside
    core = re.sub(r"\n\s*End of the Project Gutenberg.*", "", core, flags=re.IGNORECASE)
    core = re.sub(r"\*\*\*\s*START: FULL LICENSE\s*\*\*\*.*", "", core, flags=re.IGNORECASE | re.DOTALL)

    # Clean leftover cruft: URLs, repeated separators
    core = re.sub(r"https?://\S+", "", core)
    core = re.sub(r"[ \t]+\n", "\n", core)   # trailing spaces before newline
    core = re.sub(r"\n{3,}", "\n\n", core)   # collapse big blank blocks
    return core.strip()


In [None]:
def load_text(p: str) -> str:
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def normalize_text(t: str) -> str:
    # 1) strip Gutenberg header/footer FIRST
    t = strip_gutenberg(t)
    # 2) join hyphenated line breaks (e.g., "won-\nderful")
    t = re.sub(r"-\s*\n", "", t)
    # 3) normalize whitespace
    t = re.sub(r"\s+", " ", t)
    return t

text1 = normalize_text(load_text(CONFIG["book1_path"]))
text2 = normalize_text(load_text(CONFIG["book2_path"]))

tokens1 = WORD_RE.findall(text1.lower())
tokens2 = WORD_RE.findall(text2.lower())

if CONFIG["use_stopwords"]:
    tokens1 = [t for t in tokens1 if t not in STOPWORDS]
    tokens2 = [t for t in tokens2 if t not in STOPWORDS]

tokens = tokens1 + tokens2

len(tokens1), len(tokens2), len(tokens)

len(tokens), tokens[:12]


In [7]:
import json
import re
import matplotlib.pyplot as plt
from transformers import pipeline
import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

# Assuming load_text and normalize_text functions (based on common implementations)
def load_text(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Your provided setup
CONFIG = {
    "book1_path": "../data/Crime-punishment.txt",  # <-- change
    "book2_path": "../data/The-Brotherskaramazov.txt",  # <-- change
    "language": "en",                # e.g. 'en','de','ru','el'
    "use_stopwords": False,          # toggle
    "min_ngram_count": 5,            # threshold (where applicable)
    "top_k": 20                      # top items to show
}

text1 = normalize_text(load_text(CONFIG["book1_path"]))
text2 = normalize_text(load_text(CONFIG["book2_path"]))

# Load character data from JSON files
# Assuming JSON structure like: {"main_char": "Rodion_Raskolnikov"} for Crime and Punishment
# And {"main_chars": ["Dmitri_Karamazov", "Ivan_Karamazov", "Alexei_Karamazov"]} for Brothers Karamazov
with open('Crime_punishment.json', 'r') as f:
    char_data1 = json.load(f)
    main_char1 = char_data1.get('main_char', 'Rodion_Raskolnikov').lower()

with open('The_brothers.json', 'r') as f:
    char_data2 = json.load(f)
    main_chars2 = [char.lower() for char in char_data2.get('main_chars', ["Dmitri_Karamazov", "Ivan_Karamazov", "Alexei_Karamazov"])]

# Define keywords related to decisions and murder/story elements (customize as needed based on stories)
decision_keywords = ['decide', 'decision', 'chose', 'choice', 'plan', 'intend', 'resolve']
murder_keywords = ['murder', 'kill', 'crime', 'death', 'guilt', 'confess', 'punish', 'trial', 'accuse']

# Initialize sentiment analysis pipeline from transformers
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to analyze sentiment for a book and its characters
def analyze_book_sentiment(text, characters, book_title):
    sentences = sent_tokenize(text)
    sentiment_scores = []
    positions = []  # To track position in the book (sentence index)

    for idx, sentence in enumerate(sentences):
        sentence_lower = sentence.lower()
        # Check if any character is mentioned
        char_mentioned = any(char in sentence_lower for char in characters)
        # Check for decision or murder-related keywords
        keyword_match = any(kw in sentence_lower for kw in decision_keywords + murder_keywords)
        
        if char_mentioned and keyword_match:
            # Perform sentiment analysis
            result = sentiment_pipeline(sentence)[0]
            label = result['label']
            score = result['score']
            # Map to positive/negative value: positive up, negative down
            sentiment_value = score if label == 'POSITIVE' else -score
            sentiment_scores.append(sentiment_value)
            positions.append(idx)

    # Plot the graph
    if sentiment_scores:
        plt.figure(figsize=(10, 6))
        plt.plot(positions, sentiment_scores, marker='o', linestyle='-', color='b')
        plt.title(f'Sentiment Trajectory for {book_title} - Character Decisions (Murder/Story Related)')
        plt.xlabel('Position in Book (Sentence Index)')
        plt.ylabel('Sentiment Score (Positive: Up, Negative: Down)')
        plt.grid(True)
        plt.show()
    else:
        print(f"No relevant decision points found for {book_title}.")

# Analyze Crime and Punishment (single main character)
analyze_book_sentiment(text1, [main_char1], "Crime and Punishment")

# Analyze The Brothers Karamazov (multiple main characters)
analyze_book_sentiment(text2, main_chars2, "The Brothers Karamazov")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


FileNotFoundError: [Errno 2] No such file or directory: 'Crime_punishment.json'

In [8]:
import json
import re
import os
from typing import List, Dict, Tuple, Any
import matplotlib.pyplot as plt

import spacy
from spacy.tokens import Doc, Span
from spacy.matcher import Matcher

import nltk
from nltk.sentiment import vader
from collections import defaultdict

# --------- Config (uses your existing CONFIG and load/normalize) ---------
CONFIG = {
    "book1_path": "../data/Crime-punishment.txt",
    "book2_path": "../data/The-Brotherskaramazov.txt",
    "language": "en",
    "use_stopwords": False,
    "min_ngram_count": 5,
    "top_k": 20
}

# If you already have these functions defined, comment these stubs out.
def load_text(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def normalize_text(text: str) -> str:
    # Basic normalization: collapse whitespace, normalize quotes; keep punctuation for NLP.
    text = text.replace('\u201c', '"').replace('\u201d', '"').replace('\u2019', "'")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --------- Character metadata loaders ---------
def load_character_file(json_path: str) -> Dict[str, Any]:
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

def build_aliases(char_dict: Dict[str, Any]) -> Dict[str, List[str]]:
    # Expected format:
    # {
    #   "main_char": "Rodion_Raskolnikov",
    #   "aliases": ["Raskolnikov", "Rodion", "Rodya", "Romanovitch", ...]
    # }
    # For Brothers:
    # {
    #   "main_chars": ["Dmitri_Karamazov", "Ivan_Karamazov", "Alexei_Karamazov"],
    #   "aliases": {
    #       "Dmitri_Karamazov": ["Mitya", "Dmitri", "Karamazov (Dmitri context)"],
    #       ...
    #   }
    # }
    aliases_map = {}
    if "main_char" in char_dict:
        mc = char_dict["main_char"]
        aliases_map[mc] = [mc] + char_dict.get("aliases", [])
    elif "main_chars" in char_dict and "aliases" in char_dict:
        for mc in char_dict["main_chars"]:
            aliases_map[mc] = [mc] + char_dict["aliases"].get(mc, [])
    else:
        # Fallback: if JSON is simply { "Rodion_Raskolnikov": ["Raskolnikov", ...], ... }
        for mc, als in char_dict.items():
            if isinstance(als, list):
                aliases_map[mc] = [mc] + als
    # Normalize alias strings (lowercase stripped)
    for k in aliases_map:
        aliases_map[k] = sorted(set([a.strip() for a in aliases_map[k] if a.strip()]))
    return aliases_map

# --------- NLP initialization ---------
def init_nlp():
    nlp = spacy.load("en_core_web_sm")
    # Add sentence boundary if needed (spaCy has sentencizer in pipeline)
    return nlp

def init_sentiment():
    try:
        sid = vader.SentimentIntensityAnalyzer()
    except LookupError:
        nltk.download('vader_lexicon')
        sid = vader.SentimentIntensityAnalyzer()
    return sid

# --------- Decision lexicons and weights ---------
DECISION_VERBS_NEG = {
    "kill": -1.0, "murder": -1.0, "slay": -0.9, "assassinate": -1.0,
    "rob": -0.8, "steal": -0.8, "theft": -0.8,
    "betray": -0.9, "deceive": -0.7, "lie": -0.6,
    "confess to murder": -0.6, "commit crime": -0.9,
    "attempt murder": -0.9, "strike": -0.5, "beat": -0.7, "violence": -0.9
}

DECISION_VERBS_POS = {
    "help": +0.7, "save": +0.9, "protect": +0.8, "forgive": +0.8,
    "repent": +0.6, "confess": +0.4, "donate": +0.6, "pray": +0.4,
    "reconcile": +0.5, "apologize": +0.5, "care": +0.6, "comfort": +0.6
}

# Multi-word phrases mapped to polarity
DECISION_PHRASES = {
    "commit murder": -1.0, "plan murder": -0.9, "confess murder": -0.6,
    "go to police": +0.3, "turn himself in": +0.5,
    "help her": +0.6, "help him": +0.6, "help them": +0.6
}

# Neutral/administrative triggers (optionally ignored or low weight)
DECISION_VERBS_NEU = {
    "decide": 0.0, "choose": 0.0, "think": 0.0, "consider": 0.0
}

# --------- Utility: character match ---------
def matches_character(span_text: str, aliases: List[str]) -> bool:
    st = span_text.lower()
    for al in aliases:
        if al.lower() in st:
            return True
    return False

def sentence_mentions_character(sent: Span, aliases: List[str]) -> bool:
    # Check NER PERSON entities and raw text for alias substrings
    text_l = sent.text.lower()
    for al in aliases:
        if al.lower() in text_l:
            return True
    # NER check
    for ent in sent.ents:
        if ent.label_ == "PERSON" and ent.text.lower() in [a.lower() for a in aliases]:
            return True
    return False

# --------- Decision detection ---------
def detect_decisions_in_sentence(sent: Span) -> List[Dict[str, Any]]:
    # Extract lemmatized verbs and detect phrases
    tokens = [t for t in sent if not t.is_space]
    lemmas = [t.lemma_.lower() for t in tokens]

    found = []

    # Phrase detection
    text_l = sent.text.lower()
    for phrase, w in DECISION_PHRASES.items():
        if phrase in text_l:
            found.append({"type": "phrase", "key": phrase, "weight": w})

    # Verb detection
    for t in tokens:
        if t.pos_ == "VERB":
            lemma = t.lemma_.lower()
            if lemma in DECISION_VERBS_NEG:
                found.append({"type": "verb", "key": lemma, "weight": DECISION_VERBS_NEG[lemma], "token": t})
            elif lemma in DECISION_VERBS_POS:
                found.append({"type": "verb", "key": lemma, "weight": DECISION_VERBS_POS[lemma], "token": t})
            elif lemma in DECISION_VERBS_NEU:
                # Usually skip neutral unless combined with crime nouns
                pass

    # Crime nouns (supporting evidence)
    crime_nouns = {"murder", "crime", "blood", "axe", "weapon", "theft", "robbery"}
    nouns = set([t.lemma_.lower() for t in tokens if t.pos_ in ("NOUN", "PROPN")])
    has_crime_context = len(crime_nouns.intersection(nouns)) > 0

    # If only neutral verbs + crime context, treat as negative decision
    if has_crime_context and not found:
        found.append({"type": "context", "key": "crime_context", "weight": -0.4})

    return found

# --------- Link decision to subject (character agency) ---------
def character_is_agent_of_decision(sent: Span, char_aliases: List[str]) -> bool:
    # Try dependency: look for subject of the decision verb matching character mention
    # Fallback: if character is mentioned and first-person pronouns could link (not perfect).
    # Simple heuristic: character must be mentioned in sentence.
    if not sentence_mentions_character(sent, char_aliases):
        return False

    # Dependency heuristic
    # If a verb is present, check nsubj text overlap with aliases
    for t in sent:
        if t.pos_ == "VERB":
            for child in t.children:
                if child.dep_ in ("nsubj", "nsubjpass"):
                    subj_text = child.text.lower()
                    # Check alias exact match or substring
                    for al in char_aliases:
                        if al.lower() in subj_text:
                            return True

    # If not found, we still allow if character is mentioned and verbs present (looser heuristic)
    return True

# --------- Sentiment scoring ---------
def sentiment_impact_score(sent: Span, sid: vader.SentimentIntensityAnalyzer, base_weight: float) -> float:
    # Use VADER sentiment around sentence. Combine base decision polarity with sentiment polarity.
    s = sid.polarity_scores(sent.text)
    compound = s["compound"]  # [-1, 1]
    # Impact: base_weight + compound*|base_weight| (amplify/reduce based on sentiment)
    impact = base_weight + compound * abs(base_weight) * 0.6
    return impact

# --------- Processing pipeline ---------
def process_book(text: str, nlp, sid, aliases_map: Dict[str, List[str]]) -> Dict[str, List[Tuple[int, float, Dict[str, Any], str]]]:
    """
    Returns: dict character -> list of (sentence_index, impact_score, decision_meta, sentence_text)
    """
    doc = nlp(text)
    results = defaultdict(list)
    for i, sent in enumerate(doc.sents):
        decisions = detect_decisions_in_sentence(sent)
        if not decisions:
            continue
        # For each character, check agency in this sentence
        for char, aliases in aliases_map.items():
            if character_is_agent_of_decision(sent, aliases):
                for d in decisions:
                    impact = sentiment_impact_score(sent, sid, d["weight"])
                    results[char].append((i, impact, d, sent.text))
    return results

# --------- Aggregate and plot ---------
def build_time_series(decisions: List[Tuple[int, float, Dict[str, Any], str]]) -> Tuple[List[int], List[float], List[Dict[str, Any]]]:
    # Sort by sentence index and compute cumulative impact
    decisions_sorted = sorted(decisions, key=lambda x: x[0])
    times = []
    cum = []
    meta_list = []
    total = 0.0
    for idx, impact, meta, sent_text in decisions_sorted:
        total += impact
        times.append(idx)
        cum.append(total)
        meta_list.append({"impact": impact, "meta": meta, "sentence": sent_text, "index": idx})
    return times, cum, meta_list

def plot_character_series(char: str, times: List[int], series: List[float], meta_list: List[Dict[str, Any]], out_dir: str):
    plt.figure(figsize=(10, 5))
    plt.plot(times, series, marker='o', linewidth=2, label=char)
    plt.title(f"Decision Impact Over Narrative Time: {char}")
    plt.xlabel("Sentence index (narrative time)")
    plt.ylabel("Cumulative impact")
    plt.grid(True, alpha=0.3)
    # Annotate key points (top negative and top positive)
    if meta_list:
        # pick top 3 by |impact|
        top_points = sorted(meta_list, key=lambda m: abs(m["impact"]), reverse=True)[:3]
        for tp in top_points:
            plt.scatter(tp["index"], series[times.index(tp["index"])], color='red' if tp["impact"] < 0 else 'green')
            label = f'{tp["meta"]["key"]} ({tp["impact"]:+.2f})'
            plt.annotate(label, (tp["index"], series[times.index(tp["index"])]),
                         textcoords="offset points", xytext=(5,5), fontsize=8)
    plt.legend()
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{char}_impact.png")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

# --------- Main runner ---------
def main():
    # Init NLP tools
    nlp = init_nlp()
    sid = init_sentiment()

    # Load and normalize texts (from your setup)
    text1 = normalize_text(load_text(CONFIG["book1_path"]))
    text2 = normalize_text(load_text(CONFIG["book2_path"]))

    # Load character JSONs
    crime_char_json = load_character_file("./notebooks/Character Library/Crime_punishment.json")
    bros_char_json = load_character_file("./notebooks/Character Library/The_brothers.json")

    crime_aliases = build_aliases(crime_char_json)   # expects Rodion_Raskolnikov aliases
    bros_aliases = build_aliases(bros_char_json)     # expects Dmitri, Ivan, Alexei maps

    # Process books
    print("Processing Crime and Punishment...")
    decisions_crime = process_book(text1, nlp, sid, crime_aliases)

    print("Processing The Brothers Karamazov...")
    decisions_bros = process_book(text2, nlp, sid, bros_aliases)

    # Build and plot
    out_dir = "./outputs"
    for char, decs in decisions_crime.items():
        times, series, meta_list = build_time_series(decs)
        plot_character_series(char, times, series, meta_list, out_dir)
        print(f"{char}: {len(decs)} decisions detected. Plot saved to {out_dir}/{char}_impact.png")

    for char, decs in decisions_bros.items():
        times, series, meta_list = build_time_series(decs)
        plot_character_series(char, times, series, meta_list, out_dir)
        print(f"{char}: {len(decs)} decisions detected. Plot saved to {out_dir}/{char}_impact.png")

    # Optional: export raw decision logs for inspection
    def export_logs(book_name: str, decisions_map: Dict[str, List[Tuple[int, float, Dict[str, Any], str]]]):
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, f"{book_name}_decision_log.json")
        serializable = {}
        for char, decs in decisions_map.items():
            serializable[char] = [
                {"sentence_index": idx, "impact": impact, "meta": meta, "sentence": sent}
                for idx, impact, meta, sent in decs
            ]
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(serializable, f, ensure_ascii=False, indent=2)
        print(f"Exported decision log: {out_path}")

    export_logs("crime_and_punishment", decisions_crime)
    export_logs("brothers_karamazov", decisions_bros)

if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: './notebooks/Character Library/Crime_punishment.json'

In [None]:
def load_text(p: str) -> str:
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def normalize_text(t: str) -> str:
    # 1) strip Gutenberg header/footer FIRST
    t = strip_gutenberg(t)
    # 2) join hyphenated line breaks (e.g., "won-\nderful")
    t = re.sub(r"-\s*\n", "", t)
    # 3) normalize whitespace
    t = re.sub(r"\s+", " ", t)
    return t

text1 = normalize_text(load_text(CONFIG["book1_path"]))
text2 = normalize_text(load_text(CONFIG["book2_path"]))

tokens1 = WORD_RE.findall(text1.lower())
tokens2 = WORD_RE.findall(text2.lower())

if CONFIG["use_stopwords"]:
    tokens1 = [t for t in tokens1 if t not in STOPWORDS]
    tokens2 = [t for t in tokens2 if t not in STOPWORDS]

tokens = tokens1 + tokens2

len(tokens1), len(tokens2), len(tokens)

len(tokens), tokens[:12]


(568005,
 ['chapter',
  'i',
  'chapter',
  'ii',
  'chapter',
  'iii',
  'chapter',
  'iv',
  'chapter',
  'v',
  'chapter',
  'vi'])

In [None]:
def load_text(p: str) -> str:
    with open(p, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def normalize_text(t: str) -> str:
    # 1) strip Gutenberg header/footer FIRST
    t = strip_gutenberg(t)
    # 2) join hyphenated line breaks (e.g., "won-\nderful")
    t = re.sub(r"-\s*\n", "", t)
    # 3) normalize whitespace
    t = re.sub(r"\s+", " ", t)
    return t

text1 = normalize_text(load_text(CONFIG["book1_path"]))
text2 = normalize_text(load_text(CONFIG["book2_path"]))

tokens1 = WORD_RE.findall(text1.lower())
tokens2 = WORD_RE.findall(text2.lower())

if CONFIG["use_stopwords"]:
    tokens1 = [t for t in tokens1 if t not in STOPWORDS]
    tokens2 = [t for t in tokens2 if t not in STOPWORDS]

tokens = tokens1 + tokens2

len(tokens1), len(tokens2), len(tokens)

len(tokens), tokens[:12]


(568005,
 ['chapter',
  'i',
  'chapter',
  'ii',
  'chapter',
  'iii',
  'chapter',
  'iv',
  'chapter',
  'v',
  'chapter',
  'vi'])