In [None]:
# imports & load precomputed CSVs
from pathlib import Path
import pandas as pd
import re
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

DATA_DIR = Path("data")
# Precomputed inputs; both are expected to be committed next to the notebook.
CHAP_LEMMAS_CSV = DATA_DIR / "chap_lemmas.csv"
SENTIMENT_CSV = DATA_DIR / "sentiment_by_chapter.csv"

# The per-chapter lemma table is mandatory: fail early with an actionable message.
if not CHAP_LEMMAS_CSV.exists():
    raise FileNotFoundError(f"{CHAP_LEMMAS_CSV} not found. Commit this file to the repo before launching Binder.")
chap_lemmas_df = pd.read_csv(CHAP_LEMMAS_CSV, encoding="utf-8")

# Chapter labels that cannot be parsed as numbers are coerced to 0.
chap_lemmas_df['chapter'] = pd.to_numeric(chap_lemmas_df['chapter'], errors='coerce').fillna(0).astype(int)
# Missing lemma text becomes an empty string so later .split() calls are safe.
chap_lemmas_df['lemmas_str'] = chap_lemmas_df['lemmas_str'].fillna("").astype(str)
if 'word_count' in chap_lemmas_df.columns:
    chap_lemmas_df['word_count'] = pd.to_numeric(chap_lemmas_df['word_count'], errors='coerce').fillna(0).astype(int)
else:
    # DataFrame.get(col, 0) would hand back the scalar 0 here, and a scalar has
    # no .fillna(); assign the intended default column explicitly instead.
    chap_lemmas_df['word_count'] = 0

# The sentiment table is optional; None signals that it is unavailable.
if SENTIMENT_CSV.exists():
    sentiment_df = pd.read_csv(SENTIMENT_CSV, encoding="utf-8")
else:
    sentiment_df = None

# &mdash; keeps the heading correct regardless of how the notebook file is encoded.
display(HTML("<h2>Dracula &mdash; interactive term frequency (lemmatized)</h2>"))
display(HTML("<p>Type a single lemmatized word (e.g., <em>vampire</em>) and click Plot. Use Normalize to show per 1,000 words.</p>"))


In [None]:
# helpers: one Counter of lemma frequencies per row of chap_lemmas_df, in row order
from collections import Counter

# lemmas_str is expected to hold space-joined lemmas. str.split() on an empty
# string yields [], so rows without text end up with an empty Counter.
_lemma_counters = [
    Counter(chapter_text.split())
    for chapter_text in chap_lemmas_df['lemmas_str'].tolist()
]

def lemmatize_query_simple(q):
    """Reduce raw user input to a single lowercase a-z token.

    No real lemmatization happens here: the user is assumed to type a lemma
    already. The input is trimmed and lowercased, and the first run of ASCII
    letters is returned. Returns None when the input is empty/None or holds
    no ASCII letters.
    """
    cleaned = (q or "").strip().lower()
    # Only the first alphabetic run counts; anything after it is ignored.
    first_word = re.search(r'[a-z]+', cleaned)
    if first_word is None:
        return None
    return first_word.group(0)

def counts_for_lemma(lemma):
    """Return the occurrence count of ``lemma`` for every entry of ``_lemma_counters``.

    The result is a list with one integer per counter, in the same order.
    A falsy ``lemma`` (None or empty string) yields a list of zeros.
    """
    if lemma:
        return [chapter_counter.get(lemma, 0) for chapter_counter in _lemma_counters]
    return [0] * len(_lemma_counters)

def make_plot_df_for_lemma(lemma, normalize=False):
    """Build the per-chapter frequency table used for plotting one lemma.

    Parameters
    ----------
    lemma : str or None
        Lemma to look up; passed straight to ``counts_for_lemma``.
    normalize : bool, default False
        When true, add a ``norm_per_1k`` column holding occurrences per
        1,000 words.

    Returns
    -------
    pandas.DataFrame
        Columns ``chapter``, ``count``, ``word_count`` (plus ``norm_per_1k``
        when ``normalize`` is true), sorted by chapter and restricted to
        rows with ``chapter > 0``.
    """
    counts = counts_for_lemma(lemma)
    df = pd.DataFrame({
        "chapter": chap_lemmas_df['chapter'].astype(int),
        "count": counts,
        "word_count": chap_lemmas_df['word_count'].astype(int)
    }).sort_values('chapter').reset_index(drop=True)
    # drop preface if present; .copy() makes the filtered frame independent so
    # adding a column below does not raise SettingWithCopyWarning
    df = df[df['chapter'] > 0].copy()
    if normalize:
        # a zero word count is treated as 1 to avoid division by zero
        safe_word_count = df['word_count'].replace({0: 1})
        df['norm_per_1k'] = df['count'] / (safe_word_count / 1000.0)
    return df


In [None]:
# UI controls: free-text term box, normalization toggle, trigger button, and
# the output area that receives messages and figures.
term_input = widgets.Text(
    value="",
    placeholder="Type a lemma, e.g. vampire",
    description="Term:",
    layout=widgets.Layout(width="60%"),
)
normalize_chk = widgets.Checkbox(
    value=False,
    description="Normalize (per 1k words)",
)
plot_btn = widgets.Button(
    description="Plot",
    button_style="primary",
)
out = widgets.Output(
    layout=widgets.Layout(width="100%"),
)

def render_query(q, normalize=False):
    """Render the per-chapter frequency chart for the query ``q`` into ``out``.

    The previous content of ``out`` is cleared first. If ``q`` does not reduce
    to a lemma, or the lemma has no occurrences, a short message is printed
    instead of a chart. With ``normalize`` the y-axis shows occurrences per
    1,000 words rather than raw counts.
    """
    # Pick the plotted column and its axis title up front.
    if normalize:
        value_col, axis_title = 'norm_per_1k', "Occurrences per 1,000 words"
    else:
        value_col, axis_title = 'count', "Raw occurrences"

    with out:
        clear_output(wait=True)

        lemma = lemmatize_query_simple(q)
        if not lemma:
            print("Please enter a single word lemma (a-z).")
            return

        freq_df = make_plot_df_for_lemma(lemma, normalize=normalize)
        has_hits = (not freq_df.empty) and freq_df['count'].sum() != 0
        if not has_hits:
            print(f"No occurrences of '{lemma}' found.")
            return

        fig = px.line(
            freq_df,
            x='chapter',
            y=value_col,
            markers=True,
            title=f"Frequency of '{lemma}' by chapter",
            labels={'chapter': 'Chapter', value_col: axis_title},
        )
        fig.update_layout(
            height=420,
            margin=dict(l=40, r=20, t=60, b=40),
            xaxis=dict(dtick=1),  # one tick per chapter
        )
        fig.show()

def on_click(b):
    """Button callback: plot the current term with the checkbox's normalize state.

    ``b`` is the argument passed in by the click registration; it is not used.
    """
    query_text = term_input.value
    use_normalized = normalize_chk.value
    render_query(query_text, use_normalized)

plot_btn.on_click(on_click)


def _render_on_enter(widget):
    """Text-submit callback: plot whatever is currently typed in ``widget``."""
    render_query(widget.value, normalize_chk.value)


# Enter-to-plot is a convenience on top of the Plot button. Only a missing
# `on_submit` method is tolerated; any other failure surfaces instead of being
# silently swallowed.
try:
    term_input.on_submit(_render_on_enter)
except AttributeError:
    # NOTE(review): ipywidgets 8 deprecates Text.on_submit in favour of
    # observing 'value' with continuous_update=False; confirm against the
    # ipywidgets version pinned for this notebook.
    term_input.continuous_update = False
    term_input.observe(
        lambda change: render_query(change["new"], normalize_chk.value),
        names="value",
    )

display(widgets.HBox([term_input, plot_btn, normalize_chk]))
display(out)
