In [1]:
def load_book(filepath: str) -> str:
    """Read a plain-text book and trim Project Gutenberg front/back matter.

    Start of the body:
      * the first occurrence of the string ``'CHAPTER I'`` (which is also a
        prefix of ``'CHAPTER II'``, ``'CHAPTER IV'``, ...), if present;
      * otherwise the line following the one that contains ``'*** START OF'``.

    End of the body: just before ``'*** END OF'`` or, if that is absent,
    just before ``'End of Project Gutenberg'``.

    A file containing none of these markers is returned whole.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        The trimmed text with leading/trailing whitespace removed.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    start = text.find('CHAPTER I')
    if start != -1:
        text = text[start:]
    else:
        start = text.find('*** START OF')
        if start != -1:
            # Drop the whole marker line rather than a fixed number of
            # characters: the marker's length varies with the book title, so
            # a fixed offset either eats body text or leaves header residue.
            line_end = text.find('\n', start)
            text = text[line_end + 1:] if line_end != -1 else ''

    end = text.find('*** END OF')
    if end == -1:
        end = text.find('End of Project Gutenberg')
    if end != -1:
        text = text[:end]

    return text.strip()

# Load both novels through the same loader. Only CP_text is reported here;
# BK_text is not referenced by any other visible cell.
CP_text = load_book(filepath='../data/Crime-punishment.txt')
BK_text = load_book(filepath='../data/The-Brotherskaramazov.txt')

# Character count of the trimmed Crime and Punishment text.
print(f"CP characters: {len(CP_text):,}")
import re
from typing import List

def split_chapters(
    text: str,
    heading_pattern: str = r'\bCHAPTER\s+[IVXLCDM]+\b',
    min_chars: int = 500,
) -> List[str]:
    """Split a book into chapter bodies at chapter headings.

    The text is split wherever ``heading_pattern`` matches; the heading text
    itself is discarded. Each piece is stripped, and pieces of ``min_chars``
    characters or fewer (front matter, table-of-contents fragments, empty
    leading piece) are dropped.

    The default pattern is case-sensitive and only recognises upper-case
    ``CHAPTER`` followed by an upper-case Roman numeral. If the result has a
    single element, no heading matched: pass a pattern suited to the file.

    Args:
        text: Full book text.
        heading_pattern: Regular expression for a chapter heading. Prefer
            non-capturing groups: ``re.split`` inserts captured groups into
            its output as extra pieces.
        min_chars: A piece is kept only if its stripped length exceeds this.

    Returns:
        Chapter bodies in document order.
    """
    pieces = re.split(heading_pattern, text)
    # ``piece`` is None for an optional capturing group that did not
    # participate in the match; treat it as empty.
    stripped = ((piece or '').strip() for piece in pieces)
    return [piece for piece in stripped if len(piece) > min_chars]

CP_chapters = split_chapters(CP_text)

print("Chapters loaded:", len(CP_chapters))

# Everything downstream is per-chapter (murder-chapter detection, the
# pre/post-murder flag, chapter-indexed plots). With fewer than two chapters
# those steps still run but carry no information, so say so here.
if len(CP_chapters) < 2:
    import warnings

    warnings.warn(
        "split_chapters produced fewer than two chapters; the chapter heading "
        "pattern probably did not match this text, and per-chapter analysis "
        "will be degenerate."
    )


CP characters: 1,211,652
Chapters loaded: 1


In [2]:
import json
from typing import Dict, List

def load_characters(path: str) -> Dict:
    """Read the UTF-8 JSON file at ``path`` and return the parsed content.

    Args:
        path: Location of the JSON character library.

    Returns:
        Whatever ``json`` produces for the file's top-level value.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

# Parse the character library, then keep a handle on the protagonist's entry
# (its "aliases" field is read in a later cell).
charlib = load_characters(path="Character Library/Crime_punishment.json")

rask = charlib["Rodion_Raskolnikov"]


In [3]:
def normalize_aliases(alias_list):
    """Split a character's aliases into strong names and weak descriptors.

    Entries may be plain strings or dicts with a ``"text"`` key and an
    optional ``"tier"`` key (treated as 3 when missing):

      * plain strings              -> strong
      * dicts with ``tier < 3``    -> strong
      * dicts with ``tier >= 3``   -> weak

    Dict entries below tier 3 were previously discarded without notice;
    they are now kept as strong aliases.

    Args:
        alias_list: Iterable of alias entries as described above.

    Returns:
        ``(strong, weak)``: two lists of lower-cased alias strings, each in
        input order.
    """
    strong = []
    weak = []

    for alias in alias_list:
        if isinstance(alias, dict):
            bucket = weak if alias.get("tier", 3) >= 3 else strong
            bucket.append(alias["text"].lower())
        else:
            strong.append(alias.lower())

    return strong, weak

# Strong aliases are names; weak ones are descriptive phrases.
# (The second, identical call that used to follow the prints was removed.)
RASK_ALIASES, RASK_WEAK = normalize_aliases(rask["aliases"])

print("Strong aliases:", RASK_ALIASES)
print("Weak descriptors:", RASK_WEAK)

Strong aliases: ['raskolnikov', 'rodion romanovich raskolnikov', 'rodion romanovich', 'rodion', 'rodya', 'rodka', 'rodenka']
Weak descriptors: ['the student', 'the young man', 'the lodger', 'the murderer', 'the author of the article', 'the former student']


In [4]:
def detect_murder_chapter(chapters: List[str]) -> int:
    """Return the index of the first chapter that looks like the murder scene.

    A chapter qualifies when at least three *distinct* keywords occur in its
    lower-cased text. Matching is by plain substring, so a keyword also
    counts when it appears inside a longer word.

    Args:
        chapters: Chapter texts in reading order.

    Returns:
        Zero-based index of the first qualifying chapter.

    Raises:
        RuntimeError: If no chapter reaches three distinct keywords.
    """
    murder_terms = (
        "axe", "hatchet", "blood", "pawnbroker",
        "alyona", "lizaveta", "murdered", "killed",
    )

    for idx, chapter in enumerate(chapters):
        lowered = chapter.lower()
        distinct_hits = 0
        for term in murder_terms:
            if term in lowered:
                distinct_hits += 1
        if distinct_hits >= 3:
            return idx

    raise RuntimeError("Murder chapter not detected.")

# Index (into CP_chapters) of the first chapter flagged as the murder scene;
# later cells use it as the pre/post boundary.
murder_idx = detect_murder_chapter(chapters=CP_chapters)
print("Murder chapter index:", murder_idx)


Murder chapter index: 0


In [6]:
from nltk.tokenize import sent_tokenize

def extract_character_passages(
    chapters: List[str],
    strong_aliases: List[str],
    weak_aliases: List[str],
    window: int = 1
):
    """Collect sentence windows that mention a character.

    Each chapter is split into sentences. For every sentence whose
    lower-cased text contains any strong or weak alias (substring match),
    the sentence plus up to ``window`` neighbours on each side is joined
    with single spaces and recorded. Neighbouring hits therefore yield
    overlapping passages.

    Args:
        chapters: Chapter texts; the list position becomes the chapter index.
        strong_aliases: Lower-cased names of the character.
        weak_aliases: Lower-cased descriptive phrases for the character.
        window: Number of context sentences to include on each side.

    Returns:
        A list of dicts with keys ``"chapter"`` (int), ``"text"`` (str) and
        ``"strong_ref"`` (bool, True when a strong alias matched).
    """
    extracted = []

    for chap_idx, chapter in enumerate(chapters):
        sents = sent_tokenize(chapter)
        for i, sent in enumerate(sents):
            sent_lc = sent.lower()

            strong_hit = any(a in sent_lc for a in strong_aliases)
            weak_hit = any(w in sent_lc for w in weak_aliases)

            if strong_hit or weak_hit:
                start = max(0, i - window)
                end = min(len(sents), i + window + 1)
                context = " ".join(sents[start:end])

                extracted.append({
                    "chapter": chap_idx,
                    "text": context,
                    # Key must be "strong_ref": that is what the feature
                    # table cell reads from each passage.
                    "strong_ref": strong_hit
                })

    return extracted


In [7]:
import spacy
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize

# Shared, module-level analysers used by extract_psych_features.
# VADER sentiment scorer:
analyzer = SentimentIntensityAnalyzer()
# spaCy English pipeline, loaded with the "ner" component disabled:
nlp = spacy.load("en_core_web_sm", disable=["ner"])

def extract_psych_features(text: str) -> dict:
    """Compute simple linguistic features for one passage.

    All ``*_ratio`` features share one denominator: the number of alphabetic
    tokens in the passage (1 if there are none, to avoid division by zero).

    Features returned:
        I_ratio: share of tokens equal to "i".
        negation_ratio: share of negation tokens ("no", "not", "never" and
            the contraction clitic "n't" / "n’t").
        modal_ratio: share of "must", "should", "ought".
        avg_sentence_length: mean ``word_tokenize`` token count per sentence.
        lexical_diversity: distinct alphabetic tokens / alphabetic tokens.
        verb_ratio, adj_ratio: tokens tagged VERB / ADJ over the denominator.
        sentiment_compound: the analyzer's "compound" score for the passage.

    Args:
        text: Passage to analyse.

    Returns:
        Dict mapping feature name to value.
    """
    doc = nlp(text)
    all_tokens = [t.text.lower() for t in doc]
    words = [t.text.lower() for t in doc if t.is_alpha]
    sents = sent_tokenize(text)

    total_words = len(words) or 1

    features = {}

    features["I_ratio"] = words.count("i") / total_words

    # Count negations over *all* tokens: the clitic "n't" contains an
    # apostrophe, so it is never in the alphabetic-only ``words`` list and
    # contracted negations ("don't", "can't") would otherwise be missed.
    negations = {"no", "not", "never", "n't", "n’t"}
    features["negation_ratio"] = sum(tok in negations for tok in all_tokens) / total_words

    modals = {"must", "should", "ought"}
    features["modal_ratio"] = sum(w in modals for w in words) / total_words

    features["avg_sentence_length"] = np.mean([len(word_tokenize(s)) for s in sents])
    features["lexical_diversity"] = len(set(words)) / total_words

    pos_counts = Counter(t.pos_ for t in doc)
    features["verb_ratio"] = pos_counts["VERB"] / total_words
    features["adj_ratio"] = pos_counts["ADJ"] / total_words

    sent = analyzer.polarity_scores(text)
    features["sentiment_compound"] = sent["compound"]

    return features


In [10]:
from nltk.tokenize import sent_tokenize

def extract_character_passages(
    chapters,
    strong_aliases,
    weak_aliases,
    window=1
):
    """Gather context windows around sentences that mention a character.

    A sentence is a hit when its lower-cased text contains any strong or
    weak alias as a substring. Each hit yields one record holding the hit
    sentence together with up to ``window`` sentences before and after it,
    joined by single spaces.

    Returns a list of dicts with keys "chapter" (position of the chapter in
    ``chapters``), "text" (the joined window) and "strong_ref" (whether a
    strong alias matched).
    """
    passages = []

    for chapter_no, chapter_text in enumerate(chapters):
        sentences = sent_tokenize(chapter_text)

        for pos, sentence in enumerate(sentences):
            lowered = sentence.lower()
            is_strong = any(alias in lowered for alias in strong_aliases)
            is_weak = any(alias in lowered for alias in weak_aliases)

            # Skip sentences that mention the character in neither way.
            if not (is_strong or is_weak):
                continue

            first = max(0, pos - window)
            last = min(len(sentences), pos + window + 1)
            passages.append({
                "chapter": chapter_no,
                "text": " ".join(sentences[first:last]),
                "strong_ref": is_strong,
            })

    return passages


In [11]:
# One sentence of context on each side of every sentence that mentions
# Raskolnikov by a strong or weak alias.
rask_passages = extract_character_passages(
    chapters=CP_chapters,
    strong_aliases=RASK_ALIASES,
    weak_aliases=RASK_WEAK,
    window=1,
)


In [12]:
import pandas as pd

# One feature record per extracted passage, tagged with its chapter, whether
# that chapter is at or after the detected murder chapter, and whether the
# passage was found through a strong alias.
rows = []

for p in rask_passages:
    feats = extract_psych_features(p["text"])
    feats.update(
        chapter=p["chapter"],
        post_murder=p["chapter"] >= murder_idx,
        strong_ref=p["strong_ref"],
    )
    rows.append(feats)

rask_df = pd.DataFrame(rows)

# Optional: filter to strong mentions only.
# The original row labels are kept (the index is not reset).
rask_df = rask_df.loc[rask_df.strong_ref]

print("rask_df shape:", rask_df.shape)


rask_df shape: (969, 11)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

sns.set(style="whitegrid")

# Features drawn as one stacked panel each, sharing the chapter axis.
FEATURES = ["I_ratio", "negation_ratio", "sentiment_compound"]

fig, axes = plt.subplots(
    nrows=len(FEATURES),
    ncols=1,
    figsize=(12, 9),
    sharex=True,
)

for ax, feat in zip(axes, FEATURES):
    # Per-chapter mean of the feature with a 95% interval.
    sns.lineplot(data=rask_df, x="chapter", y=feat, estimator="mean", ci=95, ax=ax)

    # Dashed marker at the detected murder chapter.
    ax.axvline(murder_idx, color="red", linestyle="--", label="Murder")
    ax.set(ylabel=feat)
    ax.legend()

axes[-1].set(xlabel="Chapter")
fig.suptitle("Raskolnikov’s Psychological Shift Across Crime and Punishment", fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
fig, ax = plt.subplots(figsize=(12, 5))

# Both series are drawn on the same axes (and therefore the same y scale),
# each as a per-chapter mean with a 95% interval.
for column, legend_label in (
    ("lexical_diversity", "Lexical diversity"),
    ("avg_sentence_length", "Avg sentence length"),
):
    sns.lineplot(
        data=rask_df,
        x="chapter",
        y=column,
        estimator="mean",
        ci=95,
        label=legend_label,
        ax=ax,
    )

# Dashed marker at the detected murder chapter.
ax.axvline(murder_idx, color="red", linestyle="--")
ax.set(title="Cognitive Narrowing After the Murder", ylabel="Value")
ax.legend()
plt.show()


In [None]:
# Reshape to long form: one row per (passage, feature) pair, keeping the
# pre/post-murder flag so it can drive the box colour.
long_df = pd.melt(
    rask_df,
    id_vars="post_murder",
    value_vars=["I_ratio", "negation_ratio", "modal_ratio", "sentiment_compound"],
    var_name="feature",
    value_name="value",
)

plt.figure(figsize=(10, 5))

# One group of boxes per feature, split by the post_murder flag.
sns.boxplot(x="feature", y="value", hue="post_murder", data=long_df)

plt.title("Raskolnikov: Pre vs Post Murder Linguistic Profile")
plt.xticks(rotation=30)
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

features_for_pca = [
    "I_ratio",
    "negation_ratio",
    "modal_ratio",
    "lexical_diversity",
    "sentiment_compound"
]

# Rows with any missing feature are dropped before scaling; X keeps the row
# labels of the rows that survive.
X = rask_df[features_for_pca].dropna()
X_scaled = StandardScaler().fit_transform(X)

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
proj = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(proj, columns=["PC1", "PC2"])
# X.index holds row *labels*, not positions. rask_df was filtered earlier
# without resetting its index (and dropna may remove more rows), so labels and
# positions can differ: look the flag up by label with .loc. Using .iloc here
# would pair points with the wrong rows or raise IndexError.
pca_df["post_murder"] = rask_df.loc[X.index, "post_murder"].values

plt.figure(figsize=(8,6))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="post_murder",
    alpha=0.6
)
plt.title("Psychological State Space of Raskolnikov")
plt.show()
