In [1]:
"""
# Chain‑of‑Thought Faithfulness Experiments 🧪

This notebook reproduces the core analyses described in the research plan:

* **Token counts, category entropy, Yule’s K** across hint conditions.
* **Back‑tracking incidence ↔ final accuracy** correlation.
* **Category‑sequence → accuracy** logistic‑regression classifier.
* **χ² tests & Markov transition matrices**.

> **Tip** ▸ Make sure you have installed: `pandas matplotlib scikit-learn scipy`.
"""

%cd ..
%pwd
!ls g_cot_cluster
# %%
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import g_cot_cluster.direct as cot  # local package from this canvas

# Raise the figure resolution so inline plots render crisply.
plt.rcParams.update({"figure.dpi": 120})

# Paths are relative to the current working directory.
# DATA_DIR is the input passed to the loader; OUT_DIR is not used by any
# visible cell — presumably the destination for generated artefacts (confirm).
DATA_DIR = Path("g_cot_cluster/outputs/mmlu/DeepSeek-R1-Distill-Llama-8B")
OUT_DIR = "g_cot_cluster/outputs/direct"


/root/CoTFaithChecker
cot_direct  hidden_states  main  outputs


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


ModuleNotFoundError: No module named 'g_cot_cluster.direct'

In [None]:
"""
# Chain‑of‑Thought Faithfulness Experiments 🧪

This notebook reproduces the core analyses described in the research plan:

* **Token counts, category entropy, Yule’s K** across hint conditions.
* **Back‑tracking incidence ↔ final accuracy** correlation.
* **Category‑sequence → accuracy** logistic‑regression classifier.
* **χ² tests & Markov transition matrices**.

> **Tip** ▸ Make sure you have installed: `pandas matplotlib scikit-learn scipy`.
"""

# %%
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import cot_analysis as cot  # local package from this canvas

# Raise the figure resolution so inline plots render crisply.
plt.rcParams.update({"figure.dpi": 120})

# Input directory handed to `cot.load_data`; relative to the working directory.
DATA_DIR = Path("g_cot_cluster/outputs/mmlu/DeepSeek-R1-Distill-Llama-8B")

# %% [markdown]
"""## 1  Load and enrich the dataset"""

# %%
# Stage 1: load whatever `cot.load_data` produces for DATA_DIR.
raw_df = cot.load_data(DATA_DIR)
# Stage 2: let the project helper attach its basic metrics; `df` is the frame
# every later section works on.
df = cot.add_basic_metrics(raw_df)
# Preview the first rows (plain-text print, since this is not the cell's last expression).
print(df.head())

# %% [markdown]
"""### Optional: merge in ground‑truth answers

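Uncomment the two lines below if you have a CSV with `question_id,correct_answer` columns.
Sections 3 and 4 only run when `df["is_correct"]` holds at least one non-null value."""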

# %% (optional)
# answer_key = pd.read_csv("answer_key.csv")
# df = cot.merge_accuracy(df, answer_key)

# %% [markdown]
"""## 2  Descriptive metrics"""

# %%
# Token-count figure, drawn by the project helper from the enriched frame.
cot.plot_token_counts(df)
plt.show()

# %%
# Category-entropy figure, drawn by the project helper.
cot.plot_category_entropy(df)
plt.show()

# %%
# Lexical-diversity figure — presumably the Yule's K metric named in the
# notebook intro; confirm against the `cot` implementation.
cot.plot_lexical_diversity(df)
plt.show()

# %% [markdown]
"""## 3  Back‑tracking vs. accuracy"""

# %%
# Plot only rows that carry a correctness label. `.astype(int)` raises on NaN,
# so casting the whole column crashes as soon as the frame is only partially
# labelled. A frame without the column at all (answer key never merged) is
# treated as unlabelled and the plot is skipped.
labelled = df[df["is_correct"].notna()] if "is_correct" in df.columns else df.iloc[:0]
if not labelled.empty:
    fig, ax = plt.subplots()
    # The `0.02 * (index % 5)` term is a small deterministic vertical offset so
    # that points stacked on the same 0/1 coordinates remain distinguishable.
    ax.scatter(
        labelled["has_backtracking"].astype(int),
        labelled["is_correct"].astype(int) + 0.02 * (labelled.index % 5),
    )
    ax.set_xlabel("Back‑tracking present (0/1)")
    ax.set_ylabel("Accuracy (0/1)")
    ax.set_title("Back‑tracking incidence vs. accuracy")
    plt.tight_layout()
    plt.show()

# %% [markdown]
"""Pearson correlation coefficient:"""

# %%
# Correlate only over rows that carry a correctness label: `.astype(int)`
# raises on NaN, so the unfiltered cast fails on a partially labelled frame.
# A frame without the column at all is treated as unlabelled.
labelled = df[df["is_correct"].notna()] if "is_correct" in df.columns else df.iloc[:0]
# pearsonr needs at least two observations; skip quietly otherwise.
if len(labelled) >= 2:
    from scipy.stats import pearsonr

    r, p = pearsonr(
        labelled["has_backtracking"].astype(int),
        labelled["is_correct"].astype(int),
    )
    print(f"r = {r:.3f}  (p = {p:.4f})")

# %% [markdown]
"""## 4  Category‑sequence classifier"""

# %%
# Training needs correctness labels, so only proceed when at least one row has one.
labels_available = df["is_correct"].notna().any()
if labels_available:
    # The helper returns three values; only the third (`res`) is printed here.
    # NOTE(review): the full frame is passed, including any unlabelled rows —
    # confirm that `train_category_classifier` drops or handles them.
    clf, vec, res = cot.train_category_classifier(df)
    print(res)

# %% [markdown]
"""## 5  χ² test on marginal frequencies"""

# %%
# Run the project's chi-square helper on the enriched frame and print whatever
# it returns. This section is not gated on `is_correct`.
chi2_res = cot.categorical_chi2(df)
print(chi2_res)

# %% [markdown]
"""## 6  Markov transition matrices"""

# %%
# Build the transition matrix over the whole frame with the project helper,
# then hand it to the heatmap plotter with an explicit title.
mat = cot.global_transition_matrix(df)
cot.plot_transition_heatmap(mat, title="Global category‑transition frequencies")
plt.show()

# %% [markdown]
"""That’s it — feel free to customise further or add deeper dives!"""
