In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
from pathlib import Path
import pandas as pd
import spacy

In [12]:
# ...existing code...
from pathlib import Path
import os

def find_project_root(start: Path = None, markers=(".git", "pyproject.toml", "setup.py")) -> Path:
    start = Path(start or Path.cwd()).resolve()
    for p in [start] + list(start.parents):
        if any((p / m).exists() for m in markers):
            return p
    return Path.cwd()

# Prefer explicit env var for cloud runs; otherwise try __file__ then cwd then repo-root
project_root = None
if "PROJECT_ROOT" in os.environ:
    project_root = Path(os.environ["PROJECT_ROOT"]).resolve()
else:
    try:
        # works in scripts, not in notebooks
        project_root = Path(__file__).parent.resolve()
    except NameError:
        # notebook / interactive fallback: try repo root then cwd
        project_root = find_project_root()

# build paths relative to project_root
notebooks_dir = project_root / "notebooks"
mis_dir = notebooks_dir / "misclassifications"

bert_path = mis_dir / "misclassifications_Stanford_bert_partial_finetune.csv"
bert_lora_path = mis_dir / "misclassifications_Stanford_bert_lora_r8.csv"
distilled_bert_path = mis_dir / "misclassifications_Stanford_distilbert_lora_r8.csv"
distilled_bert_lora_path = mis_dir / "misclassifications_Stanford_distilbert_partial_finetune.csv"

print("Using project_root:", project_root)

Using project_root: C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project


In [13]:
# ...existing code...
from pathlib import Path

def find_file(name, start=project_root):
    start = Path(start or Path.cwd()).resolve()
    return list(start.rglob(name))

print("project_root:", project_root)
print("mis_dir:", mis_dir)
print("expected bert_path:", bert_path)
print("bert_path.exists():", bert_path.exists())

if not bert_path.exists():
    print("Searching repo for misclassifications_Stanford_bert_partial_finetune.csv ...")
    found = find_file("misclassifications_Stanford_bert_partial_finetune.csv", project_root)
    if not found:
        raise FileNotFoundError(f"Could not locate misclassifications_Stanford_bert_partial_finetune.csv under {project_root}")
    bert_path = found[0]
    print("Found bert_path at:", bert_path)

# helper to load or search
def load_or_search(p: Path):
    if isinstance(p, str):
        p = Path(p)
    if p.exists():
        return pd.read_csv(p)
    found = find_file(p.name, project_root)
    if not found:
        raise FileNotFoundError(f"Could not locate {p.name} under {project_root}")
    print(f"Loading {p.name} from {found[0]}")
    return pd.read_csv(found[0])

df_bert = load_or_search(bert_path)
df_bert_lora = load_or_search(bert_lora_path)
df_distilled_bert = load_or_search(distilled_bert_path)
df_distilled_bert_lora = load_or_search(distilled_bert_lora_path)

print("Loaded dataframes:", { 'bert': len(df_bert), 'bert_lora': len(df_bert_lora),
                             'distilled': len(df_distilled_bert), 'distilled_lora': len(df_distilled_bert_lora) })
# ...existing code...

project_root: C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project
mis_dir: C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\notebooks\misclassifications
expected bert_path: C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\notebooks\misclassifications\misclassifications_Stanford_bert_partial_finetune.csv
bert_path.exists(): False
Searching repo for misclassifications_Stanford_bert_partial_finetune.csv ...
Found bert_path at: C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\misclassifications\misclassifications_Stanford_bert_partial_finetune.csv
Loading misclassifications_Stanford_bert_lora_r8.csv from C:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\misclassifications\misclassifications_Stanford_bert_lora

In [14]:
# 1) Inspect dataframes to detect text / gold / prediction columns and miscounts
def infer_cols(df):
    cols = [c for c in df.columns.tolist()]
    text_col = next((c for c in cols if 'text' in c.lower() or 'sentence' in c.lower() or 'tweet' in c.lower()), None)
    true_col = next((c for c in cols if c.lower() in ['label','labels','gold','true_label','target','y_true','y']), None)
    pred_col = next((c for c in cols if 'pred' in c.lower() or 'prediction' in c.lower() or 'predicted' in c.lower()), None)
    # fallback heuristics
    if text_col is None:
        text_col = next((c for c in cols if df[c].dtype == object and df[c].str.len().mean() > 10), None)
    return text_col, true_col, pred_col

frames = {'bert': df_bert, 'bert_lora': df_bert_lora, 'distilled': df_distilled_bert, 'distilled_lora': df_distilled_bert_lora}
colinfo = {}
for name, df in frames.items():
    t, y, p = infer_cols(df)
    miscount = None
    if y and p:
        miscount = int((df[y] != df[p]).sum())
    print(f"{name}: rows={len(df)} text_col={t} true_col={y} pred_col={p} misclassified={miscount}")
    colinfo[name] = (t, y, p)

bert: rows=14078 text_col=text true_col=true_label pred_col=predicted_label misclassified=14078
bert_lora: rows=16076 text_col=text true_col=true_label pred_col=predicted_label misclassified=16076
distilled: rows=18512 text_col=text true_col=true_label pred_col=predicted_label misclassified=18512
distilled_lora: rows=17063 text_col=text true_col=true_label pred_col=predicted_label misclassified=17063


In [17]:
import spacy
print(spacy.__version__)

3.8.11


In [19]:
!python -m spacy download en_core_web_sm

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\venv\Lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "c:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\venv\Lib\site-packages\spacy\cli\_util.py", line 86, in setup_cli
    command = get_command(app)
              ^^^^^^^^^^^^^^^^
  File "c:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\venv\Lib\site-packages\typer\main.py", line 350, in get_command
    click_command: click.Command = get_group(typer_instance)
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Besitzer\OneDrive\Dokumente\CBS_Copenhagen\Semester\WS2025\AdvNLP\Final Exam\AVDNLP_final_project\venv\Li

In [20]:
nlp = spacy.load("en_core_web_sm")

In [21]:
# 3) Feature extraction focusing on sentence structure and syntax
import string
from collections import Counter

def syntax_features(text):
    doc = nlp(text if isinstance(text, str) else '')
    tok_texts = [t.text for t in doc]
    tokens = [t for t in doc]
    char_len = len(text) if text else 0
    token_count = len(tokens)
    avg_token_len = np.mean([len(t.text) for t in tokens]) if tokens else 0
    punct_count = sum(1 for ch in text if ch in string.punctuation)
    comma_count = text.count(',') if text else 0
    uppercase_ratio = np.mean([1 if (t.text.isupper() and t.text.isalpha()) else 0 for t in tokens]) if tokens else 0
    digit_ratio = np.mean([1 if any(ch.isdigit() for ch in t.text) else 0 for t in tokens]) if tokens else 0
    negations = sum(1 for t in tokens if t.dep_ == 'neg' or t.lemma_.lower() in {'not','no','never'})
    ner_count = len(doc.ents)
    # approximate clause count by counting certain dependency labels
    clause_deps = {'advcl','ccomp','xcomp','acl','relcl'}
    clause_count = sum(1 for t in tokens if t.dep_ in clause_deps)
    # tree depth: maximum ancestor chain length
    def token_depth(tok):
        depth = 0
        cur = tok
        while cur.head is not cur:
            depth += 1
            cur = cur.head
            if depth > 200:
                break
        return depth
    depth_vals = [token_depth(t) for t in tokens] if tokens else [0]
    max_depth = int(max(depth_vals)) if depth_vals else 0
    # POS distribution (top 5 tags)
    pos_counts = Counter([t.pos_ for t in tokens])
    pos_top = dict(pos_counts.most_common(5))
    return {
        'char_len': char_len,
        'token_count': token_count,
        'avg_token_len': float(avg_token_len),
        'punct_count': punct_count,
        'comma_count': comma_count,
        'uppercase_ratio': float(uppercase_ratio),
        'digit_ratio': float(digit_ratio),
        'negation_count': negations,
        'ner_count': ner_count,
        'clause_count': int(clause_count),
        'max_dep_depth': int(max_depth),
        'pos_top': pos_top
    }

def annotate_df(df, text_col, true_col, pred_col, max_rows=None):
    sub = df if max_rows is None else df.head(max_rows).copy()
    sub = sub.copy()
    sub['is_mis'] = False if (true_col is None or pred_col is None) else (sub[true_col] != sub[pred_col])
    feats = []
    for i, txt in enumerate(sub[text_col].fillna('').astype(str)):
        feats.append(syntax_features(txt))
    feats_df = pd.DataFrame(feats)
    # expand pos_top keys into columns (sparse)
    pos_df = feats_df['pos_top'].apply(lambda d: pd.Series(d)).fillna(0).astype(int)
    feats_df = pd.concat([feats_df.drop(columns=['pos_top']), pos_df], axis=1)
    res = pd.concat([sub.reset_index(drop=True), feats_df.reset_index(drop=True)], axis=1)
    return res

In [22]:
# 4) Annotate each dataframe (limit rows if needed for speed) and compare distributions
annotated = {}
for name, df in frames.items():
    tcol, ycol, pcol = colinfo.get(name, (None, None, None))
    if tcol is None:
        print(f"Skipping {name}: no text column detected")
        continue
    print(f"Annotating {name} (this may take a bit) - using columns text={tcol} true={ycol} pred={pcol}")
    annotated[name] = annotate_df(df, tcol, ycol, pcol, max_rows=2000)  # adjust max_rows as needed
    print(f"Annotated {name}: {len(annotated[name])} rows")

# Example aggregated comparison plots: token_count, max_dep_depth, clause_count, ner_count, punct_count
keys = ['token_count','max_dep_depth','clause_count','ner_count','punct_count','negation_count','uppercase_ratio']
for name, ann in annotated.items():
    if 'is_mis' not in ann.columns:
        continue
    print('\nModel:', name)
    display_cols = [k for k in keys if k in ann.columns]
    fig, axes = plt.subplots(len(display_cols), 1, figsize=(8, 3*len(display_cols)))
    if len(display_cols) == 1:
        axes = [axes]
    for ax, col in zip(axes, display_cols):
        sns.boxplot(x='is_mis', y=col, data=ann, ax=ax)
        ax.set_title(f"{name} â€” {col} by misclassified")
    plt.tight_layout()
    plt.show()

Annotating bert (this may take a bit) - using columns text=text true=true_label pred=predicted_label


KeyboardInterrupt: 

In [None]:
# 5) POS differences and top tokens in misclassified vs correct
for name, ann in annotated.items():
    if 'is_mis' not in ann.columns:
        continue
    print('\nPOS / token-level diff for', name)
    pos_cols = [c for c in ann.columns if c.isupper() and len(c) <= 5]  # heuristic for POS columns created
    # fallback: compute POS counts from spaCy on a sample
    sample = ann.sample(min(500, len(ann)), random_state=1)
    # compute simple token frequency separately
    mis_tokens = Counter()
    ok_tokens = Counter()
    for _, row in sample.iterrows():
        doc = nlp(str(row[colinfo[name][0]])) if colinfo[name][0] else nlp('')
        toks = [t.lemma_.lower() for t in doc if t.is_alpha]
        if row.get('is_mis', False):
            mis_tokens.update(toks)
        else:
            ok_tokens.update(toks)
    # top differences
    top_mis = {k: mis_tokens[k] for k in list(dict(mis_tokens.most_common(20)).keys())}
    top_ok = {k: ok_tokens[k] for k in list(dict(ok_tokens.most_common(20)).keys())}
    print('Top tokens in misclassified (sample):', list(top_mis.items())[:10])
    print('Top tokens in correct (sample):', list(top_ok.items())[:10])

In [None]:
# 6) Show representative misclassified examples to inspect syntactic issues
for name, ann in annotated.items():
    if 'is_mis' not in ann.columns:
        continue
    print('\n=== Model:', name, 'sample misclassified examples ===')
    # prioritize long/deep sentences and many clauses/punctuation
    mis = ann[ann['is_mis']].sort_values(by=['max_dep_depth','clause_count','token_count'], ascending=False).head(10)
    for i, row in mis.iterrows():
        txt = row[colinfo[name][0]] if colinfo[name][0] in row else row.get(colinfo[name][0], '')
        print('---')
        print('index:', i)
        print('true:', row[colinfo[name][1]] if colinfo[name][1] in row else None, 'pred:', row[colinfo[name][2]] if colinfo[name][2] in row else None)
        print('token_count:', row.get('token_count'), 'max_dep_depth:', row.get('max_dep_depth'), 'clause_count:', row.get('clause_count'), 'neg:', row.get('negation_count'))
        print('text:', txt)
        # small parse printing to visualize structure
        doc = nlp(str(txt))
        print('POS tags:', ' '.join(f"{t.text}/{t.pos_}" for t in doc))
        print()

    # Also show short ambiguous examples
    short_mis = ann[ann['is_mis']].sort_values(by='token_count', ascending=True).head(5)
    if len(short_mis):
        print('\nShort misclassified examples:')
        for _, row in short_mis.iterrows():
            print('-', row[colinfo[name][0]])

Notes / next steps:
- Inspect printed examples for recurring structural issues: long sentences with many clauses, heavy punctuation, nested clauses, or negation patterns.
- If patterns appear (e.g. many negations or long dependency chains), consider targeted data augmentation or model fine-tuning with syntactic-aware objectives.
- You can increase annotate_df max_rows or run on full datasets if compute/time allows.