# Textual metrics (Length, Boilerplate, Fog, Hardinfo, Redundancy, Stickiness)

This notebook computes the metrics from your CSV reports. Each metric is computed in its own cell, and a summary table is produced at the end.


In [80]:
# Setup imports and configuration.
from pathlib import Path
from collections import Counter, defaultdict
import math
import re
import string
import pandas as pd

# Update these two values to view metrics for a specific report
TARGET_COMPANY = "AFI"
TARGET_YEAR = 2020

# Folder holding CSV files (per report or combined)
REPORTS_DIR = Path("textual-metrics/reports")
if not REPORTS_DIR.exists():
    REPORTS_DIR = Path("reports")


In [81]:
# Load report CSVs and build per-report sentence lists.
def parse_company_year(value: str):
    match = re.search(r"([A-Za-z]+)(\d{4})", str(value))
    if match:
        return match.group(1).upper(), int(match.group(2))
    return None, None

csv_paths = sorted(REPORTS_DIR.glob("*.csv"))
if not csv_paths:
    raise FileNotFoundError(f"No CSV files found in {REPORTS_DIR}")

frames = []
for path in csv_paths:
    df = pd.read_csv(path)
    if "sentence" not in df.columns:
        continue

    file_company, file_year = parse_company_year(path.stem)
    if not file_company or not file_year:
        raise ValueError(f"Could not parse company/year from filename: {path.name}")

    df = df.copy()
    df["company"] = file_company
    df["year"] = file_year
    frames.append(df)

all_df = pd.concat(frames, ignore_index=True)
if all_df[["company", "year"]].isna().any().any():
    raise ValueError("Could not parse company/year for some rows. Check filename pattern.")

reports = {}
for (company, year), group in all_df.groupby(["company", "year"]):
    sentences = group["sentence"].dropna().astype(str).tolist()
    reports[(company, int(year))] = sentences

print(f"Loaded {len(reports)} reports from {len(csv_paths)} CSV files.")


Loaded 1484 reports from 1484 CSV files.


In [82]:
# Define helpers for tokenization, n-grams, syllables, and numeric filters.
def words_in_text(text: str):
    raw = []
    for token in str(text).split():
        token = token.strip(string.punctuation)
        if token:
            raw.append(token)

    words = []
    buffer = []
    for token in raw:
        if len(token) < 2:
            buffer.append(token)
            continue
        if buffer:
            if len(buffer) >= 2:
                words.append(''.join(buffer))
            else:
                words.append(buffer[0])
            buffer = []
        words.append(token)

    if buffer:
        if len(buffer) >= 2:
            words.append(''.join(buffer))
        else:
            words.append(buffer[0])

    return words

def normalized_words(text: str):
    return [w.lower() for w in words_in_text(text)]

def ngrams(words, n):
    return [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]

def count_syllables(word: str):
    word = re.sub(r"[^a-zA-Z]", "", word).lower()
    if not word:
        return 0
    vowels = "aeiouy"
    count = 0
    prev_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        if is_vowel and not prev_vowel:
            count += 1
        prev_vowel = is_vowel
    if word.endswith("e") and count > 1:
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    return max(count, 1)

DATE_PATTERNS = [
    re.compile(r"\b\d{4}-\d{2}-\d{2}\b", re.IGNORECASE),
    re.compile(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b", re.IGNORECASE),
    re.compile(r"\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b", re.IGNORECASE),
    re.compile(r"\b\d{1,2}\s+(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{4}\b", re.IGNORECASE),
    re.compile(r"\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+\d{4}\b", re.IGNORECASE),
]
SECTION_PATTERNS = [
    re.compile(r"\b\d+(?:\.\d+)+\b", re.IGNORECASE),
    re.compile(r"\bsection\s+\d+(?:\.\d+)*\b", re.IGNORECASE),
]
PAGE_PATTERNS = [
    re.compile(r"\bpage\s+\d+\b", re.IGNORECASE),
    re.compile(r"\bp\.?\s*\d+\b", re.IGNORECASE),
]
NUMBER_PATTERN = re.compile(r"\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b")


In [83]:
# Compute Length for the selected report.
def total_words(sentences):
    return sum(len(words_in_text(s)) for s in sentences)

sentences = reports.get((TARGET_COMPANY, TARGET_YEAR))
if sentences is None:
    raise KeyError(f"Report not found for {TARGET_COMPANY}{TARGET_YEAR}")

total = total_words(sentences)
length_metric = total / 1000 if total else 0
print(f"Length ({TARGET_COMPANY}{TARGET_YEAR}): {length_metric:.4f} (total_words={total}, denominator=1000)")


Length (AFI2020): 23.2700 (total_words=23270, denominator=1000)


In [84]:
# Compute Boilerplate for the selected report (year-level 75% rule).
report_4grams = {}
for (company, year), sents in reports.items():
    grams = set()
    for s in sents:
        words = normalized_words(s)
        grams.update(ngrams(words, 4))
    report_4grams[(company, year)] = grams

boilerplate_4grams_by_year = {}
for year in sorted({yr for _, yr in reports.keys()}):
    year_reports = [grams for (comp, yr), grams in report_4grams.items() if yr == year]
    firm_count = len(year_reports)
    if firm_count == 0:
        boilerplate_4grams_by_year[year] = set()
        continue
    counts = Counter()
    for grams in year_reports:
        counts.update(grams)
    threshold = math.ceil(0.75 * firm_count)
    boilerplate_4grams_by_year[year] = {g for g, c in counts.items() if c >= threshold}

def boilerplate_words(sentences, year):
    boiler_grams = boilerplate_4grams_by_year.get(year, set())
    count = 0
    for s in sentences:
        words = normalized_words(s)
        sentence_grams = set(ngrams(words, 4))
        if sentence_grams & boiler_grams:
            count += len(words_in_text(s))
    return count

boiler_words = boilerplate_words(sentences, TARGET_YEAR)
boiler_metric = (boiler_words / total) if total else 0
print(f"Boilerplate ({TARGET_COMPANY}{TARGET_YEAR}): {boiler_metric:.4f} (boilerplate_words={boiler_words}, total_words={total})")


Boilerplate (AFI2020): 0.1372 (boilerplate_words=3193, total_words=23270)


In [85]:
# Compute Fog index for the selected report (education level).
def fog_index(sentences):
    sentence_list = [s for s in sentences if words_in_text(s)]
    if not sentence_list:
        return None
    words = [w for s in sentence_list for w in words_in_text(s)]
    if not words:
        return None
    complex_count = sum(1 for w in words if count_syllables(w) >= 3)
    # Fog = 0.4 * (ASL + 100 * PCW)
    return 0.4 * ((len(words) / len(sentence_list)) + 100 * (complex_count / len(words)))

fog_metric = fog_index(sentences)
print(f"Fog ({TARGET_COMPANY}{TARGET_YEAR}): {fog_metric:.4f} (sentences={len([s for s in sentences if words_in_text(s)])}, words={len([w for s in sentences for w in words_in_text(s)])}, complex_words={sum(1 for s in sentences for w in words_in_text(s) if count_syllables(w) >= 3)})")


Fog (AFI2020): 19.8375 (sentences=884, words=23270, complex_words=5415)


In [86]:
# Compute Hardinfo for the selected report (except for dates, sections, and page numbers).
def hardinfo_count(sentences):
    count = 0
    for s in sentences:
        cleaned = s
        for pattern in DATE_PATTERNS + SECTION_PATTERNS + PAGE_PATTERNS:
            cleaned = pattern.sub(" ", cleaned)
        count += len(NUMBER_PATTERN.findall(cleaned))
    return count

hardinfo = hardinfo_count(sentences)
hardinfo_metric = (hardinfo / total) * 1000 if total else 0
print(f"Hardinfo ({TARGET_COMPANY}{TARGET_YEAR}): {hardinfo_metric:.4f} (informative_numbers={hardinfo}, total_words={total})")


Hardinfo (AFI2020): 53.2875 (informative_numbers=1240, total_words=23270)


In [87]:
# Compute Redundancy for the selected report.
def redundancy_words(sentences):
    normalized = [str(s).strip() for s in sentences if str(s).strip()]
    counts = Counter(normalized)
    redundant = 0
    for sentence, freq in counts.items():
        if freq > 1:
            redundant += freq * len(words_in_text(sentence))
    return redundant

redundant = redundancy_words(sentences)
redundancy_metric = (redundant / total) if total else 0
print(f"Redundancy ({TARGET_COMPANY}{TARGET_YEAR}): {redundancy_metric:.4f} (redundant_words={redundant}, total_words={total})")


Redundancy (AFI2020): 0.1508 (redundant_words=3508, total_words=23270)


In [88]:
# Compute Stickiness for the selected report using prior year.
def report_8grams(sentences):
    grams = set()
    for s in sentences:
        words = normalized_words(s)
        grams.update(ngrams(words, 8))
    return grams

def sticky_words(sentences, prior_sentences):
    prior_grams = report_8grams(prior_sentences)
    count = 0
    for s in sentences:
        words = normalized_words(s)
        if set(ngrams(words, 8)) & prior_grams:
            count += len(words_in_text(s))
    return count

prior_report = reports.get((TARGET_COMPANY, TARGET_YEAR - 1))
if prior_report is None:
    stickiness_metric = None
    print(f"Stickiness ({TARGET_COMPANY}{TARGET_YEAR}): N/A (missing prior year report)")
else:
    sticky = sticky_words(sentences, prior_report)
    stickiness_metric = (sticky / total) if total else 0
    print(f"Stickiness ({TARGET_COMPANY}{TARGET_YEAR}): {stickiness_metric:.4f} (sticky_words={sticky}, total_words={total})")


Stickiness (AFI2020): 0.7744 (sticky_words=18020, total_words=23270)


In [89]:
# Build summary table for all reports.
records = []
for (company, year), sents in reports.items():
    total = total_words(sents)
    length_metric = total / 1000 if total else 0
    boiler_words = boilerplate_words(sents, year)
    boiler_metric = (boiler_words / total) if total else 0
    fog_metric = fog_index(sents)
    hardinfo = hardinfo_count(sents)
    hardinfo_metric = (hardinfo / total) * 1000 if total else 0
    redundant = redundancy_words(sents)
    redundancy_metric = (redundant / total) if total else 0
    prior = reports.get((company, year - 1))
    if prior is None:
        stickiness_metric = None
    else:
        sticky = sticky_words(sents, prior)
        stickiness_metric = (sticky / total) if total else 0

    records.append({
        "company": company,
        "year": year,
        "length": length_metric,
        "boilerplate": boiler_metric,
        "fog": fog_metric,
        "hardinfo": hardinfo_metric,
        "redundancy": redundancy_metric,
        "stickiness": stickiness_metric,
    })

summary = pd.DataFrame(records).sort_values(["company", "year"]).reset_index(drop=True)
summary


Unnamed: 0,company,year,length,boilerplate,fog,hardinfo,redundancy,stickiness
0,AFI,2013,35.109,0.129767,20.309889,43.122846,0.098180,
1,AFI,2014,33.441,0.128256,20.437731,45.841931,0.106755,0.790407
2,AFI,2015,26.904,0.104037,20.245238,50.773119,0.154252,0.633958
3,AFI,2016,23.992,0.124083,19.677189,55.310103,0.130127,0.808269
4,AFI,2017,17.438,0.118305,20.001296,50.865925,0.010322,0.824406
...,...,...,...,...,...,...,...,...
1479,WHS,2021,39.299,0.127484,20.911034,26.616453,0.006158,0.391104
1480,WHS,2022,41.885,0.130405,21.185322,26.883132,0.013083,0.439585
1481,WHS,2023,39.808,0.124548,20.663708,28.712822,0.004070,0.457546
1482,WHS,2024,34.516,0.181046,20.716922,22.916908,0.199618,0.610181


In [90]:
# Save summary to CSV.
output_dir = Path("textual-metrics/outputs")
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "textual_metrics_summary.csv"
summary.to_csv(output_path, index=False)
print(f"Saved summary to {output_path}")


Saved summary to textual-metrics/outputs/textual_metrics_summary.csv
