# LaTeXpOsEd: Analysis Stage

This final step analyzes all the data produced by the data-mining stage. It involves not only this script but also a manual inspection of the findings.

Before running this script:

- Complete: [3_mine_pattern-matching.ipynb](3_mine_pattern-matching.ipynb)
- Complete: [3_mine_entity-extraction.ipynb](3_mine_entity-extraction.ipynb)
- Complete: [3_mine_logical-filtering.ipynb](3_mine_logical-filtering.ipynb)

In [None]:
import json
import os
import re
from collections import Counter
import hashlib

from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Folder of per-paper JSON files; each file is loaded below as a mapping
# from a source file name to that file's content.
PAPERS_FOLDER = 'data/final'
# Output folder for the figures saved by the plotting cells.
CHARTS_FOLDER = 'data/charts'
# JSON-lines file; every record is read below via its 'name' and 'comments' keys.
COMMENTS_JSONL = 'data/paper_comments.jsonl'

## Comment statistics

In [None]:
# Most common file types
# Count how often each file extension occurs among the source files of all
# papers stored as JSON in PAPERS_FOLDER.
file_types = Counter()
for filename in tqdm(os.listdir(PAPERS_FOLDER)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(PAPERS_FOLDER, filename), 'r') as f:
        paper = json.load(f)
    if not paper:
        continue
    for file in paper:
        # splitext returns (root, ext); index 1 is the extension such as
        # '.tex'. Files without an extension are counted under ''.
        ext = os.path.splitext(file)[1].lower()
        file_types[ext] += 1

file_types.most_common()

In [None]:
# Comments statistics
# One pass over the comments JSONL file, counting:
#   papers_with_source          - every record in the file
#   papers_with_no_comments     - records whose 'comments' value is empty
#   papers_with_short_comments  - non-empty 'comments' of length below 100
papers_with_source = papers_with_no_comments = papers_with_short_comments = 0

with open(COMMENTS_JSONL, 'r') as f:
    for record in map(json.loads, f):
        papers_with_source += 1
        extracted = record['comments']
        if not extracted:
            papers_with_no_comments += 1
            continue
        if len(extracted) < 100:
            papers_with_short_comments += 1

papers_with_source, papers_with_no_comments, papers_with_short_comments

In [None]:
# Hard-coded counts behind the cleanup pie chart.
# NOTE(review): presumably 100,000 papers in total, 92,303 with an archive,
# 86,164 with LaTeX sources and 4,586 without comments -- confirm these
# against the pipeline output before reusing them.
_total, _with_archive, _with_latex, _no_comments = 100_000, 92_303, 86_164, 4_586

# Wedge labels, in the same order as the values in x.
y = ['No Archive', 'No LaTeX', 'No Comments', 'Remaining']
x = [
    _total - _with_archive,
    _with_archive - _with_latex,
    _no_comments,
    _with_latex - _no_comments,
]

In [None]:
# Ring (donut) chart of the cleanup breakdown: wedge sizes come from x,
# legend labels from y. The figure is saved to CHARTS_FOLDER as a PDF.
colors = [
    plt.cm.tab20.colors[6], # red
    plt.cm.tab20.colors[2], # orange
    plt.cm.tab20.colors[3], # light orange
    plt.cm.tab20.colors[4], # green
]

fig, ax = plt.subplots(figsize=(5, 3))
wedges, texts, autotexts = ax.pie(
    x,
    labels=None,  # moved labels to legend
    colors=colors,
    # Hide the percentage text on wedges below 3% to avoid clutter.
    autopct=lambda pct: f"{pct:.1f}%" if pct >= 3 else "",
    startangle=90,
    counterclock=False,
    pctdistance=0.8,
    textprops={"color": "#eee", "fontsize": 10}
)

# Draw center circle to make it a ring
centre_circle = plt.Circle((0, 0), 0.45, fc="white")
ax.add_artist(centre_circle)

# Legend on the right
ax.legend(wedges, y, loc="center left", bbox_to_anchor=(1.0, 0.5), frameon=False)

# Equal aspect ratio ensures that pie is drawn as a circle
ax.axis('equal')

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the PDF.
os.makedirs(CHARTS_FOLDER, exist_ok=True)
plt.savefig(f'{CHARTS_FOLDER}/comment_cleanup.pdf', bbox_inches='tight')
plt.show()

## Top domains

In [None]:
# Horizontal bar chart of the ten most referenced domains.
# Data (hard-coded): domains[i] goes with percentages[i].
domains = [
    "github.com", "doi.org", "arxiv.org", "q.uiver.app", "stackexchange.com",
    "fink-portal.org", "huggingface.co", "4open.science", "orcid.org", "docs.google.com"
]
percentages = [12.7, 9.1, 8.7, 8.1, 4.0, 3.8, 3.1, 1.9, 1.1, 1.1]

# Sort by percentage, largest first (ties fall back to reverse domain name)
sorted_data = sorted(zip(percentages, domains), reverse=True)
percentages_sorted, domains_sorted = zip(*sorted_data)

# Offset for visibility: every bar is lengthened by a constant so that the
# in-bar label still fits on the smallest bars.
offset = 5.0
scaled_values = [p + offset for p in percentages_sorted]

# Create figure
fig, ax = plt.subplots(figsize=(10, 8))

# Gradient colors: darkest blue for the first (largest) bar
colors = [plt.cm.Blues(0.95 - 0.6*(i/len(domains_sorted))) for i in range(len(domains_sorted))]

# Draw borderless, slightly transparent bars
bars = ax.barh(domains_sorted, scaled_values, color=colors, height=0.9)
for bar in bars:
    bar.set_linewidth(0)
    bar.set_edgecolor("none")
    bar.set_alpha(0.8)

# Write "domain (percentage)" inside each bar on a rounded, semi-transparent box
for bar, value, label in zip(bars, percentages_sorted, domains_sorted):
    ax.text(0.3, bar.get_y() + bar.get_height()/2,
            f"{label} ({value:.1f}%)",
            va="center", ha="left", fontsize=14, color="white", fontweight="bold",
            bbox=dict(facecolor="black", alpha=0.35, boxstyle="round,pad=0.3"))

# Remove y-axis ticks
ax.set_yticks([])

# X-axis: manual ticks (counts instead of percentage)
# NOTE(review): the tick positions and the count labels are hard-coded and
# not derived from the data above -- verify them whenever the data changes.
ax.set_xlim(0, 20)
ax.set_xticks([6.5, 13.5, 18])
ax.set_xticklabels(["500", "3000", "5000"], fontsize=12, fontweight="bold")

# Style grid
ax.grid(axis="x", linestyle="--", alpha=0.7, linewidth=1.2)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_alpha(0.4)

# Layout polish: largest bar at the top
ax.invert_yaxis()

#plt.title("Top Referenced Domains in LaTeX Comments", fontsize=18, fontweight="bold", pad=15)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# savefig does not create missing directories, so make sure the output
# folder exists before writing the PDF.
os.makedirs(CHARTS_FOLDER, exist_ok=True)
plt.savefig(f"{CHARTS_FOLDER}/top_domains.pdf", format="pdf", dpi=300)
plt.show()

## Search affiliations of papers with cleaned comments

In [None]:
# Gather the 'name' of every record in the comments JSONL file whose
# 'comments' value has length zero.
with open(COMMENTS_JSONL, 'r') as f:
    empty_comment_article_ids = [
        record['name']
        for record in map(json.loads, f)
        if len(record['comments']) == 0
    ]

# Show how many were found, plus the first few names as a sanity check.
len(empty_comment_article_ids), empty_comment_article_ids[:5]

In [None]:
# Inspect a paper by ID
# Set target_file to the JSON file name of the paper to dump; every source
# file stored in it is printed under a banner carrying its name.

target_file = 'EXAMPLE-ID.json'
with open(f'{PAPERS_FOLDER}/{target_file}', 'r') as f:
    paper = json.load(f)

for file, content in paper.items():
    print(f'============================= {file} =============================')
    print(content)

In [None]:
# Test affiliation search pattern

# Matches \affiliation, \affil or \IEEEauthorblockA, followed by an optional
# [...] argument and a {...} group. Group 1 is the command name, group 2 the
# text inside the braces (up to the first closing brace).
# The command name is mandatory: with an optional name the pattern would
# match any LaTeX command that takes a braced argument, e.g. \textbf{...}.
pattern_affiliation = re.compile(
    r'\\(affiliation|affil|IEEEauthorblockA)\s*(?:\[[^\]]*\])?\s*\{(.+?)\}'
)

tests = [
    '\\affil[$\\dagger$]{Department of Computer Science, ABC University, D Country}\n\\affil[$\\dagger$]{Department of Chocolate, Belgium}',
    '\\affiliation{makako U}',
]

# Print the brace content of all non-overlapping matches in each test string
for i, t in enumerate(tests, 1):
    matches = [m.group(2) for m in pattern_affiliation.finditer(t)]
    print(i, matches)

In [None]:
# Collect every source line that mentions an affiliation macro, for the
# papers listed in empty_comment_article_ids.

# FULL_PAPERS_FOLDER is not defined anywhere in this notebook, so referencing
# it directly raises NameError. Use it when the user has defined it, and fall
# back to PAPERS_FOLDER otherwise.
# NOTE(review): confirm which folder holds the full paper JSON files and that
# the collected names are valid file names inside it.
papers_folder = globals().get('FULL_PAPERS_FOLDER', PAPERS_FOLDER)
affiliation_markers = ('\\affil', '\\IEEEauthorblockA')

institutions = []

for file_name in empty_comment_article_ids:
    with open(os.path.join(papers_folder, file_name), 'r') as f:
        files = json.load(f)
    # Scan each source file line by line rather than first concatenating
    # all files into one large string.
    for text in files.values():
        institutions.extend(
            line for line in text.splitlines()
            if any(marker in line for marker in affiliation_markers)
        )

institutions

Further analysis is done through manual inspection and LLM prompting.

## Validate bitcoin addresses

In [None]:
# Base58 alphabet: digits and letters without 0, O, I and l. A character's
# index in this string is its base-58 digit value (see b58decode_check).
B58_ALPHABET = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
# Bech32 data-part character set (32 symbols). A character's index in this
# string is its 5-bit value (see bech32_decode).
BECH32_CHARSET = 'qpzry9x8gf2tvdw0s3jn54khce6mua7l'
# Generator constants XORed into the running checksum by bech32_polymod.
BECH32_GEN = [0x3b6a57b2, 0x26508e6d, 0x1ea119fa, 0x3d4233dd, 0x2a1462b3]


def sha256(b: bytes) -> bytes:
    """Return the raw 32-byte SHA-256 digest of *b*."""
    hasher = hashlib.sha256()
    hasher.update(b)
    return hasher.digest()


def b58decode_check(addr: str) -> bool:
    """Return True when *addr* is Base58 text with a valid 4-byte checksum.

    The string is decoded to bytes (each leading '1' becomes a zero byte);
    the last four bytes must equal the first four bytes of the double
    SHA-256 of everything before them. The payload length and version byte
    are not inspected here.
    """
    # Any character outside the alphabet makes the string undecodable.
    if any(ch not in B58_ALPHABET for ch in addr):
        return False

    # Interpret the text as a big-endian base-58 number.
    value = 0
    for ch in addr:
        value = value * 58 + B58_ALPHABET.index(ch)

    # Leading '1' characters stand for zero bytes that the integer form drops.
    leading_ones = len(addr) - len(addr.lstrip('1'))

    # A value of 0 has bit_length 0 and therefore converts to b''.
    body = value.to_bytes((value.bit_length() + 7) // 8, 'big')
    decoded = bytes(leading_ones) + body
    if len(decoded) < 4:
        return False

    expected = sha256(sha256(decoded[:-4]))[:4]
    return decoded[-4:] == expected


def bech32_polymod(values):
    """Fold a sequence of 5-bit integers into the Bech32 checksum value.

    Starts from 1; for each value the top bits of the running checksum
    select which BECH32_GEN constants are XORed in after the shift.
    """
    checksum = 1
    for value in values:
        top = (checksum >> 25) & 0xff
        checksum = ((checksum & 0x1ffffff) << 5) ^ value
        for bit, generator in enumerate(BECH32_GEN):
            if top & (1 << bit):
                checksum ^= generator
    return checksum


def bech32_hrp_expand(hrp):
    """Expand the human-readable part for checksum computation.

    Returns the high bits (code point >> 5) of every character, a 0
    separator, then the low five bits of every character.
    """
    code_points = [ord(ch) for ch in hrp]
    high_bits = [cp >> 5 for cp in code_points]
    low_bits = [cp & 31 for cp in code_points]
    return high_bits + [0] + low_bits


def bech32_verify_checksum(hrp, data, bech32m=False):
    """Check the checksum of a decoded Bech32/Bech32m string.

    *data* is the list of 5-bit values including the checksum symbols. The
    polymod of the expanded HRP plus data must equal 1 for Bech32, or
    0x2bc830a3 when *bech32m* is true.
    """
    if bech32m:
        target = 0x2bc830a3
    else:
        target = 1
    residue = bech32_polymod(bech32_hrp_expand(hrp) + data)
    return residue == target


def bech32_decode(addr):
    """Split a Bech32 string into its HRP and 5-bit data values.

    Returns ``(hrp, values)`` with the HRP lower-cased and *values* including
    the checksum symbols, or ``None`` when the string is structurally invalid.
    The checksum itself is not verified here.
    """
    # per BIP-173/350
    # Only printable ASCII (33..126) is allowed.
    if not all(33 <= ord(ch) <= 126 for ch in addr):
        return None
    # Mixed-case strings are rejected.
    if addr not in (addr.lower(), addr.upper()):
        return None
    # The last '1' separates the human-readable part from the data part.
    hrp, separator, data = addr.lower().rpartition('1')
    if not separator:
        return None
    if not hrp or len(data) < 6:
        return None
    if any(ch not in BECH32_CHARSET for ch in data):
        return None
    return hrp, [BECH32_CHARSET.find(ch) for ch in data]


def convert_bits(data, from_bits, to_bits, pad=True):
    """Regroup a sequence of *from_bits*-wide integers into *to_bits*-wide ones.

    Returns the regrouped list, or ``None`` when an input value is negative
    or too wide, or -- with ``pad=False`` -- when the leftover bits are a full
    input group or are not all zero. With ``pad=True`` leftover bits are
    emitted as one final zero-padded group.
    """
    accumulator = 0
    pending = 0  # number of bits in the accumulator not yet emitted
    out = []
    mask = (1 << to_bits) - 1

    for value in data:
        if value < 0 or (value >> from_bits):
            return None
        accumulator = (accumulator << from_bits) | value
        pending += from_bits
        while pending >= to_bits:
            pending -= to_bits
            out.append((accumulator >> pending) & mask)

    # Left-align whatever is still pending inside one output group.
    leftover = (accumulator << (to_bits - pending)) & mask
    if pad:
        if pending:
            out.append(leftover)
        return out
    if pending >= from_bits or leftover:
        return None
    return out


def is_valid_bech32(addr: str) -> bool:
    """Return True when *addr* is a well-formed segwit address.

    Requires an HRP of 'bc', 'tb' or 'bcrt', a valid checksum (Bech32 for
    witness version 0, Bech32m for any other version), a witness program of
    2 to 40 bytes (exactly 20 or 32 bytes for version 0) and a witness
    version no greater than 16.
    """
    decoded = bech32_decode(addr)
    if not decoded:
        return False
    hrp, data = decoded

    # Known prefix, and at least a version symbol plus the 6 checksum symbols.
    if hrp not in ('bc', 'tb', 'bcrt') or len(data) < 7:
        return False

    witver, prog = data[0], data[1:-6]

    # Version 0 uses the Bech32 constant, later versions Bech32m (BIP-350).
    if not bech32_verify_checksum(hrp, data, bech32m=(witver != 0)):
        return False

    # Convert 5-bit groups to bytes; pad=False rejects non-zero padding.
    prog_bytes = convert_bits(prog, 5, 8, pad=False)
    if prog_bytes is None:
        return False
    if not 2 <= len(prog_bytes) <= 40:
        return False
    if witver == 0 and len(prog_bytes) not in (20, 32):
        return False
    return witver <= 16


def detect_network(addr: str) -> str:
    """Guess the network of *addr* from its prefix.

    Returns 'mainnet', 'test/reg' or 'unknown'. This is a prefix heuristic
    only; the address is not validated.
    """
    lowered = addr.lower()
    # Segwit prefixes are compared case-insensitively.
    if lowered.startswith('bc1'):
        return 'mainnet'
    if lowered.startswith(('tb1', 'bcrt1')):
        return 'test/reg'

    # Otherwise decide by the (case-sensitive) first character.
    by_first_char = {
        '1': 'mainnet',
        '3': 'mainnet',
        'm': 'test/reg',
        'n': 'test/reg',
        '2': 'test/reg',
    }
    return by_first_char.get(addr[:1], 'unknown')


def validate_address(addr: str) -> dict:
    """Validate one address string and describe the outcome.

    Addresses starting with 1, 2, 3, m or n are checked as Base58Check;
    addresses starting (case-insensitively) with bc1, tb1 or bcrt1 are
    checked as Bech32/Bech32m; anything else is reported as invalid with
    type 'unknown'.

    Returns a dict with the keys 'address', 'valid', 'type' and 'network'.
    """
    if addr and addr[0] in '123mn':
        valid, kind = b58decode_check(addr), 'base58'
    elif addr.lower().startswith(('bc1', 'tb1', 'bcrt1')):
        valid, kind = is_valid_bech32(addr), 'bech32'
    else:
        valid, kind = False, 'unknown'

    report = {
        'address': addr,
        'valid': valid,
        'type': kind,
        'network': detect_network(addr)
    }
    return report


def validate_file(btc_file: str) -> list:
    """Validate every non-blank line of *btc_file* as an address.

    The file is read as UTF-8; surrounding whitespace is stripped from each
    line. Prints a one-line summary and returns the list of result dicts
    produced by validate_address, in file order.
    """
    with open(btc_file, 'r', encoding='utf-8') as f:
        stripped = [raw.strip() for raw in f]
    addrs = [addr for addr in stripped if addr]

    results = list(map(validate_address, addrs))
    valid_count = len([r for r in results if r['valid']])
    print(f'Validated {len(results)} addresses. Valid: {valid_count}.')
    return results

In [None]:
# Smoke test on a single address; its leading '1' routes it through the
# Base58Check branch of validate_address.
validate_address('1AGNa15ZQXAZUgFiqJ2i7Z2DPU2J6hW62i')

In [None]:
# Validate every non-empty line of the address list; prints a summary line
# and displays the per-address result dicts.
validate_file('data/btc_addresses.txt')

## IBAN Validation

In [None]:
# Non-exhaustive list of country codes and their IBAN lengths
# Source: https://www.iban.com/structure
# Maps an upper-case two-letter country code to the total IBAN length.
# validate_iban rejects any IBAN whose country code is missing from this
# table or whose whitespace-stripped length differs from the listed value.
IBAN_LENGTHS = {
    "AL": 28, "AD": 24, "AT": 20, "AZ": 28, "BH": 22, "BE": 16, "BA": 20, "BR": 29,
    "BG": 22, "CR": 22, "HR": 21, "CY": 28, "CZ": 24, "DK": 18, "DO": 28, "EE": 20,
    "FO": 18, "FI": 18, "FR": 27, "GE": 22, "DE": 22, "GI": 23, "GR": 27, "GL": 18,
    "GT": 28, "HU": 28, "IS": 26, "IE": 22, "IL": 23, "IT": 27, "JO": 30, "KZ": 20,
    "XK": 20, "KW": 30, "LV": 21, "LB": 28, "LI": 21, "LT": 20, "LU": 20, "MK": 19,
    "MT": 31, "MR": 27, "MU": 30, "MD": 24, "MC": 27, "ME": 22, "NL": 18, "NO": 15,
    "PK": 24, "PS": 29, "PL": 28, "PT": 25, "QA": 29, "RO": 24, "SM": 27, "SA": 24,
    "RS": 22, "SK": 24, "SI": 19, "ES": 24, "SE": 24, "CH": 21, "TL": 23, "TN": 24,
    "TR": 26, "AE": 23, "GB": 22, "VA": 22, "VG": 24, "UA": 29, "SC": 31, "IQ": 23,
    "BY": 28, "SV": 28, "LY": 25, "SD": 18, "BI": 27, "DJ": 27, "RU": 33, "SO": 23,
    "NI": 28, "MN": 20, "FK": 18, "OM": 23, "YE": 30, "HN": 28
}

# Optional: relaxed structural regex (2 letters, 2 digits, then alphanumerics)
# Upper-case only: validate_iban upper-cases its input before matching.
IBAN_RE = re.compile(r'^[A-Z]{2}[0-9]{2}[A-Z0-9]+$')

def _alnum_to_int_str(s: str) -> str:
    # Map A..Z -> 10..35, digits remain
    out = []
    for ch in s:
        if ch.isdigit():
            out.append(ch)
        else:
            out.append(str(ord(ch) - 55))  # ord('A')=65 -> 10
    return ''.join(out)

def _mod97_large(num_str: str) -> int:
    # Compute num_str % 97 without big ints by chunking
    remainder = 0
    for ch in num_str:
        remainder = (remainder * 10 + ord(ch) - 48) % 97
    return remainder

def validate_iban(iban: str) -> bool:
    """Return True when *iban* is a structurally valid IBAN.

    Whitespace is removed and the text upper-cased before checking. The IBAN
    must match IBAN_RE, belong to a country listed in IBAN_LENGTHS, have
    exactly that country's length, carry check digits in the range 02..98
    and satisfy the ISO 7064 mod-97 test. ``None`` is treated as invalid.
    """
    if iban is None:
        return False
    # Normalize: remove spaces and make uppercase
    iban_clean = re.sub(r'\s+', '', iban).upper()
    # Basic structure check
    if len(iban_clean) < 4 or not IBAN_RE.match(iban_clean):
        return False
    country = iban_clean[:2]
    if country not in IBAN_LENGTHS:
        return False
    if len(iban_clean) != IBAN_LENGTHS[country]:
        return False
    # Check digits are generated as 98 - (n mod 97), so only 02..98 can
    # occur. 00, 01 and 99 are congruent to 97, 98 and 02 and would
    # otherwise slip through the mod-97 test below.
    if not 2 <= int(iban_clean[2:4]) <= 98:
        return False
    # Rearrange (move country code and check digits to the end) and convert
    rearranged = iban_clean[4:] + iban_clean[:4]
    numeric = _alnum_to_int_str(rearranged)
    # MOD 97 must equal 1
    return _mod97_large(numeric) == 1

# Quick check of validate_iban on a few hand-picked IBANs; the last entry
# is deliberately wrong.
samples = [
    "GB33 BUKB 202015 55555555",
    "DE75512108001245126199",
    "FR7630006000011234567890189",
    "GB00BUKB20201555555555", # Incorrect
]
for s in samples:
    verdict = validate_iban(s)
    print(f"{s} => {verdict}")

In [None]:
# Validate every non-blank line of the IBAN list, print each verdict and
# count how many lines are valid.
total_valid = 0
with open("data/IBAN.txt", "r") as f:
    for iban in map(str.strip, f):
        if not iban:
            continue
        is_valid = validate_iban(iban)
        print(iban, "=>", is_valid)
        if is_valid:
            total_valid += 1
total_valid