In [None]:
import json
import os
import re
from collections import Counter

from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Shared locations for the notebook's inputs and outputs (relative paths).
DATA_ROOT = 'data'

PAPERS_FOLDER = f'{DATA_ROOT}/final'  # scanned below for per-paper *.json files
CHARTS_FOLDER = f'{DATA_ROOT}/charts'  # figures are written here as PDF
COMMENTS_JSONL = f'{DATA_ROOT}/paper_comments.jsonl'  # one JSON record per line

## Comment statistics

In [None]:
# Most common file types
# Count how often each file extension occurs across all paper JSON files.
# Iterating a loaded paper yields its file names (the keys of the JSON object).
file_types = Counter()
for filename in tqdm(os.listdir(PAPERS_FOLDER)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(PAPERS_FOLDER, filename), 'r') as f:
        paper = json.load(f)
    if not paper:
        # Empty record: nothing to count
        continue
    for file in paper:
        # splitext() returns (root, ext); the extension is element [1].
        # Element [0] is the base name, which would tally names, not types.
        ext = os.path.splitext(file)[1].lower()
        file_types[ext] += 1

file_types.most_common()

In [None]:
# Comments statistics
# Tally every record in the comments file, plus those whose 'comments' value
# is empty and those whose len() is below 100.
papers_with_source = papers_with_no_comments = papers_with_short_comments = 0

with open(COMMENTS_JSONL, 'r') as f:
    for raw_record in f:
        record = json.loads(raw_record)
        papers_with_source += 1
        extracted = record['comments']
        if not extracted:
            papers_with_no_comments += 1
            continue
        if len(extracted) < 100:
            papers_with_short_comments += 1

papers_with_source, papers_with_no_comments, papers_with_short_comments

In [None]:
# Hard-coded counts behind the cleanup chart.
# NOTE(review): the names below are inferred from the wedge labels — confirm
# against the pipeline that produced these numbers.
total_papers = 100_000
papers_with_archive = 92_303
papers_with_latex = 86_164
papers_without_comments = 4_586

# Wedge sizes, in the same order as the labels in `y`
x = [
    total_papers - papers_with_archive,
    papers_with_archive - papers_with_latex,
    papers_without_comments,
    papers_with_latex - papers_without_comments,
]
y = ['No Archive', 'No LaTeX', 'No Comments', 'Remaining']

In [None]:
# Ring (donut) chart of the counts in `x`, labelled through a legend built
# from `y`, saved as a PDF in CHARTS_FOLDER.
colors = [
    plt.cm.tab20.colors[6],  # red
    plt.cm.tab20.colors[2],  # orange
    plt.cm.tab20.colors[3],  # light orange
    plt.cm.tab20.colors[4],  # green
]

fig, ax = plt.subplots(figsize=(5, 3))
wedges, texts, autotexts = ax.pie(
    x,
    labels=None,  # labels are shown in the legend instead of on the wedges
    colors=colors,
    # Leave slices below 3% unlabelled
    autopct=lambda pct: f"{pct:.1f}%" if pct >= 3 else "",
    startangle=90,
    counterclock=False,
    pctdistance=0.8,
    textprops={"color": "#eee", "fontsize": 10}
)

# Draw center circle to make it a ring
centre_circle = plt.Circle((0, 0), 0.45, fc="white")
ax.add_artist(centre_circle)

# Legend on the right
ax.legend(wedges, y, loc="center left", bbox_to_anchor=(1.0, 0.5), frameon=False)

# Equal aspect ratio ensures that pie is drawn as a circle
ax.axis('equal')

plt.tight_layout()
# savefig() does not create missing directories, so make sure the target exists
os.makedirs(CHARTS_FOLDER, exist_ok=True)
plt.savefig(f'{CHARTS_FOLDER}/comment_cleanup.pdf', bbox_inches='tight')
plt.show()

## Top domains

In [None]:
# Horizontal bar chart of the most referenced domains, saved as a PDF in
# CHARTS_FOLDER. Each bar carries its own "domain (percentage)" label.

# Data
domains = [
    "github.com", "doi.org", "arxiv.org", "q.uiver.app", "stackexchange.com",
    "fink-portal.org", "huggingface.co", "4open.science", "orcid.org", "docs.google.com"
]
percentages = [12.7, 9.1, 8.7, 8.1, 4.0, 3.8, 3.1, 1.9, 1.1, 1.1]

# Sort by percentage, largest first (ties fall back to the domain name, descending)
sorted_data = sorted(zip(percentages, domains), reverse=True)
percentages_sorted, domains_sorted = zip(*sorted_data)

# Every bar is lengthened by a constant so the shortest ones still fit their label
offset = 5.0
scaled_values = [p + offset for p in percentages_sorted]

# Create figure
fig, ax = plt.subplots(figsize=(10, 8))

# Gradient colors: darkest blue for the first bar, lighter further down
colors = [plt.cm.Blues(0.95 - 0.6*(i/len(domains_sorted))) for i in range(len(domains_sorted))]

# Draw borderless, slightly transparent bars
bars = ax.barh(
    domains_sorted,
    scaled_values,
    color=colors,
    height=0.9,
    linewidth=0,
    edgecolor="none",
    alpha=0.8,
)

# Add text labels with fancy boxes, anchored near the left end of each bar
for bar, value, label in zip(bars, percentages_sorted, domains_sorted):
    ax.text(0.3, bar.get_y() + bar.get_height()/2,
            f"{label} ({value:.1f}%)",
            va="center", ha="left", fontsize=14, color="white", fontweight="bold",
            bbox=dict(facecolor="black", alpha=0.35, boxstyle="round,pad=0.3"))

# Remove y-axis ticks
ax.set_yticks([])

# X-axis: manual ticks (counts instead of percentage)
# NOTE(review): bar lengths are percentage + offset, while these hand-placed
# ticks are labelled as counts — confirm the positions match the real counts.
ax.set_xlim(0, 20)
ax.set_xticks([6.5, 13.5, 18])
ax.set_xticklabels(["500", "3000", "5000"], fontsize=12, fontweight="bold")

# Style grid
ax.grid(axis="x", linestyle="--", alpha=0.7, linewidth=1.2)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_alpha(0.4)

# Layout polish: largest bar on top
ax.invert_yaxis()

#plt.title("Top Referenced Domains in LaTeX Comments", fontsize=18, fontweight="bold", pad=15)
# The rect leaves the top 5% of the figure free (room for the title above)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# savefig() does not create missing directories, so make sure the target exists
os.makedirs(CHARTS_FOLDER, exist_ok=True)
plt.savefig(f"{CHARTS_FOLDER}/top_domains.pdf", format="pdf", dpi=300)
plt.show()

## Search affiliations of papers with cleaned comments

In [None]:
# Gather the 'name' of every record whose 'comments' value has length zero.
with open(COMMENTS_JSONL, 'r') as f:
    parsed_records = (json.loads(raw_record) for raw_record in f)
    empty_comment_article_ids = [
        record['name']
        for record in parsed_records
        if len(record['comments']) == 0
    ]

# Show how many were found, plus a small sample
len(empty_comment_article_ids), empty_comment_article_ids[:5]

In [None]:
# Inspect a paper by ID
# Prints every entry of one paper's JSON, each preceded by a banner with its key.

target_file = 'EXAMPLE-ID.json'
with open(f'{PAPERS_FOLDER}/{target_file}', 'r') as f:
    paper = json.load(f)

for file, text in paper.items():
    print(f'============================= {file} =============================')
    print(text)

In [None]:
# Test affiliation search pattern
#
# Group 1 is the LaTeX command name; group 2 is the text inside the first
# brace pair that follows it on the same line. Anything between the command
# and the brace (e.g. an optional [..] argument) is skipped lazily.
# The command-name group is mandatory: when it is optional, the pattern
# matches every backslash command followed by braces (e.g. \textbf{...}).
pattern_affiliation = re.compile(r'\\(affiliation|affil|IEEEauthorblockA).*?\{(.+?)\}')

tests = [
    '\\affil[$\\dagger$]{Department of Computer Science, ABC University, D Country}\n\\affil[$\\dagger$]{Department of Chocolate, Belgium}',
    '\\affiliation{makako U}',
]

# Print the affiliation text of all non-overlapping matches in each sample
for i, t in enumerate(tests, 1):
    matches = [m.group(2) for m in pattern_affiliation.finditer(t)]
    print(i, matches)

In [None]:
# Collect every source line that mentions an affiliation command, across all
# papers listed in empty_comment_article_ids.
institutions = []

for file_name in empty_comment_article_ids:
    # PAPERS_FOLDER is the only papers directory this notebook defines; the
    # FULL_PAPERS_FOLDER name used here before is never assigned, so the cell
    # raised NameError on a fresh kernel.
    # NOTE(review): assumes each id is the file name of the paper's JSON
    # inside PAPERS_FOLDER — confirm.
    with open(os.path.join(PAPERS_FOLDER, file_name), 'r') as f:
        files = json.load(f)
    # Keep any line containing one of the affiliation commands
    # ('\\affil' also covers '\\affiliation')
    institutions.extend(
        line
        for text in files.values()
        for line in text.splitlines()
        if '\\affil' in line or '\\IEEEauthorblockA' in line
    )

institutions

Further analysis with manual inspection and LLM prompting...