# QC Entity Inventory Explorer

This notebook allows exploration of QC entity inventory results generated
by `qc_entity_inventory.py`.  

We can inspect:
- The top entities per class
- Rare entities with low occurrences

The insights gained here will help define **rules for annotation**,
to be documented in [`docs/annotation_rules.md`](../docs/annotation_rules.md).  
For example, these rules may include:
- Standardizing numeric formats, such as temperatures in Kelvin (`300 K` → `300K`)
- Removing unnecessary spaces between values and units
- Normalizing synonyms or alternative spellings of entity names
... 

## Load libraries and setup

In [1]:
import sys
from pathlib import Path

import pandas as pd

# Put the parent directory (the project root) at the front of the import path.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
project_root = Path("..").resolve()
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
# Print the execution environment (timestamp, Python/IPython versions, OS,
# hardware) so the results shown in this notebook can be traced back to it.
%load_ext watermark
%watermark

Last updated: 2026-01-03T01:00:09.229181+01:00

Python implementation: CPython
Python version       : 3.14.0
IPython version      : 8.13.2

Compiler    : Clang 20.1.4 
OS          : Linux
Release     : 6.8.0-51-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit



In [3]:
# Mapping filled by the loading cell: class name -> DataFrame of its entities
entity_data = dict()

# Directory holding the QC entity inventory files (read as `*.txt`)
out_folder = Path("../results/qc_annotations")

## Load data

In [12]:
for file_path in out_folder.glob("*.txt"):
    with file_path.open(encoding="utf-8") as f:
        # Keep only data rows: header lines start with "#", and blank lines
        # would produce incomplete rows that break the integer conversion.
        data_lines = [
            line.strip()
            for line in f
            if line.strip() and not line.startswith("#")
        ]

    # Each data row is tab-separated: entity, count, annotation paths
    df = pd.DataFrame(
        [line.split("\t") for line in data_lines],
        columns=["entity", "count", "annotation_paths"],
    )
    # Replace the column rather than writing through `df.loc[:, "count"]`:
    # recent pandas versions treat the `.loc` form as an in-place write into
    # the existing column, so its dtype would not become integer.
    df["count"] = df["count"].astype(int)

    # Store per class, keyed by the file name without its extension
    entity_data[file_path.stem] = df

print("Annotation summary by class:")
print("-" * 50)
print(f"{'Class':20} | {'Total occurrences':17} | {'Unique entities':15}")
print("-" * 50)

# Sort classes by decreasing total number of occurrences
# (totals are computed once and reused for both sorting and printing)
class_totals = {
    name: int(class_df["count"].sum()) for name, class_df in entity_data.items()
}
for class_name in sorted(class_totals, key=class_totals.get, reverse=True):
    total_occurrences = class_totals[class_name]
    unique_entities = len(entity_data[class_name])
    print(f"{class_name:20} | {total_occurrences:17} | {unique_entities:15}")

print("-" * 50)


Annotation summary by class:
--------------------------------------------------
Class                | Total occurrences | Unique entities
--------------------------------------------------
MOL                  |              3480 |            1175
FFM                  |               414 |             106
SOFTNAME             |               355 |              79
STIME                |               186 |              98
TEMP                 |               131 |              57
SOFTVERS             |                90 |              54
--------------------------------------------------


## Explore entities by class

## Molecules 🧬

In [13]:
# Molecule entities, ordered from most to least frequent
df = entity_data["MOL"].sort_values(by="count", ascending=False)
df


Unnamed: 0,entity,count,annotation_paths
0,water,214,"[annotations/v2/zenodo_3613573.json, annotatio..."
1,popc,105,"[annotations/v2/zenodo_3613573.json, annotatio..."
2,dppc,56,"[annotations/v2/zenodo_259443.json, annotation..."
3,cholesterol,55,"[annotations/v2/zenodo_259443.json, annotation..."
4,nacl,44,"[annotations/v2/zenodo_51760.json, annotations..."
...,...,...,...
775,dioctylfluorene,1,[annotations/v2/figshare_3494177.json]
776,dipalmitoylphosphatidylcholine,1,[annotations/v2/figshare_2227162.json]
777,dipeptides,1,[annotations/v2/figshare_2532337.json]
778,diphenyl diselenide,1,[annotations/v2/figshare_3457004.json]


In [14]:
# Select the entities that occur exactly once in this class
occurs_once = df["count"].eq(1)
rare_df = df.loc[occurs_once]
num_rare, total_entities = rare_df.shape[0], df.shape[0]

print(f"Rare molecules (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare molecules (count = 1): 599 / 1175 total entities


Unnamed: 0,entity,count,annotation_paths
976,norcoclaurine synthase,1,[annotations/v2/figshare_9905063.json]
971,nitrogen,1,[annotations/v2/figshare_6965759.json]
972,nitrogen dioxide,1,[annotations/v2/figshare_6965759.json]
973,nmda receptor,1,[annotations/v2/zenodo_7301759.json]
974,nmdars,1,[annotations/v2/zenodo_7301759.json]
...,...,...,...
775,dioctylfluorene,1,[annotations/v2/figshare_3494177.json]
776,dipalmitoylphosphatidylcholine,1,[annotations/v2/figshare_2227162.json]
777,dipeptides,1,[annotations/v2/figshare_2532337.json]
778,diphenyl diselenide,1,[annotations/v2/figshare_3457004.json]


## Force Field Model 🛠️

In [15]:
# Force field model entities, ordered from most to least frequent
df = entity_data["FFM"].sort_values(by="count", ascending=False)
df

Unnamed: 0,entity,count,annotation_paths
0,charmm36,44,"[annotations/v2/figshare_14225919.json, annota..."
1,martini,34,"[annotations/v2/figshare_13836577.json, annota..."
2,charmm,28,"[annotations/v2/zenodo_1198454.json, annotatio..."
3,amber,25,"[annotations/v2/zenodo_7273800.json, annotatio..."
4,tip3p,25,"[annotations/v2/zenodo_259443.json, annotation..."
...,...,...,...
75,charmm36m.,1,[annotations/v2/zenodo_6592231.json]
76,charmm36mw,1,[annotations/v2/zenodo_5573728.json]
78,ff03*,1,[annotations/v2/figshare_2532337.json]
79,ff03w,1,[annotations/v2/figshare_2532337.json]


In [16]:
# Select the entities that occur exactly once in this class
occurs_once = df["count"].eq(1)
rare_df = df.loc[occurs_once]
num_rare, total_entities = rare_df.shape[0], df.shape[0]

print(f"Rare force field models (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare force field models (count = 1): 47 / 106 total entities


Unnamed: 0,entity,count,annotation_paths
88,gromos54a7,1,[annotations/v2/figshare_4806544.json]
91,lipid17,1,[annotations/v2/zenodo_3610470.json]
90,lipid14,1,[annotations/v2/zenodo_34415.json]
89,l-opls,1,[annotations/v2/figshare_14225919.json]
84,gap,1,[annotations/v2/zenodo_7112198.json]
87,gromos 56a6carbo/carbo r,1,[annotations/v2/figshare_7607222.json]
86,gromos 53a6,1,[annotations/v2/figshare_11771604.json]
85,gdml,1,[annotations/v2/zenodo_7112198.json]
93,martini 3,1,[annotations/v2/zenodo_5060102.json]
92,martini 2.2p,1,[annotations/v2/zenodo_6973476.json]


## Software name ⚙️

In [17]:
# Software name entities, ordered from most to least frequent
df = entity_data["SOFTNAME"].sort_values(by="count", ascending=False)
df

Unnamed: 0,entity,count,annotation_paths
0,gromacs,136,"[annotations/v2/zenodo_51760.json, annotations..."
1,lammps,28,"[annotations/v2/zenodo_7234728.json, annotatio..."
2,amber,19,"[annotations/v2/zenodo_6870476.json, annotatio..."
3,vmd,15,"[annotations/v2/figshare_1381865.json, annotat..."
4,charmm-gui,9,"[annotations/v2/zenodo_3988469.json, annotatio..."
...,...,...,...
52,gibbs,1,[annotations/v2/figshare_2267512.json]
53,github,1,[annotations/v2/zenodo_3610470.json]
54,glide,1,[annotations/v2/zenodo_3567651.json]
55,glycam,1,[annotations/v2/zenodo_5592299.json]


In [18]:
# Select the entities that occur exactly once in this class
occurs_once = df["count"].eq(1)
rare_df = df.loc[occurs_once]
num_rare, total_entities = rare_df.shape[0], df.shape[0]

print(f"Rare software names (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare software names (count = 1): 42 / 79 total entities


Unnamed: 0,entity,count,annotation_paths
68,pathsample,1,[annotations/v2/zenodo_7107608.json]
60,intermol,1,[annotations/v2/zenodo_5592299.json]
61,lipidbook,1,[annotations/v2/zenodo_13392.json]
62,maestro,1,[annotations/v2/zenodo_3362889.json]
63,molpro,1,[annotations/v2/zenodo_6791151.json]
64,molsim,1,[annotations/v2/zenodo_7523635.json]
65,namd2,1,[annotations/v2/zenodo_4245236.json]
66,optim,1,[annotations/v2/zenodo_7107608.json]
67,paramchem,1,[annotations/v2/zenodo_6988344.json]
71,pyinteraph,1,[annotations/v2/zenodo_1346073.json]


## Software version 🔢

In [19]:
# Software version entities, ordered from most to least frequent
df = entity_data["SOFTVERS"].sort_values(by="count", ascending=False)
df

Unnamed: 0,entity,count,annotation_paths
0,3.x,7,"[annotations/v2/zenodo_51760.json, annotations..."
1,4.0.7,7,"[annotations/v2/zenodo_14591.json, annotations..."
2,4.5,6,"[annotations/v2/zenodo_15550.json, annotations..."
3,16,4,"[annotations/v2/zenodo_3988469.json, annotatio..."
4,5.1.4,4,"[annotations/v2/zenodo_1198454.json, annotatio..."
5,2020,3,"[annotations/v2/zenodo_4245236.json, annotatio..."
6,5.0.3,3,"[annotations/v2/zenodo_30904.json, annotations..."
7,v. 5.1 >,3,"[annotations/v2/zenodo_1198454.json, annotatio..."
12,5.1.1,2,"[annotations/v2/zenodo_55565.json, annotations..."
14,simulation engine v5,2,"[annotations/v2/zenodo_247386.json, annotation..."


In [20]:
# Select the entities that occur exactly once in this class
occurs_once = df["count"].eq(1)
rare_df = df.loc[occurs_once]
num_rare, total_entities = rare_df.shape[0], df.shape[0]

print(f"Rare software versions (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare software versions (count = 1): 39 / 54 total entities


Unnamed: 0,entity,count,annotation_paths
43,software (v. 2016.4),1,[annotations/v2/zenodo_3901180.json]
36,5.0.3.,1,[annotations/v2/zenodo_34415.json]
37,5.0.6,1,[annotations/v2/zenodo_1293813.json]
38,5.0.x,1,[annotations/v2/zenodo_838635.json]
39,5.1.5,1,[annotations/v2/zenodo_6973476.json]
40,5.1.x,1,[annotations/v2/zenodo_6817824.json]
41,molecular dynamics,1,[annotations/v2/zenodo_6879091.json]
42,simulation engine version 2019.4,1,[annotations/v2/zenodo_3667662.json]
50,version 2.6.0,1,[annotations/v2/zenodo_3865919.json]
44,stable dec. 2018,1,[annotations/v2/zenodo_7273800.json]


## Simulation time ⏱

In [21]:
# Simulation time entities, ordered from most to least frequent
df = entity_data["STIME"].sort_values(by="count", ascending=False)
df

Unnamed: 0,entity,count,annotation_paths
0,500ns,12,"[annotations/v2/zenodo_1009027.json, annotatio..."
2,microsecond,11,"[annotations/v2/figshare_7924394.json, annotat..."
1,100ns,11,"[annotations/v2/zenodo_6582985.json, annotatio..."
3,200 ns,9,"[annotations/v2/zenodo_259443.json, annotation..."
4,500 ns,8,"[annotations/v2/zenodo_3613573.json, annotatio..."
...,...,...,...
48,12 μs,1,[annotations/v2/zenodo_7007107.json]
47,11 ns,1,[annotations/v2/zenodo_51750.json]
46,109 ns,1,[annotations/v2/zenodo_51747.json]
45,1080 ns,1,[annotations/v2/zenodo_3778112.json]


In [22]:
# Select the entities that occur exactly once in this class
occurs_once = df["count"].eq(1)
rare_df = df.loc[occurs_once]
num_rare, total_entities = rare_df.shape[0], df.shape[0]

print(f"Rare simulation times (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare simulation times (count = 1): 71 / 98 total entities


Unnamed: 0,entity,count,annotation_paths
79,40ns,1,[annotations/v2/zenodo_13853.json]
72,36 μs,1,[annotations/v2/figshare_7857659.json]
77,4.1 us,1,[annotations/v2/zenodo_7037843.json]
76,4-8 μs,1,[annotations/v2/figshare_13387534.json]
75,380 ns,1,[annotations/v2/zenodo_1198158.json]
...,...,...,...
48,12 μs,1,[annotations/v2/zenodo_7007107.json]
47,11 ns,1,[annotations/v2/zenodo_51750.json]
46,109 ns,1,[annotations/v2/zenodo_51747.json]
45,1080 ns,1,[annotations/v2/zenodo_3778112.json]


## Temperature 🌡️

In [23]:
# Temperature entities, ordered from most to least frequent
df = entity_data["TEMP"].sort_values(by="count", ascending=False)
df

Unnamed: 0,entity,count,annotation_paths
0,310k,17,"[annotations/v2/zenodo_1198454.json, annotatio..."
1,310 k,13,"[annotations/v2/zenodo_259443.json, annotation..."
2,300 k,10,"[annotations/v2/zenodo_6380887.json, annotatio..."
3,298 k,7,"[annotations/v2/zenodo_1167532.json, annotatio..."
4,323 k,6,"[annotations/v2/zenodo_51760.json, annotations..."
5,325k,5,"[annotations/v2/zenodo_1009027.json, annotatio..."
6,288k,4,"[annotations/v2/zenodo_3950029.json, annotatio..."
7,303k,4,"[annotations/v2/zenodo_1009607.json, annotatio..."
8,323k,4,"[annotations/v2/zenodo_14591.json, annotations..."
9,358k,4,"[annotations/v2/zenodo_3950029.json, annotatio..."


In [24]:
# Temperature entities that occur exactly once in the inventory
total_entities = len(df)
rare_df = df[df["count"] == 1]
num_rare = len(rare_df)

# The label names this section's class (it previously said "software versions",
# carried over from the software-version cell).
print(f"Rare temperatures (count = 1): {num_rare} / {total_entities} total entities")
rare_df

Rare software versions (count = 1): 39 / 57 total entities


Unnamed: 0,entity,count,annotation_paths
46,3500,1,[annotations/v2/zenodo_7234728.json]
39,315,1,[annotations/v2/zenodo_1009027.json]
40,315k,1,[annotations/v2/zenodo_1009027.json]
41,320,1,[annotations/v2/zenodo_2653735.json]
42,322k,1,[annotations/v2/zenodo_1009027.json]
43,333 k,1,[annotations/v2/zenodo_2653735.json]
44,338 k,1,[annotations/v2/zenodo_1009027.json]
45,340 k.,1,[annotations/v2/zenodo_6380887.json]
55,80-120 c,1,[annotations/v2/zenodo_7007107.json]
47,36.85 c,1,[annotations/v2/figshare_11764158.json]
