Let's look at the last day's worth of crates and see what is in the S3 index versus Git.

First, we need to load up the Git repo and build a list of recent publishes.

In [2]:
import json

from datetime import datetime, timedelta
from pygit2 import Repository, GIT_SORT_TIME

# How far to go back: currently one day.
since = datetime.now() - timedelta(days=1)

# Load the repo.
repo = Repository("crates.io-index")

# Build the added crate versions up.
added = []
for commit in repo.walk(repo.head.target, GIT_SORT_TIME):
    # Check if the diff is before the start time we care about.
    when = datetime.fromtimestamp(commit.commit_time)
    if when < since:
        break
    
    # Check if there are actually any new packages.
    diff = repo.diff(commit.parents[0], commit)
    if diff.stats.insertions == 0:
        continue
                
    # Parse out the new crate versions.
    for patch in diff:
        for hunk in patch.hunks:
            for line in hunk.lines:
                if line.new_lineno != -1:
                    version = json.loads(line.content)
                    added.append({
                        "version": version,
                        "when": when,
                    })
                    if len(added) % 100 == 0:
                        print(f"{len(added)} {when}")
    
len(added)

100 2023-04-11 17:18:53
200 2023-04-11 15:26:08
300 2023-04-11 14:47:20
400 2023-04-11 13:51:46
500 2023-04-11 13:00:58
600 2023-04-11 12:41:40
700 2023-04-11 11:49:55
800 2023-04-11 10:18:09
900 2023-04-11 09:41:50
1000 2023-04-11 08:53:15
1100 2023-04-11 08:06:04
1200 2023-04-11 07:56:25
1300 2023-04-11 06:46:51
1400 2023-04-11 06:25:42
1500 2023-04-11 05:25:05
1600 2023-04-11 04:43:28
1700 2023-04-11 03:32:52
1800 2023-04-11 01:36:32
1900 2023-04-11 00:32:00
2000 2023-04-11 00:09:41
2100 2023-04-11 00:01:24
2200 2023-04-10 23:28:44
2300 2023-04-10 22:23:58
2400 2023-04-10 21:48:35
2500 2023-04-10 20:23:32
2600 2023-04-10 19:19:24
2700 2023-04-10 18:59:36


2701

Now we have to check which packages are available in the sparse index.

In [4]:
import requests

def crate_name_path(name: str) -> str:
    name = name.lower()
    if len(name) == 1:
        return f"1/{name}"
    elif len(name) == 2:
        return f"2/{name}"
    elif len(name) == 3:
        return f"3/{name[0]}/{name}"
    else:
        return f"{name[0:2]}/{name[2:4]}/{name}"
    
def crate_index_contains_version(index: requests.Response, version: str) -> bool:
    for index_version in [json.loads(line) for line in [line.strip() for line in r.text.split("\n")] if len(line) > 0]:
        if index_version["vers"] == version:
            return True
    return False

# Since some crates have been published multiple times, we should cache the sparse indices on a per-crate basis.
sparse = {}
names = set([version["version"]["name"] for version in added])
session = requests.Session()
for i, name in enumerate(names):
    sparse[name] = set([
        json.loads(line)["vers"] for line in [
            line.strip() for line in session.get(f"https://index.crates.io/{crate_name_path(name)}").iter_lines()
        ] if len(line) > 0
    ])
    if i % 100 == 0:
        print(f"retrieved {i}/{len(names)} crates")
    
# Create a new list with just the missing package versions.
missing = []
for version in added:
    if version["version"]["vers"] not in sparse[version["version"]["name"]]:
        missing.append(version)
        
len(missing)

retrieved 0/558 crates
retrieved 100/558 crates
retrieved 200/558 crates
retrieved 300/558 crates
retrieved 400/558 crates
retrieved 500/558 crates


50

Let's dump out the crates and versions in a friendlier manner.

In [5]:
[(version["when"], version["version"]["name"], version["version"]["vers"]) for version in missing]

[(datetime.datetime(2023, 4, 11, 13, 27, 32), 'witchcraft-server', '3.5.0'),
 (datetime.datetime(2023, 4, 11, 13, 26, 27),
  'witchcraft-server-macros',
  '3.5.0'),
 (datetime.datetime(2023, 4, 11, 13, 26, 8),
  'witchcraft-server-config',
  '3.5.0'),
 (datetime.datetime(2023, 4, 11, 13, 2, 12), 'slothlang', '1.4.0'),
 (datetime.datetime(2023, 4, 11, 12, 48, 31), 'toad-jni', '0.8.0'),
 (datetime.datetime(2023, 4, 11, 12, 42, 44), 'tx5-demo', '0.0.1-alpha.9'),
 (datetime.datetime(2023, 4, 11, 12, 38, 11), 's2protocol', '1.0.0'),
 (datetime.datetime(2023, 4, 11, 12, 33, 41), 'serde_arrow', '0.6.1'),
 (datetime.datetime(2023, 4, 11, 12, 16, 16),
  'thinker',
  '0.2.0-beta.1+2023041101'),
 (datetime.datetime(2023, 4, 11, 12, 16, 16),
  'thinker',
  '0.2.0-beta.1+2023041102'),
 (datetime.datetime(2023, 4, 11, 12, 16, 16), 'thinker', '0.3.0-beta.1'),
 (datetime.datetime(2023, 4, 11, 12, 16, 16),
  'thinker',
  '0.3.0-beta.1+2023041101'),
 (datetime.datetime(2023, 4, 11, 12, 15, 55), 'spv-cro

Now let's see _when_ this happened.

In [8]:
import plotly.express as px
from datetime import timezone
from pandas import DataFrame

# We'll set the windows to be each hour, and format the times in UTC.
def time_window(time: datetime) -> str:
    return time.astimezone(timezone.utc).strftime("%a %H:00")

# Now build up a dict of each time window.
hours = {}

for version in added:
    window = time_window(version["when"])
    if window in hours:
        hours[window]["all"] += 1
    else:
        hours[window] = {"all": 0, "missing": 0}
        
for version in missing:
    window = time_window(version["when"])
    hours[window]["missing"] += 1
    
# Turn it into a data frame.
hours = DataFrame.from_dict(hours, orient="index")
    
# And we can chart it.
px.line(hours)