In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from bisect import bisect
import seaborn as sns
from tqdm import tqdm

from dandi.dandiapi import DandiAPIClient
from collections import defaultdict

# Define helper functions

In [None]:
# bytes pretty-printing
# Size thresholds in descending order, each paired with the suffix to print.
# The last entry carries a (singular, plural) pair instead of a plain string.
UNITS_MAPPING = [
    (1 << 50, ' PB'),
    (1 << 40, ' TB'),
    (1 << 30, ' GB'),
    (1 << 20, ' MB'),
    (1 << 10, ' KB'),
    (1, (' byte', ' bytes')),
]


def pretty_size(bytes, units=UNITS_MAPPING):
    """
    Render a byte count as a short human-readable string.

    Simplified version of https://pypi.python.org/pypi/hurry.filesize/

    :param bytes: size to format
    :param units: (factor, suffix) pairs ordered from the largest factor to the
                  smallest; a suffix may be a (singular, plural) tuple

    :returns: the size divided by the first factor it reaches, truncated to an
              integer, followed by that factor's suffix (e.g. ``'1 KB'``)
    """
    # Stop at the first threshold the value reaches; when none is reached the
    # loop variables are left on the last (smallest) entry.
    for threshold, label in units:
        if threshold <= bytes:
            break

    count = int(bytes / threshold)

    # Tuple suffixes distinguish singular from plural.
    if isinstance(label, tuple):
        one, many = label
        label = one if count == 1 else many
    return f"{count}{label}"

In [None]:
def has_nwb(metadata):
    """
    Check whether a dandiset's metadata lists NWB among its data standards.

    A dandiset counts as NWB when any entry of
    ``metadata['assetsSummary']['dataStandard']`` has the identifier
    ``RRID:SCR_015242``.

    :param metadata: raw dandiset metadata dictionary

    :returns: True if a data-standard entry carries that identifier; False
              otherwise, including when the asset summary, the data-standard
              list, or an entry's identifier is missing or null
    """
    # The `or` fallbacks also cover keys that are present but explicitly null.
    assets_summary = metadata.get('assetsSummary') or {}
    standards = assets_summary.get('dataStandard') or []
    return any(
        isinstance(x, dict) and x.get('identifier') == 'RRID:SCR_015242'
        for x in standards
    )

In [None]:
def get_related_publications(metadata):
    """
    Return the DOI of the first publication that describes the dandiset.

    Scans ``metadata["relatedResource"]`` for an entry whose relation is
    ``dcite:IsDescribedBy`` and whose identifier starts with ``doi`` or
    ``https://doi``.

    :param metadata: raw dandiset metadata dictionary

    :returns: the identifier string of the first matching resource, or False
              when there is no such resource
    """
    doi_prefixes = ("doi", "https://doi")
    for resource in metadata.get("relatedResource", ()):
        # Only resources that describe the dataset qualify as publications.
        if resource["relation"] != "dcite:IsDescribedBy":
            continue
        if "identifier" not in resource:
            continue
        candidate = resource["identifier"]
        if candidate.startswith(doi_prefixes):
            return candidate
    return False

# Find Dandisets on the DANDI archive that use NWB and have an associated publication

In [None]:
client = DandiAPIClient()

dandisets = list(client.get_dandisets())

# Neurodata type names whose presence in `variableMeasured` flags a dandiset
# as containing the corresponding modality.
neurodata_type_map = dict(
    ecephys=["LFP", "Units", "ElectricalSeries"],
    ophys=["PlaneSegmentation", "TwoPhotonSeries", "ImageSegmentation"],
    icephys=[
        "PatchClampSeries",
        "VoltageClampSeries",
        "CurrentClampSeries",
        "CurrentClampStimulusSeries",
    ],
)

# Collect all dandisets with NWB data.
# `data` maps column name -> list of values; every kept dandiset must append
# exactly one value to every column so the lists stay aligned.
data = defaultdict(list)
for dandiset in tqdm(dandisets):
    dandiset = dandiset.for_version("draft")
    identifier = dandiset.identifier
    metadata = dandiset.get_raw_metadata()

    # Skip dandisets that do not declare NWB or whose draft version has no size
    if not has_nwb(metadata) or not dandiset.draft_version.size:
        continue
    assets_summary = metadata["assetsSummary"]

    data["identifier"].append(identifier)
    data["name"].append(metadata["name"])
    # Only contributors flagged for inclusion in the citation count as authors
    data["authors"].append([x["name"] for x in metadata.get("contributor", []) if x.get("includeInCitation", False)])
    data["created"].append(dandiset.created)
    data["size"].append(pretty_size(dandiset.draft_version.size))
    # Only the first listed species is recorded
    if "species" in assets_summary and len(assets_summary["species"]):
        data["species"].append(assets_summary["species"][0]["name"])
    else:
        data["species"].append(np.nan)

    # `variableMeasured` is read defensively like the other optional summary
    # fields; a dandiset without it is recorded as having none of the modalities
    # instead of aborting the whole collection with a KeyError.
    variables_measured = assets_summary.get("variableMeasured", [])
    for modality, ndtypes in neurodata_type_map.items():
        data[modality].append(
            any(x in ndtypes for x in variables_measured)
        )

    data["numberOfSubjects"].append(int(assets_summary.get("numberOfSubjects", 0)))
    data["numberOfFiles"].append(assets_summary.get("numberOfFiles", 0))
    data["related_pub"].append(get_related_publications(metadata))

df = pd.DataFrame.from_dict(data)

# Normalise species labels so each species appears under one consistent name.
# Each canonical name is listed with the raw labels that should collapse into it.
species_aliases = {
    "House mouse": ["Mus musculus - House mouse"],
    "Rat": [
        "Rattus norvegicus - Norway rat",
        "Brown rat",
        "Rat; norway rat; rats; brown rat",
    ],
    "Human": ["Homo sapiens - Human"],
    "Fruit fly": ["Drosophila melanogaster - Fruit fly"],
}

# Flatten to the raw label -> canonical name mapping
species_replacement = {
    alias: canonical
    for canonical, aliases in species_aliases.items()
    for alias in aliases
}

# No canonical name is itself a raw label, so one pass is equivalent to
# replacing the labels one at a time.
df["species"] = df["species"].replace(species_replacement)
    
    
# Parse the size of the dandiset and add it to the table
def dandiset_size_to_mb(values: list[str]):
    """
    Parse size strings from DANDI to translate them to an array of values in MB

    :param values: array of strings of the form `2 TB`, `5 GB`, `100 MB`, `12 bytes`

    :returns: List of floats with sizes in MB, using decimal multipliers
              (1 GB = 1000 MB, 1 byte = 0.000001 MB). Contains None for
              values that could not be converted (not a string, no unit,
              non-numeric amount, or unknown unit).
    """
    # Multiplier that converts an amount in each unit to MB
    mb_per_unit = {
        'PB': 1_000_000_000.,
        'TB': 1_000_000.,
        'GB': 1_000.,
        'MB': 1.,
        'KB': 0.001,
        # pretty_size() emits these two suffixes for sizes below 1 KB
        'byte': 0.000001,
        'bytes': 0.000001,
    }
    outvals = []
    for v in values:
        # Split on any run of whitespace; non-strings (e.g. NaN) yield no parts
        parts = v.split() if isinstance(v, str) else []
        if len(parts) < 2 or parts[1] not in mb_per_unit:
            outvals.append(None)
            continue
        try:
            amount = float(parts[0])
        except ValueError:
            # Amount is not a number: report it as unconvertible
            outvals.append(None)
            continue
        outvals.append(amount * mb_per_unit[parts[1]])
    return outvals

# Append the numeric size as the last column of the table, then show the table
df.insert(df.shape[1], 'size in MB', dandiset_size_to_mb(df['size']))

df

In [None]:
# Overall totals across the collected dandisets (sizes are stored in MB)
total_size_tb = df['size in MB'].sum() / 1_000_000.
print(f"Total size: {total_size_tb:.2f} TB")
total_files = int(df['numberOfFiles'].sum())
print(f"Total number of files: {total_files}")

In [None]:
# Horizontal bar chart of how many dandisets report each species
(
    df['species']
    .value_counts()
    .plot.barh(
        rot=0,
        title="Number of Dandisets by Species",
        xlabel="Number of Dandisets",
        ylabel="Species",
    )
)
plt.show()

In [None]:
# Keep only the dandisets that have a related publication.
# `related_pub` holds either an identifier string or False, so the elementwise
# `!= False` comparison is intentional here.
# reset_index() returns a new frame rather than modifying in place, so it has
# to be part of the chain for the renumbering to take effect; assign() then
# replaces `created` on that new frame instead of writing into a slice of `df`.
df2 = (
    df[df["related_pub"] != False]
    .reset_index(drop=True)
    .assign(created=lambda frame: frame["created"].apply(lambda x: x.date()))
)
df2

In [None]:
!pip install xlwt

In [None]:
# Write the table of dandisets with publications to `dandi_w_pubs.xls` in the
# current working directory.
# NOTE(review): the `.xls` extension presumably relies on the xlwt writer
# installed above; pandas 2.0 appears to have dropped that writer, so confirm
# the pandas version in use or consider exporting `.xlsx` instead.
df2.to_excel("dandi_w_pubs.xls")