In [1]:
import pypistats
import pandas
import plotly.express as px
pypi_downloads = pypistats.overall("multiqc", mirrors=True, total="daily", format="pandas")
pypi_downloads = pypi_downloads.sort_values("date")
pypi_downloads = pypi_downloads[pypi_downloads["category"] == "with_mirrors"]
pypi_downloads

Unnamed: 0,category,date,percent,downloads
9,with_mirrors,2023-05-25,2.75%,5841
5,with_mirrors,2023-05-26,4.55%,9665
30,with_mirrors,2023-05-27,0.56%,1201
156,with_mirrors,2023-05-28,0.06%,136
37,with_mirrors,2023-05-29,0.46%,976
...,...,...,...,...
34,with_mirrors,2023-11-17,0.51%,1091
147,with_mirrors,2023-11-18,0.08%,171
151,with_mirrors,2023-11-19,0.07%,158
54,with_mirrors,2023-11-20,0.27%,584


In [29]:
pypi_downloads = pypi_downloads[["date", "downloads"]].set_index("date").cumsum().reset_index()

In [30]:
fig = px.line(
    pypi_downloads,
    x="date", y="downloads", title="PyPI downloads",
)
fig.update_yaxes(range=[0, 300_000])
fig

In [35]:
import pandas
import plotly.express as px

pypi_downloads_csv = "/Users/vlad/git/MultiQC/usage/plots/bquxjob_14f9646c_18bb3854e96.csv"
pypi_downloads = pandas.read_csv(pypi_downloads_csv)
pypi_downloads["date"] = pandas.to_datetime(pypi_downloads["download_month"])
pypi_downloads = pypi_downloads[["date", "total_downloads"]].set_index("date").cumsum().reset_index()
len(pypi_downloads)

fig = px.line(
    pypi_downloads,
    x="date", y="total_downloads", title="PyPI downloads",
)
# fig.update_yaxes(range=[0, 300_000])
fig

In [69]:
# export plot to svg
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
pio.write_image(fig, DIR / "multiqc_pypi.svg", width=1000, height=800)

In [33]:
import pandas as pd

sql = """
SELECT
  FORMAT_TIMESTAMP('%Y-%m', timestamp) AS download_month,
  COUNT(*) AS total_downloads
FROM `bigquery-public-data.pypi.file_downloads`
WHERE file.project = 'multiqc' AND timestamp >= '2014-01-01 00:00:00'
GROUP BY download_month
ORDER BY download_month;
"""
# fetch from BigQuery
df = pd.read_gbq(sql, project_id="vlad-savelyev")
df

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=725825577420-unm2gnkiprugilg743tkbig250f4sfsj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=Q5FF2ldUSTmwA4SST6zhUsr1IcyJiC&prompt=consent&access_type=offline


GenericGBQException: Reason: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/vlad-savelyev/jobs?prettyPrint=false: Access Denied: Project vlad-savelyev: User does not have bigquery.jobs.create permission in project vlad-savelyev.

Location: None
Job ID: d9768c37-7712-40cd-a984-420679dfef4f


In [82]:
from collections import Counter
import requests, json
import packaging
import pandas as pd
url = "https://raw.githubusercontent.com/bioconda/bioconda-plots/main/plots/multiqc/versions.json"
# [{"date":"2023-09-05","total":15949,"delta":18,"version":"1.10.1"}, ...
downloads_counter: Counter[str] = Counter()
response = requests.get(url)
if response.status_code == 200:
    items = json.loads(response.text)
    for item in items:
        downloads_counter[item["date"]] += item["total"]

downloads_by_version: dict[str, int] = {str(k): v for k, v in downloads_counter.items()}
downloads = pd.DataFrame(
    sorted(downloads_by_version.items()),
    columns=["date", "downloads"],
)
downloads

Unnamed: 0,date,downloads
0,2023-10-27,118987
1,2023-10-28,119370
2,2023-10-29,119461
3,2023-10-30,119599
4,2023-10-31,119967
5,2023-11-01,120473
6,2023-11-02,120925
7,2023-11-03,121318
8,2023-11-04,121669
9,2023-11-05,121792


In [88]:
import plotly.express as px
fig = px.line(
    downloads,
    x="date", y="downloads", title="Bioconda downloads",
)
fig.update_yaxes(range=[0, 150_000])
fig

In [61]:
from pathlib import Path
REPO_ROOT = Path("/Users/vlad/git/MultiQC/multiqc/modules")
import os
os.chdir(REPO_ROOT)
names = []
paths = []
dates = []
for path in REPO_ROOT.iterdir():
    if not path.is_dir():
        continue
    init_path = path / "__init__.py"
    if not init_path.exists():
        continue
    names.append(path.name)
    paths.append(init_path)
    cmd = f"git log --follow --format='%ai' -- {path} | tail -1"
    date = os.popen(cmd).read().strip()
    dates.append(date)

modules = pd.DataFrame(
    sorted(zip(names, dates), key=lambda e: e[1]),
    columns=["name", "date"],
)
modules["date"] = pd.to_datetime(modules["date"])
modules





Unnamed: 0,name,date
0,fastq_screen,2015-09-25 09:39:44+02:00
1,picard,2015-09-25 09:39:44+02:00
2,qualimap,2015-09-25 09:39:44+02:00
3,cutadapt,2015-09-25 09:39:44+02:00
4,bowtie2,2015-09-25 09:39:44+02:00
...,...,...
132,sourmash,2023-09-14 17:56:50-04:00
133,bracken,2023-09-22 09:24:26-07:00
134,truvari,2023-10-12 10:32:36+02:00
135,seqera_cli,2023-11-02 21:10:04+01:00


In [62]:
import plotly.express as px
# Plot the cumulative number of modules over time.
# That would be equivalent to the number of modules
# that have a date less than or equal to the current date
# (i.e. the number of modules that existed at that date)
modules = modules.sort_values("date")
# modules["total"] should be the number of module below modules["date"]
modules["total"] = modules.index + 1
modules


Unnamed: 0,name,date,total
0,fastq_screen,2015-09-25 09:39:44+02:00,1
1,picard,2015-09-25 09:39:44+02:00,2
2,qualimap,2015-09-25 09:39:44+02:00,3
3,cutadapt,2015-09-25 09:39:44+02:00,4
4,bowtie2,2015-09-25 09:39:44+02:00,5
...,...,...,...
132,sourmash,2023-09-14 17:56:50-04:00,133
133,bracken,2023-09-22 09:24:26-07:00,134
134,truvari,2023-10-12 10:32:36+02:00,135
135,seqera_cli,2023-11-02 21:10:04+01:00,136


In [63]:
fig = px.line(
    modules,
    x="date", y="total", title="Number of modules over time",
)
fig

In [64]:
# export plot to svg
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
pio.write_image(fig, DIR / "multiqc_modules.svg", width=1000, height=800)

In [2]:
# Pull requests
from github import Github
from collections import defaultdict

g = Github("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")
repo = g.get_repo("ewels/MultiQC")

pulls = repo.get_pulls(state='all', sort='created', direction='desc')

NameError: name 'pd' is not defined

In [30]:
import pandas as pd
import datetime

# Given that each pr in pulls has pr.created_at and pr.closed_at timestamp,
# we want to find the number of open PRs at each given day in the past.
# We can do that by iterating over all PRs and incrementing a counter
# for each day between pr.created_at and pr.closed_at.
# We can then plot the cumulative number of PRs over time.
# That would be equivalent to the number of PRs that have a date less than or equal to the current date
# (i.e. the number of PRs that existed at that date)
open_pr_count_by_day = defaultdict(int)
for pr in pulls:
    created_at = pr.created_at.date()
    closed_at = pr.closed_at
    if closed_at is None:
        closed_at = datetime.date.today()
    else:
        closed_at = closed_at.date()
    for week in pd.date_range(created_at, closed_at, freq="W"):
        # # format day to string 
        # day = day.strftime("%Y-%m-%d")
        open_pr_count_by_day[week] += 1

open_pr_count_by_day

defaultdict(int,
            {Timestamp('2023-11-19 00:00:00', freq='W-SUN'): 45,
             Timestamp('2023-11-12 00:00:00', freq='W-SUN'): 50,
             Timestamp('2023-11-05 00:00:00', freq='W-SUN'): 48,
             Timestamp('2023-10-29 00:00:00', freq='W-SUN'): 50,
             Timestamp('2023-10-22 00:00:00', freq='W-SUN'): 50,
             Timestamp('2023-10-15 00:00:00', freq='W-SUN'): 56,
             Timestamp('2023-10-08 00:00:00', freq='W-SUN'): 49,
             Timestamp('2023-10-01 00:00:00', freq='W-SUN'): 48,
             Timestamp('2023-09-24 00:00:00', freq='W-SUN'): 48,
             Timestamp('2023-09-17 00:00:00', freq='W-SUN'): 52,
             Timestamp('2023-09-10 00:00:00', freq='W-SUN'): 53,
             Timestamp('2023-09-03 00:00:00', freq='W-SUN'): 54,
             Timestamp('2023-08-27 00:00:00', freq='W-SUN'): 57,
             Timestamp('2023-08-20 00:00:00', freq='W-SUN'): 56,
             Timestamp('2023-08-13 00:00:00', freq='W-SUN'): 56,
        

In [31]:
# Plot the number of PRs over time.
import pandas as pd
df = pd.DataFrame(
    sorted(open_pr_count_by_day.items()),
    columns=["date", "count"],
)
df["date"] = pd.to_datetime(df["date"])
fig = px.line(
    df,
    x="date",
    y="count",
    title="Open pull requests on each week",
)
# save
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
pio.write_image(fig, DIR / "open_pull_requests_week.svg", width=1000, height=800)
fig

In [25]:
# Open issues on each day

import pandas as pd
import datetime
from github import Github
from collections import defaultdict

g = Github("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")
repo = g.get_repo("ewels/MultiQC")

issues = repo.get_issues(state='all', sort='created', direction='desc')
open_issues_count_by_day = defaultdict(int)
for issue in issues:
    created_at = issue.created_at.date()
    closed_at = issue.closed_at
    if closed_at is None:
        closed_at = datetime.date.today()
    else:
        closed_at = closed_at.date()
    for week in pd.date_range(created_at, closed_at, freq="W"):
        open_issues_count_by_day[week] += 1

open_issues_count_by_day

defaultdict(int,
            {Timestamp('2023-11-19 00:00:00', freq='W-SUN'): 265,
             Timestamp('2023-11-12 00:00:00', freq='W-SUN'): 276,
             Timestamp('2023-11-05 00:00:00', freq='W-SUN'): 274,
             Timestamp('2023-10-29 00:00:00', freq='W-SUN'): 277,
             Timestamp('2023-10-22 00:00:00', freq='W-SUN'): 277,
             Timestamp('2023-10-15 00:00:00', freq='W-SUN'): 288,
             Timestamp('2023-10-08 00:00:00', freq='W-SUN'): 277,
             Timestamp('2023-10-01 00:00:00', freq='W-SUN'): 276,
             Timestamp('2023-09-24 00:00:00', freq='W-SUN'): 272,
             Timestamp('2023-09-17 00:00:00', freq='W-SUN'): 289,
             Timestamp('2023-09-10 00:00:00', freq='W-SUN'): 287,
             Timestamp('2023-09-03 00:00:00', freq='W-SUN'): 287,
             Timestamp('2023-08-27 00:00:00', freq='W-SUN'): 287,
             Timestamp('2023-08-20 00:00:00', freq='W-SUN'): 286,
             Timestamp('2023-08-13 00:00:00', freq='W-SUN')

In [26]:
# Plot the number of issues over time.
import pandas as pd
df = pd.DataFrame(
    sorted(open_issues_count_by_day.items()),
    columns=["date", "count"],
)
df["date"] = pd.to_datetime(df["date"])
fig = px.line(
    df,
    x="date",
    y="count",
    title="Open issues on each week",
)
fig

In [29]:
# save
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
# save as svg
pio.write_image(fig, DIR / "open_issues_week.svg", width=1000, height=800)

In [60]:
import pandas as pd
import datetime
from github import Github
from collections import defaultdict

prs_df = pd.DataFrame(sorted(open_pr_count_by_day.items()), columns=["date", "count"])
prs_df["date"] = pd.to_datetime(prs_df["date"])
issues_df = pd.DataFrame(sorted(open_issues_count_by_day.items()), columns=["date", "count"])
issues_df["date"] = pd.to_datetime(issues_df["date"])

# plot prs and issues in a stacked plot
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=prs_df["date"], y=prs_df["count"], name="PRs"))
fig.add_trace(go.Scatter(x=issues_df["date"], y=issues_df["count"], name="Issues"))
fig.update_layout(title="Open issues and pull requests on each week")
fig

In [None]:
# save
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
# save as svg
pio.write_image(fig, DIR / "open_issues_and_prs_week.svg", width=1000, height=800)

In [58]:
# make stacked area plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=issues_df["date"], y=issues_df["count"], name="Issues", stackgroup='one'))
fig.add_trace(go.Scatter(x=prs_df["date"], y=prs_df["count"], name="PRs", stackgroup='one'))
fig.update_layout(title="Open issues and pull requests on each week")
fig

In [55]:
data_path = Path("/Users/vlad/git/MultiQC/usage/data")
df = pd.read_csv(data_path / "github-pull-requests.csv")
df["date"] = pandas.to_datetime(df["date"])
contributors = set()
entries = []

entries_by_week = dict()
for i, row in df.iterrows():
    day = row.date.date()
    nearest_sunday = day - datetime.timedelta(days=day.weekday())
    if nearest_sunday not in entries_by_week:
        entries_by_week[nearest_sunday] = dict(prs=0, contributors=set(), new_contributors=set())
    entries_by_week[nearest_sunday]["prs"] += 1
    entries_by_week[nearest_sunday]["contributors"].add(row.author)
    if row.author not in contributors:
        entries_by_week[nearest_sunday]["new_contributors"].add(row.author)
    contributors.add(row.author)

entries = []
for day, data in entries_by_week.items():
    entries.append({"date": day, "prs": data["prs"], "contributors": len(data["contributors"]), "new_contributors": len(data["new_contributors"])})

df = pd.DataFrame(entries)
# df_prs = df_prs.groupby("date").sum().reset_index()
df["prs"] = df["prs"].astype(int)
df["contributors"] = df["contributors"].astype(int)
df["new_contributors"] = df["new_contributors"].astype(int)

# plot
import plotly.express as px
fig = px.line(
    df,
    x="date",
    y="contributors",
    title="Contributors on each week",
)
fig


In [71]:

import plotly.express as px
import pandas as pd
import os
from mysql.connector import connect, Error
import requests
import packaging.version

# Clean up the data and build the data frame
releases_r = requests.get("https://api.github.com/repos/ewels/MultiQC/releases")
valid_versions = [v["tag_name"] for v in releases_r.json()] + ["v1.18"]

versions_by_week = []

try:
    # Establish a database connection
    with connect(
        host=os.getenv("HOST"),
        user=os.getenv("USERNAME"),
        passwd=os.getenv("PASSWORD"),
        db=os.getenv("DATABASE"),
    ) as connection:
        # Execute the query
        query = "SELECT row_key, num_checks FROM version_check_weekly"
        with connection.cursor() as cursor:
            cursor.execute(query)
            for row_key, num_checks in cursor.fetchall():
                print(row_key)
                week, version = row_key.split("_", 1)
                version = version.replace(".dev", "").replace("dev", "")
                version = ".".join(version.split(".")[:2])
                if f"v{version}" not in valid_versions:
                    # version = "unrecognised"
                    pass
                else:
                    version = packaging.version.parse(version)
                    versions_by_week.append([week, version, num_checks])
except Error as e:
    print(f"Error: {e}")

df = pd.DataFrame(versions_by_week, columns=["week", "version", "count"])

2016-03-07_0.5dev
2016-03-14_0.5dev
2016-03-21_0.5dev
2016-03-28_0.5
2016-03-28_0.5dev
2016-03-28_0.6dev
2016-04-04_0.5
2016-04-04_0.5dev
2016-04-04_0.6dev
2016-04-11_0.5
2016-04-11_0.5dev
2016-04-11_0.6dev
2016-04-18_0.5
2016-04-18_0.5dev
2016-04-18_0.6dev
2016-04-25_0.5
2016-04-25_0.5dev
2016-04-25_0.6
2016-04-25_0.6dev
2016-04-25_0.7dev
2016-05-02_0.5
2016-05-02_0.5dev
2016-05-02_0.6
2016-05-02_0.6dev
2016-05-02_0.7dev
2016-05-09_0.5
2016-05-09_0.6
2016-05-09_0.6dev
2016-05-09_0.7dev
2016-05-16_0.5
2016-05-16_0.5dev
2016-05-16_0.6
2016-05-16_0.6dev
2016-05-16_0.7dev
2016-05-23_0.5
2016-05-23_0.6
2016-05-23_0.7dev
2016-05-30_0.5
2016-05-30_0.5dev
2016-05-30_0.6
2016-05-30_0.6dev
2016-05-30_0.7dev
2016-06-06_0.5
2016-06-06_0.6
2016-06-06_0.7dev
2016-06-13_0.5
2016-06-13_0.6
2016-06-13_0.6dev
2016-06-13_0.7dev
2016-06-20_0.5
2016-06-20_0.6
2016-06-20_0.6dev
2016-06-20_0.7dev
2016-06-27_0.5
2016-06-27_0.6
2016-06-27_0.7dev
2016-07-04_0.5
2016-07-04_0.6
2016-07-04_0.7
2016-07-04_0.7dev
2

In [72]:
df = df.sort_values(["version", "week"], ascending=[False, True])
fig = px.bar(
    df,
    x="week",
    y="count",
    color="version",
    width=800,
    height=700,
    title="Usage per version over time",
).update_xaxes(categoryorder="total descending")
fig

In [66]:
# export plot to svg
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
pio.write_image(fig, DIR / "multiqc_version_check_weekly.svg", width=1000, height=800)

In [136]:
# Twitter

from bs4 import BeautifulSoup
import pandas as pd
import plotly.express as px

svg_content_old = """<path class="sparkline" d="M3,15L7.428571428571428,15.461538461538462L11.857142857142856,15.461538461538462L16.285714285714285,15L20.71428571428571,15L25.142857142857142,15L29.571428571428573,13.153846153846153L34,13.153846153846153L38.42857142857142,10.384615384615387L42.85714285714286,9.46153846153846L47.285714285714285,8.538461538461537L51.714285714285715,8.538461538461537L56.142857142857146,8.538461538461537L60.57142857142858,7.615384615384613L65,6.69230769230769L69.42857142857143,7.153846153846153L73.85714285714285,6.69230769230769L78.28571428571429,6.69230769230769L82.71428571428572,3L87.14285714285715,3L91.57142857142857,3L96,3L100.42857142857143,3L104.85714285714286,3L109.28571428571429,3L113.71428571428572,3L118.14285714285715,3L122.57142857142857,3"></path>"""

svg_content = """<path class="sparkline" d="M3,15L7.428571428571428,14.612903225806452L11.857142857142856,14.612903225806452L16.285714285714285,14.612903225806452L20.71428571428571,13.06451612903226L25.142857142857142,13.06451612903226L29.571428571428573,10.741935483870968L34,9.967741935483872L38.42857142857142,9.193548387096776L42.85714285714286,9.193548387096776L47.285714285714285,9.193548387096776L51.714285714285715,8.41935483870968L56.142857142857146,7.645161290322584L60.57142857142858,8.032258064516128L65,7.645161290322584L69.42857142857143,7.645161290322584L73.85714285714285,3L78.28571428571429,3L82.71428571428572,3L87.14285714285715,3L91.57142857142857,3L96,3L100.42857142857143,3L104.85714285714286,3L109.28571428571429,3L113.71428571428572,3L118.14285714285715,3L122.57142857142857,3"></path>"""

# Parse the SVG content using BeautifulSoup
soup = BeautifulSoup(svg_content, 'html.parser')
path = soup.find('path', {'class': 'sparkline'})['d']

# Extracting the coordinates from the 'path' element
# The format is generally "Mx,yLx,yLx,y..."
points = path.split('L')

# The first pair of coordinates follows 'M', so we handle it separately
first_point = points[0].split('M')[1]
coordinates = [tuple(map(float, first_point.split(',')))]

# Process the remaining points
for point in points[1:]:
    coordinates.append(tuple(map(float, point.split(','))))

# Creating a DataFrame from the extracted coordinates
df = pd.DataFrame(coordinates, columns=['x', 'y'])

# Display the DataFrame to verify
df

Unnamed: 0,x,y
0,3.0,15.0
1,7.428571,14.612903
2,11.857143,14.612903
3,16.285714,14.612903
4,20.714286,13.064516
5,25.142857,13.064516
6,29.571429,10.741935
7,34.0,9.967742
8,38.428571,9.193548
9,42.857143,9.193548


In [146]:
df["Followers"] = df["y"] - min(df["y"])
df["Followers"] = 31 - df["Followers"] * 31 / max(df["Followers"])
df["Days"] = df["x"] - min(df["x"])
df["Days"] = df["Days"] * 27 / max(df["Days"])
df

Unnamed: 0,x,y,Days,Followers
0,3.0,15.0,0.0,0.0
1,7.428571,14.612903,1.0,1.0
2,11.857143,14.612903,2.0,1.0
3,16.285714,14.612903,3.0,1.0
4,20.714286,13.064516,4.0,5.0
5,25.142857,13.064516,5.0,5.0
6,29.571429,10.741935,6.0,11.0
7,34.0,9.967742,7.0,13.0
8,38.428571,9.193548,8.0,15.0
9,42.857143,9.193548,9.0,15.0


In [149]:
# reversed x-axis
fig = px.line(
    df,
    x="Days",
    y="Followers",
    title="New Twitter followers over the last 28 days",
    range_x=[0, 27],
    range_y=[0, 33],
)
fig

In [152]:
# export plot to svg
import plotly.io as pio
from pathlib import Path
DIR = Path("/Users/vlad/git/MultiQC/usage/plots")
pio.write_image(fig, DIR / "twitter_followers.svg") 

In [3]:
from github import Github

g = Github("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")
repo = g.get_repo("ewels/MultiQC")

pulls = repo.get_pulls(state='all')
num_pull_requests = pulls.totalCount
print(f"Number of pull requests: {num_pull_requests}")

Number of pull requests: 812


In [40]:
import requests
url = "https://api.us-east-1.gallery.ecr.aws/getRepositoryCatalogData"
headers = {"Content-Type": "application/json"}
data = {"registryAliasName": "biocontainers", "repositoryName": "multiqc"}
response = requests.post(url, headers=headers, json=data)
count = response.json()["insightData"]["downloadCount"]
print(count)

{'catalogData': {}, 'insightData': {'downloadCount': 3130}}


In [61]:
url = f"https://hub.docker.com/v2/repositories/ewels/multiqc"
response = requests.get(url)
data = json.loads(response.text)
data

{'user': 'ewels',
 'name': 'multiqc',
 'namespace': 'ewels',
 'repository_type': 'image',
 'status': 1,
 'status_description': 'active',
 'description': 'Aggregate results from bioinformatics analyses across many samples into a single report.',
 'is_private': False,
 'is_automated': True,
 'star_count': 6,
 'pull_count': 45553,
 'last_updated': '2023-11-08T13:33:36.238861Z',
 'date_registered': '2017-10-02T15:06:14.663411Z',
 'collaborator_count': 0,
 'affiliation': None,
 'hub_user': 'ewels',
 'has_starred': False,
 'full_description': "# ![MultiQC](https://raw.githubusercontent.com/ewels/MultiQC/master/docs/images/MultiQC_logo.png)\n\n### Aggregate bioinformatics results across many samples into a single report\n\n##### Find [documentation](http://multiqc.info/docs) and [example reports](http://multiqc.info/examples/rna-seq/multiqc_report.html) at [http://multiqc.info](http://multiqc.info)\n\n[![PyPI Version](https://img.shields.io/pypi/v/multiqc.svg?style=flat-square)](https://pypi.

In [63]:
url = "https://api.us-east-1.gallery.ecr.aws/getRepositoryCatalogData"
headers = {"Content-Type": "application/json"}
data = {"registryAliasName": "biocontainers", "repositoryName": "multiqc"}
response = requests.post(url, headers=headers, json=data)
data = response.json()
data

{'catalogData': {}, 'insightData': {'downloadCount': 3128}}

In [29]:
import requests, json
url = "https://raw.githubusercontent.com/bioconda/bioconda-plots/main/plots/multiqc/versions.json"
response = requests.get(url)
# [{"date":"2023-09-05","total":15949,"delta":18,"version":"1.10.1"}, ...
conda = json.loads(response.text)
# for version in data:
#     downloads_counter[version["version"]] += version["total"]
# 
# downloads_by_version: dict[str, int] = {str(packaging.version.parse(k)): v for k, v in downloads_counter.items()}
conda

[{'date': '2023-10-26', 'total': 28718, 'delta': 92, 'version': '1.12'},
 {'date': '2023-10-27', 'total': 28783, 'delta': 65, 'version': '1.12'},
 {'date': '2023-10-28', 'total': 28825, 'delta': 42, 'version': '1.12'},
 {'date': '2023-10-29', 'total': 28834, 'delta': 9, 'version': '1.12'},
 {'date': '2023-10-30', 'total': 28840, 'delta': 6, 'version': '1.12'},
 {'date': '2023-10-31', 'total': 28864, 'delta': 24, 'version': '1.12'},
 {'date': '2023-11-01', 'total': 28921, 'delta': 57, 'version': '1.12'},
 {'date': '2023-11-02', 'total': 28976, 'delta': 55, 'version': '1.12'},
 {'date': '2023-11-03', 'total': 29014, 'delta': 38, 'version': '1.12'},
 {'date': '2023-11-04', 'total': 29052, 'delta': 38, 'version': '1.12'},
 {'date': '2023-11-05', 'total': 29057, 'delta': 5, 'version': '1.12'},
 {'date': '2023-11-06', 'total': 29072, 'delta': 15, 'version': '1.12'},
 {'date': '2023-11-07', 'total': 29138, 'delta': 66, 'version': '1.12'},
 {'date': '2023-11-08', 'total': 29197, 'delta': 59, '

In [49]:
import pandas as pd
import packaging.version

channel = "bioconda"
package = "multiqc"

PACKAGE_API_URL_TEMPLATE = "https://api.anaconda.org/package/{channel}/{package}"
url = PACKAGE_API_URL_TEMPLATE.format(channel=channel, package=package)
response = requests.get(url)
assert response.status_code == 200
    
package_info = json.loads(response.text)
from pprint import pprint
pprint(package_info)

{'app_entry': {},
 'app_summary': {},
 'app_type': {},
 'builds': ['py36_1',
            'py36_0',
            'py36_2',
            'py36_4',
            'py_0',
            'py_1',
            'py27h24bf2e0_0',
            'py27h24bf2e0_1',
            'py27h24bf2e0_2',
            'py_3',
            'py_4',
            'py_2',
            'pyhdfd78af_0',
            'py27_4',
            'py27_3',
            'py27_2',
            'py27_1',
            'py27_0',
            'py36h24bf2e0_2',
            'py36h24bf2e0_0',
            'py36h24bf2e0_1',
            'py35h24bf2e0_1',
            'py35h24bf2e0_0',
            'py35h24bf2e0_2',
            'pyh9f0ad1d_0',
            'py34_1',
            'py34_0',
            'pyhdfd78af_1',
            'py35_4',
            'py35_2',
            'py35_0',
            'py35_1'],
 'conda_platforms': ['linux-64', 'noarch', 'osx-64'],
 'created_at': '2016-02-26 12:58:31.950000+00:00',
 'description': '',
 'dev_url': 'https://github.com/ewe

In [None]:
downloads = []
for package_file_info in package_info["files"]:
    if "main" not in package_file_info["labels"]:
        continue
    if "conda" != package_file_info["type"]:
        continue
    downloads.append(
        {
            "package": package,
            "version": package_file_info["version"],
            "subdir": package_file_info["attrs"]["subdir"],
            "total": max(0, package_file_info["ndownloads"]),
        }
    )
df = pd.DataFrame(
    sorted(
        downloads,
        key=lambda e: (
            e["package"],
            packaging.version.parse(e["version"]),
            e["version"],
            e["subdir"],
        ),
    )
)
df

In [47]:
# merge all rows with different subdirs, sum df["total"]
df2 = df[df["version"] == "0.4"]
df2

Unnamed: 0,package,version,subdir,total
0,multiqc,0.4,linux-64,2237
1,multiqc,0.4,linux-64,568
2,multiqc,0.4,linux-64,728
3,multiqc,0.4,osx-64,115
4,multiqc,0.4,osx-64,3277
5,multiqc,0.4,osx-64,218


In [48]:
df2.groupby(["package", "version"]).sum().reset_index()

Unnamed: 0,package,version,subdir,total
0,multiqc,0.4,linux-64linux-64linux-64osx-64osx-64osx-64,7143
