In [1]:
import os
import datetime
from dotenv import load_dotenv
import pandas as pd
import altair as alt

In [2]:
# Cap the number of rows pandas renders so displayed tables stay short.
pd.set_option("display.max_rows", 30)

In [3]:
# Value passed as `width=` to the `.properties(...)` call of the Altair charts in this notebook.
WIDTH = 650

In [4]:
from IPython.display import Markdown
from IPython.core.magic import register_cell_magic

@register_cell_magic
def markdown(line, cell):
    """Render the cell body as Markdown after filling in `{name}` placeholders.

    The cell text is passed through `str.format` with this module's global
    variables as keyword arguments, so `{some_global}` (including attribute
    access such as `{obj.attr}`) is replaced by its current value.

    Args:
        line: Text following `%%markdown` on the magic line; not used.
        cell: The body of the cell, treated as a format template.

    Returns:
        An IPython `Markdown` display object built from the formatted text.
    """
    namespace = globals()
    rendered = cell.format(**namespace)
    return Markdown(rendered)

In [5]:
# load_dotenv() runs first (presumably populating the environment from a .env
# file); GH_TOKEN is then read from the environment and is None when unset.
load_dotenv()
GH_TOKEN = os.environ.get("GH_TOKEN")

# Living with Machines GitHub Statistics report
This report provides some high level statistics about GitHub repositories contained under the [Living with Machines Organization](https://github.com/Living-with-machines). The report is mainly intended to provide some stats to aid in reporting to our funder.

### Contents
* [Report details](#report-details)
* [Report statistics](#repo-stats)
    * [Overview of repositories](#repo-overview)
    * [Repository file content](#repo-content)
    * [Stars and Forks](#repo-stars)
    * [Views](#repo-views)
    * [Clones](#repo-clones)

## Report details <a class="anchor" id="report-details"></a>

In [6]:
# Stamp the report with the date on which the notebook was executed.
generated_on = datetime.date.today()
Markdown(f"This report is updated automatically every week, last generated on: **{generated_on}**")

This report is updated automatically every week, last generated on: **2021-05-31**

In [7]:
from gh_orgstats.stats import *

In [8]:
# Stats object for the "Living-with-machines" organisation, built with the
# token read from the environment. OrgStats presumably comes from the
# `gh_orgstats.stats` star import -- confirm.
lwm_stats = OrgStats(GH_TOKEN, "Living-with-machines")

In [9]:
# Names of the organisation's public repositories, in the order OrgStats lists them.
public_names = [public_repo.name for public_repo in lwm_stats.public_repos]

# Repository Statistics <a class="anchor" id="repo-stats"></a>

The rest of this document outlines some high level stats for GitHub repositories under the Living with Machines [GitHub Organization](https://github.com/Living-with-machines).

## Overview of repositories <a class="anchor" id="repo-overview"></a>

Currently Living with Machines has the following public repositories: 

In [10]:
# One-column table of public repository names.
df = pd.DataFrame(data=public_names, columns=['Repositories'])
# Number the rows from 1 instead of 0 for display.
df.index = df.index + 1
df

Unnamed: 0,Repositories
1,lwm_ARTIDIGH_2020_OCR_impact_downstream_NLP_tasks
2,lwm_GIR19_resolving_places
3,D3_JS_viz_in_a_Python_Jupyter_notebook
4,DeezyMatch
5,LwM_SIGSPATIAL2020_ToponymMatching
6,histLM
7,AzureAudit
8,maps-at-scale-hack-day-notebooks
9,deduplify
10,AtypicalAnimacy


In [11]:
%%markdown
Additionally, Living with Machines has {lwm_stats.private_repo_count} private repositories which have not yet been published.

Additionaly Living with Machines has 14 private repositories which have not yet been published


## Repository file content <a class="anchor" id="repo-content"></a>
This section provides an overview of what type of content is in Living with Machines repositories by looking at file extension counts. **Note that these counts are based only on default branches, so they will undercount for private repositories.**

In [12]:
# File-extension frequencies for public and private repositories, each
# wrapped in its own DataFrame.
public_ext_counts = lwm_stats.get_org_file_ext_frequency("public")
df = pd.DataFrame.from_dict(public_ext_counts)
private_ext_counts = lwm_stats.get_org_file_ext_frequency("private")
df_private = pd.DataFrame.from_dict(private_ext_counts)

In [13]:
# Sum the '.ipynb' row of each frame and round the result for use in the
# sentence rendered by the next cell.
public_notebook_count = round(df.loc['.ipynb', :].sum())
private_notebook_count = round(df_private.loc['.ipynb', :].sum())

### Number of Jupyter notebooks
Living with Machines has promised a particular focus on making methods available to other researchers; one way in which the project is aiming to do this is through the production of Jupyter notebooks. 

In [14]:
%%markdown
Living with Machines currently has {public_notebook_count} Jupyter notebooks in public repositories and {private_notebook_count} in private repositories.

Living with Machines currently has 72 Jupyter notebooks in public repositories and 28 in private repositories.


### File extensions counts by repository

In [15]:
# Rebuild the public file-extension frame and turn its index into a regular
# "index" column so it can serve as the id column when melting.
ext_frequency = lwm_stats.get_org_file_ext_frequency("public")
df = pd.DataFrame.from_dict(ext_frequency).reset_index()
data = df.melt(id_vars=["index"], value_name="count")

# Order the bars by their summed count, largest first.
by_total_count = alt.EncodingSortField(field="count", order="descending", op="sum")
bars = alt.Chart(data).mark_bar()
chart = bars.encode(
    y=alt.Y("index:N", title=None, sort=by_total_count),
    x=alt.X("count:Q"),
    color=alt.Color("variable:N", title="Repository"),
    tooltip=[alt.Tooltip("variable:N", title="Repository"), "count:Q"],
).properties(width=WIDTH, title='File Extension Counts')
chart

### File extensions by repository

This chart shows the number and type of files for each public Living with Machines repository **note** that this only considers files on the default branch of the repository.  

In [16]:
# Same long-form table as the previous chart, but here each bar is a
# "variable" value (shown as the repository) coloured by the "index" value
# (shown as the file extension).
ext_frequency = lwm_stats.get_org_file_ext_frequency("public")
df = pd.DataFrame.from_dict(ext_frequency).reset_index()
data = df.melt(id_vars=["index"], value_name="count")

# Order the bars by their summed count, largest first.
by_total_count = alt.EncodingSortField(field="count", order="descending", op="sum")
bars = alt.Chart(data).mark_bar()
chart = bars.encode(
    y=alt.Y("variable:N", title=None, sort=by_total_count),
    x=alt.X("count:Q", title="File counts", axis=alt.Axis(tickRound=False)),
    color=alt.Color("index:N", title="File extension"),
    tooltip=[alt.Tooltip("index:N", title="file extension"), "count:Q"],
).properties(width=WIDTH, title='File extensions by Repository')
chart

## Stars and Forks <a class="anchor" id="repo-stars"></a>

[Stars](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/saving-repositories-with-stars) are a GitHub feature which allows registered GitHub users to 'bookmark' a repository. A GitHub [fork](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) indicates that someone has made a copy of the repository into their own account. Since stars and forks are only relevant for public repos, we only report these below.

In [17]:
# Snapshot stats for the public repositories, one row per key of the
# returned mapping (orient='index').
snapshot = lwm_stats.get_org_snapshot_stats(lwm_stats.public_repos)
df = pd.DataFrame.from_dict(snapshot, orient='index')
# Append a 'Total' row holding the column-wise sum of the numeric columns.
column_totals = df.sum(axis=0, numeric_only=True)
df.loc['Total'] = column_totals
df

Unnamed: 0,stars,forks
lwm_ARTIDIGH_2020_OCR_impact_downstream_NLP_tasks,5,1
lwm_GIR19_resolving_places,5,1
D3_JS_viz_in_a_Python_Jupyter_notebook,6,1
DeezyMatch,53,5
LwM_SIGSPATIAL2020_ToponymMatching,0,0
histLM,2,0
AzureAudit,0,0
maps-at-scale-hack-day-notebooks,0,1
deduplify,0,1
AtypicalAnimacy,2,1


## Views <a class="anchor" id="repo-views"></a>

GitHub provides owners of a repository with some traffic statistics, including view stats. These are broken into 'unique' and 'total' views.

### Total Views

In [18]:
# Views traffic for the public repositories. The cells below resample it by
# date and select columns with a two-level key whose second level is
# 'total_views' / 'unique_views'.
df = lwm_stats.get_org_views_traffic(public_only=True, load=True)

In [19]:
idx = pd.IndexSlice
# Down-sample to weekly means; the cells below chart and tabulate this frame.
df = df.resample("W").mean()
# NOTE(review): the figures below are sums of these weekly means rather than
# of the counts as loaded -- confirm this is what "total views" should mean.
views_per_repo = df.loc[:, idx[:, 'total_views']].sum()  # one summed value per 'total_views' column
total = views_per_repo.sum()
# `mean` averages the per-column sums, so it is an average per repository,
# not per day; the sentence below is worded accordingly.
mean = views_per_repo.mean()
Markdown(f"Living with Machines public repositories have generated {round(total)} total views to date with an average of {round(mean)} views per repository")

Living with Machines public repositories have generated 2143 total views to date with an average of 113 daily views

In [20]:
# Long-form copy of the views frame for Altair: the date index becomes an
# "index" column and every remaining column is melted into rows under "count".
data = df.reset_index().melt(id_vars=['index'], value_name='count')

In [21]:
# Interval selection shared by both charts: it is attached to the area chart
# and used to filter the rows shown in the bar chart.
brush = alt.selection(type="interval")

# Stacked area of summed counts over time, restricted to the 'total_views' rows.
views_area = alt.Chart(data).mark_area().encode(
    x=alt.X("index:T", axis=alt.Axis(tickCount=24, title="Date")),
    y="sum(count):Q",
    color=alt.Color("variable_0:N", title="Repository"),
    tooltip="sum(count)",
)
only_total_views = alt.FieldEqualPredicate(field="variable_1", equal="total_views")
chart = (
    views_area
    .transform_filter(only_total_views)
    .properties(width=WIDTH, title="Total views")
    .add_selection(brush)
)

# Bars per view type, limited to the rows inside the brushed interval.
views_bars = alt.Chart(data).mark_bar().encode(
    y=alt.Y("variable_1:N", title=None),
    x="count:Q",
    color=alt.Color("variable_0:N", title="Repository"),
)
bar = views_bars.properties(title="View by type").transform_filter(brush)


## Views over time

The top chart shows us the total views over time by repository, the bottom histogram breaks this down by view type

In [22]:
# Stack the area chart above the bar chart in a single vertical layout.
alt.vconcat(chart, bar)

## A breakdown of total views for each public Living with Machines repository:

In [23]:
# Select the 'total_views' columns, stack the innermost column level into
# the row index, then drop that level again so only the original row index
# remains.
total_view_columns = df.loc[:, idx[:, 'total_views']]
df_total = total_view_columns.stack()
df_total.index = df_total.index.droplevel(1)
# Column-wise sums as whole numbers, shown as a one-column table.
views_by_repo = df_total.sum().round(0).astype(int)
pd.DataFrame(views_by_repo, columns=['total_views'])

Unnamed: 0,total_views
AtypicalAnimacy,164
AzureAudit,18
Computer-Vision-for-the-Humanities-workshop,34
D3_JS_viz_in_a_Python_Jupyter_notebook,97
DeezyMatch,577
GazFuse,8
Jupyter-Notebooks-The-Weird-and-Wonderful,46
LwM_SIGSPATIAL2020_ToponymMatching,150
PressPicker_public,58
computer-vision-DHNordic-2020-workshop,30


### Unique views
Unique views aim to not count the same person visiting a repository multiple times

In [24]:
# Shorthand for building the two-level column slices used in the `.loc`
# selections of this section (same binding as earlier in the notebook).
idx = pd.IndexSlice

In [25]:
# One summed value per 'unique_views' column of the current views frame.
unique_per_repo = df.loc[:, idx[:, 'unique_views']].sum()
total = unique_per_repo.sum()
# `mean` averages the per-column sums, so it is an average per repository,
# not per day; the sentence below is worded accordingly.
mean = unique_per_repo.mean()
Markdown(f"Living with Machines public repositories have generated {round(total)} unique views to date with an average of {round(mean)} views per repository")

Living with Machines public repositories have generated 519 unique views to date with an average of 27 daily views per repository

### A breakdown of total unique views for each public Living with Machines repository:

In [26]:
# Work on a separate frame instead of overwriting `df`: once `df` has been
# replaced by the stacked result it no longer has two-level columns, so this
# cell and the view-summary cells above would fail if run again.
df_unique = df.loc[idx[:], idx[:,'unique_views']].stack()
# Drop the stacked level so only the original row index remains.
df_unique.index = df_unique.index.droplevel(1)
# Column-wise sums as whole numbers, shown as a one-column table.
pd.DataFrame(df_unique.sum().round(0).astype(int),columns=['unique_views'])

Unnamed: 0,unique_views
AtypicalAnimacy,30
AzureAudit,10
Computer-Vision-for-the-Humanities-workshop,7
D3_JS_viz_in_a_Python_Jupyter_notebook,39
DeezyMatch,138
GazFuse,6
Jupyter-Notebooks-The-Weird-and-Wonderful,9
LwM_SIGSPATIAL2020_ToponymMatching,29
PressPicker_public,23
computer-vision-DHNordic-2020-workshop,12


### Mean views by day of week 

In [27]:
# Reload the views traffic: `df` was resampled to weekly means earlier in the
# notebook, which is too coarse for a day-of-week breakdown.
df = lwm_stats.get_org_views_traffic(public_only=True, load=True)
# Aggregate per calendar day. Two-day bins are labelled with their first day
# only, so `day(index)` below would blend two weekdays into a single bar.
df = df.resample("D").mean()
data = pd.melt(frame=df.reset_index(), id_vars=['index'],value_name='count')
# Mean count per weekday, stacked by view type.
alt.Chart(data).mark_bar().encode(
    x="mean(count)",
    y=alt.Y("day(index):O", title="Day"),
    color=alt.Color("variable_1", title="View type"),
).properties(width=WIDTH, title="Mean views by Day")


## Clones <a class="anchor" id="repo-clones"></a>

Clones indicate how often a repository is 'downloaded' from GitHub:
> [Cloning a repository pulls down a full copy of all the repository data that GitHub has at that point in time, including all versions of every file and folder for the project.](https://docs.github.com/en/free-pro-team@latest/github/creating-cloning-and-archiving-repositories/cloning-a-repository)

Clones are one way in which we may also be able to assess whether people are making use of a repository. Like views, clones are also broken down into unique and total values. 

### Total Clones

In [28]:
# Clones traffic for the public repositories. The next cell selects columns
# with a two-level key whose second level is 'total_clones'.
df = lwm_stats.get_org_clones_traffic(public_only=True, load=True)

In [29]:
# One summed value per 'total_clones' column of the clones frame.
clones_per_repo = df.loc[:, idx[:, 'total_clones']].sum()
total = clones_per_repo.sum()
# `mean` averages the per-column sums, so it is an average per repository,
# not per day; the sentence below is worded accordingly.
mean = clones_per_repo.mean()
Markdown(f"Living with Machines public repositories have generated {round(total)} clones to date with an average of {round(mean)} clones per repository")

Living with Machines public repositories have generated 898 clones to date with an average of 47 daily clones

## Total clones over time across all repositories 

In [30]:
# Reload the clones traffic, average it over 2-day bins, and melt it into
# long form (date in "index", values in "count") for the charts below.
clones_loaded = lwm_stats.get_org_clones_traffic(public_only=True, load=True)
df = clones_loaded.resample("2D").mean()
data = df.reset_index().melt(id_vars=['index'], value_name='count')

In [31]:
# Interval selection shared by both charts: it is attached to the area chart
# and used to filter the rows shown in the bar chart.
brush = alt.selection(type="interval")
# Stacked area of summed counts over time, restricted to the 'total_clones' rows.
chart = (
    alt.Chart(data)
    .mark_area()
    .encode(
        x=alt.X("index:T", axis=alt.Axis(tickCount=24, title="Date")),
        y="sum(count):Q",
        color=alt.Color("variable_0:N", title="Repository"),
    )
    .transform_filter(alt.FieldEqualPredicate(field="variable_1", equal="total_clones"))
    .properties(width=WIDTH, title="Total clones")
    .add_selection(brush)
)

# Bars per clone type, limited to the rows inside the brushed interval.
bar = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        y=alt.Y("variable_1:N", title=None),
        x="count:Q",
        color=alt.Color("variable_0:N", title="Repository"),
        tooltip="count:Q",
    )
    .properties(title="Clones by type")  # title previously misspelled as "Clons"
    .transform_filter(brush)
)

# Area chart on top, bar chart underneath.
chart & bar

### Total clones counts by repository 

In [32]:
# Reload the clones traffic and show the sum of every column as a whole
# number in a one-column table.
df = lwm_stats.get_org_clones_traffic(public_only=True, load=True)
clone_totals = df.sum().round(0).astype(int)
pd.DataFrame(clone_totals, columns=['Clone counts'])

Unnamed: 0,Unnamed: 1,Clone counts
lwm_ARTIDIGH_2020_OCR_impact_downstream_NLP_tasks,total_clones,20
lwm_ARTIDIGH_2020_OCR_impact_downstream_NLP_tasks,unique_clones,18
lwm_GIR19_resolving_places,total_clones,10
lwm_GIR19_resolving_places,unique_clones,10
D3_JS_viz_in_a_Python_Jupyter_notebook,total_clones,119
...,...,...
PressPicker_public,unique_clones,26
Computer-Vision-for-the-Humanities-workshop,total_clones,6
Computer-Vision-for-the-Humanities-workshop,unique_clones,6
Jupyter-Notebooks-The-Weird-and-Wonderful,total_clones,13


In [33]:
# Aggregate per calendar day for the day-of-week chart that follows. Two-day
# bins are labelled with their first day only, so grouping them by weekday
# would blend two weekdays into a single bar.
df = df.resample("D").mean()

### Mean clones by day of week

In [34]:
# Melt the resampled clones frame into long form (date in "index", values in
# "count") and plot the mean count per weekday, stacked by clone type.
data = df.reset_index().melt(id_vars=['index'], value_name='count')
weekday_axis = alt.Y("day(index):O", title="Day")
clone_type_colour = alt.Color("variable_1", title="Clone type")
(
    alt.Chart(data)
    .mark_bar()
    .encode(x="mean(count)", y=weekday_axis, color=clone_type_colour)
    .properties(width=WIDTH, title="Mean clones by Day")
)