# Data Analysis

## Global Imports

In [1]:
import numpy as np
import pandas as pd

## Functions

### Data access functions

In [2]:
from pathlib import Path


def data(root: Path) -> Path:
    """Return the ``data`` directory located directly under *root*."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the output directory ``<root>/data/output``.

    The path is built from *root* directly rather than through ``data(root)``:
    further down in this notebook the module-level name ``data`` is rebound to
    a DataFrame, after which a call through that name would fail.
    """
    return root / "data" / "output"


def raw_results(root: Path) -> Path:
    """Return the path ``<root>/data/raw_results/raw_results.csv``.

    The path is built from *root* directly rather than through ``data(root)``:
    further down in this notebook the module-level name ``data`` is rebound to
    a DataFrame, so re-running the loading cell would otherwise fail.
    """
    return root / "data" / "raw_results" / "raw_results.csv"


def subjects(root: Path) -> Path:
    """Return the path ``<root>/data/raw_results/subjects.csv``.

    The path is built from *root* directly rather than through ``data(root)``:
    further down in this notebook the module-level name ``data`` is rebound to
    a DataFrame, after which a call through that name would fail.

    NOTE(review): the name ``subjects`` itself is later rebound to the subjects
    DataFrame, so this helper is only reachable before that cell has run.
    """
    return root / "data" / "raw_results" / "subjects.csv"


def commits(root: Path) -> Path:
    """Return the path ``<root>/data/raw_results/commits.csv``.

    The path is built from *root* directly rather than through ``data(root)``:
    further down in this notebook the module-level name ``data`` is rebound to
    a DataFrame, after which a call through that name would fail.
    """
    return root / "data" / "raw_results" / "commits.csv"


def truth(root: Path) -> Path:
    """Return the path ``<root>/data/truth/truth.csv``.

    The path is built from *root* directly rather than through ``data(root)``:
    further down in this notebook the module-level name ``data`` is rebound to
    a DataFrame, after which a call through that name would fail.
    """
    return root / "data" / "truth" / "truth.csv"


### Function to summarize DataFrame

In [3]:
from collections.abc import Callable, Sequence
from dataclasses import dataclass


# Signature of a metric function: takes a DataFrame and returns a single value.
MetricFn = Callable[[pd.DataFrame], object]


def _require_cols(g: pd.DataFrame, cols: Sequence[str]) -> None:
    missing = [c for c in cols if c not in g.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")


# ---------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------
def m_n_subjects(g: pd.DataFrame) -> int:
    """Return the number of rows in *g* (one row is one subject)."""
    row_count = len(g)
    return int(row_count)


def m_n_repos(g: pd.DataFrame) -> int:
    """Return ``nunique()`` of the ``full_name_of_repo`` column as an int.

    Raises ``KeyError`` (via ``_require_cols``) when the column is absent.
    """
    column = "full_name_of_repo"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_commits(g: pd.DataFrame) -> int:
    """Return ``nunique()`` of the ``commit_sha`` column as an int.

    Raises ``KeyError`` (via ``_require_cols``) when the column is absent.
    """
    column = "commit_sha"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_distinct_paths(g: pd.DataFrame) -> int:
    """Return ``nunique()`` of the ``path`` column as an int.

    Raises ``KeyError`` (via ``_require_cols``) when the column is absent.
    """
    column = "path"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_positive_rate(g: pd.DataFrame) -> float:
    """Return the sum of ``is_ccdc_event`` divided by the number of rows.

    An empty frame yields ``np.nan`` instead of dividing by zero.
    Raises ``KeyError`` (via ``_require_cols``) when the column is absent.
    """
    _require_cols(g, ["is_ccdc_event"])
    total_rows = len(g)
    if total_rows == 0:
        return np.nan
    positives = g["is_ccdc_event"].sum()
    return float(positives / total_rows)


def m_n_distinct_channels(g: pd.DataFrame) -> int:
    """Count the distinct channels found across all rows of *g*.

    The ``detected_channels`` cells are exploded into one value per row,
    missing values are removed, and the distinct remainder is counted.
    Raises ``KeyError`` (via ``_require_cols``) when the column is absent.
    """
    _require_cols(g, ["detected_channels"])
    flattened = g["detected_channels"].explode()
    present = flattened.dropna()
    return int(present.nunique())


# ---------------------------------------------------------------------
# Registry: metric_name -> (fn, required_columns)
# ---------------------------------------------------------------------
# Each entry maps a metric name to its function and the columns that
# function reads; insertion order is the order shown here.
METRICS: dict[str, tuple[MetricFn, tuple[str, ...]]] = dict(
    n_subjects=(m_n_subjects, ()),
    n_repos=(m_n_repos, ("full_name_of_repo",)),
    n_commits=(m_n_commits, ("commit_sha",)),
    n_distinct_paths=(m_n_distinct_paths, ("path",)),
    positive_rate=(m_positive_rate, ("is_ccdc_event",)),
    n_distinct_channels=(m_n_distinct_channels, ("detected_channels",)),
)


# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
@dataclass(frozen=True)
class SummarizeConfig:
    """Immutable settings consumed by ``summarize_subjects_configurable``."""

    # Names of the metrics to compute; each must be a key of METRICS.
    include_metrics: tuple[str, ...]
    # Columns the frame was grouped by.
    group_keys: tuple[str, ...] = ()
    # When True, the group keys are added to the set of forbidden columns.
    forbid_access_to_group_keys: bool = True
    # When True (and group_keys is non-empty), the group-key columns are
    # dropped from the frame before the metric functions see it.
    drop_group_keys_from_frame: bool = True
    # Further columns that no requested metric may require.
    deny_columns: tuple[str, ...] = ()


# ---------------------------------------------------------------------
# Core summarization
# ---------------------------------------------------------------------
def summarize_subjects_configurable(
    g: pd.DataFrame,
    cfg: SummarizeConfig,
) -> pd.Series:
    """Compute the metrics selected in *cfg* for the frame *g*.

    Parameters
    ----------
    g : pd.DataFrame
        Rows to summarize.
    cfg : SummarizeConfig
        Which metrics to compute and which columns they must not rely on.

    Returns
    -------
    pd.Series
        One entry per requested metric, keyed by metric name, in the order
        of ``cfg.include_metrics``.

    Raises
    ------
    KeyError
        If a requested metric is not registered in ``METRICS``.
    ValueError
        If a requested metric requires a forbidden column.
    """
    # Optionally hide the grouping columns from the metric functions.
    if cfg.drop_group_keys_from_frame and cfg.group_keys:
        frame = g.drop(columns=list(cfg.group_keys), errors="ignore")
    else:
        frame = g

    # Columns no metric is allowed to depend on.
    blocked = set(cfg.deny_columns)
    if cfg.forbid_access_to_group_keys:
        blocked.update(cfg.group_keys)

    results: dict[str, object] = {}

    for name in cfg.include_metrics:
        entry = METRICS.get(name)
        if entry is None:
            raise KeyError(
                f"Unknown metric: {name}. "
                f"Known: {sorted(METRICS)}"
            )

        metric_fn, needed_cols = entry

        offending = [column for column in needed_cols if column in blocked]
        if offending:
            raise ValueError(
                f"Metric '{name}' requires forbidden columns {offending}. "
                f"Forbidden: {sorted(blocked)}"
            )

        results[name] = metric_fn(frame)

    return pd.Series(results)

### Tuple with each and every available metric

In [4]:
# Names of all metrics; these mirror the keys of the METRICS registry and
# must be kept in sync with it by hand.
ALL_METRICS = (
    "n_subjects",
    "n_repos",
    "n_commits",
    "n_distinct_paths",
    "positive_rate",
    "n_distinct_channels",
)

### Function to cast columns of DataFrame to Int64

In [5]:
from collections.abc import Iterable

def col_to_int(df: pd.DataFrame, int_cols: Iterable[str]) -> pd.DataFrame:
    """Cast the listed columns of *df* to the nullable ``Int64`` dtype.

    Names in *int_cols* that are not columns of *df* are skipped.
    The frame is modified in place and also returned.
    """
    present = [name for name in int_cols if name in df.columns]
    converted = df[present].astype("Int64")
    df[present] = converted
    return df

#### "Popular" Int64 columns

In [6]:
# Summary columns that hold counts and are cast to Int64 via col_to_int().
INT_COLS_OF_SUMMARY = [
    "n_subjects",
    "n_repos",
    "n_commits",
    "n_distinct_paths",
    "n_distinct_channels",
]

# value_counts_stats() columns that are cast to Int64 via col_to_int().
# NOTE(review): the quantile-based entries (median/q1/q3/iqr) are whole
# numbers only when the quantiles happen to fall on integers; confirm the
# cast is safe before applying it to other inputs.
INT_COLS_OF_VC_STATS = [
    "n_categories",
    "total_count",
    "min_count",
    "max_count",
    "median_count",
    "q1_count",
    "q3_count",
    "iqr_count",
]

### Boxplot stats function

**Includes the arithmetic mean.**

In [7]:
def boxplot_stats(s: pd.Series) -> pd.Series:
    """Return the figures behind a box plot of *s*, plus the mean.

    Missing values are dropped first. The whiskers are the most extreme
    observations that still lie within 1.5 * IQR of the quartiles; every
    observation beyond a whisker is counted as an outlier.

    Returns a Series with the keys ``q1``, ``median``, ``mean``, ``q3``,
    ``iqr``, ``lower_whisker``, ``upper_whisker``, ``n_outliers``, ``min``,
    ``max`` and ``n`` (in that order).
    """
    values = s.dropna()

    first_q, second_q, third_q = (
        values.quantile(q) for q in (0.25, 0.50, 0.75)
    )
    spread = third_q - first_q

    # Fences bound the region in which the whiskers may end.
    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread

    whisker_lo = values[values >= low_fence].min()
    whisker_hi = values[values <= high_fence].max()

    is_outlier = (values < whisker_lo) | (values > whisker_hi)

    summary = {
        "q1": first_q,
        "median": second_q,
        "mean": values.mean(),
        "q3": third_q,
        "iqr": spread,
        "lower_whisker": whisker_lo,
        "upper_whisker": whisker_hi,
        "n_outliers": is_outlier.sum(),
        "min": values.min(),
        "max": values.max(),
        "n": len(values),
    }
    return pd.Series(summary)


### Function to get descriptive statistics for a given value_counts() result

In [8]:
def value_counts_stats(vc: pd.Series) -> pd.Series:
    """
    Descriptive statistics for a value_counts() result.

    Parameters
    ----------
    vc : pd.Series
        Output of value_counts(): index = category, values = counts.
        The counts may arrive in any order; they are ranked internally.

    Returns
    -------
    pd.Series
        Descriptive statistics of the distribution of counts. An input
        without any non-missing count yields an empty float64 Series.
    """
    vc = vc.dropna()

    if vc.empty:
        return pd.Series(dtype="float64")

    # Rank explicitly: the top-k figures below are positional, so relying on
    # the caller to pass counts in descending order would silently produce
    # wrong concentration numbers for unsorted input.
    ranked = vc.sort_values(ascending=False)

    total = ranked.values.sum()
    n_categories = len(ranked)

    q1 = ranked.quantile(0.25)
    q2 = ranked.quantile(0.50)
    q3 = ranked.quantile(0.75)
    iqr = q3 - q1

    top_1 = ranked.iloc[0]
    # Slicing past the end is safe: with fewer than k categories the slice
    # covers everything and the sum equals the total.
    top_5_sum = ranked.iloc[:5].sum()
    top_10_sum = ranked.iloc[:10].sum()

    return pd.Series({
        # structure
        "n_categories": n_categories,
        "total_count": total,

        # distribution of counts
        "min_count": ranked.min(),
        "max_count": ranked.max(),
        "mean_count": ranked.mean(),
        "median_count": q2,
        "q1_count": q1,
        "q3_count": q3,
        "iqr_count": iqr,

        # concentration / dominance
        "top_1_count": top_1,
        "top_1_share": top_1 / total,
        "top_5_share": top_5_sum / total,
        "top_10_share": top_10_sum / total,
    })

## Data Preparation

### Reading the raw data

In [9]:
# Load the raw results CSV located relative to the current working directory.
raw_data = pd.read_csv(
    raw_results(Path.cwd()),
    low_memory=False,
)

### Working with a copy of the raw data

In [10]:
# Work on a copy so raw_data stays untouched.
# NOTE: this rebinds the name `data`, which up to this point referred to the
# path helper function data(root) defined above.
data = raw_data.copy()

### Dropping unnecessary columns / Keeping interesting ones

I have decided not to analyse the snapshot data of a repo. Why not? Because it only reflects the point in time at which I gathered the data. If a repo "has GitHub Discussions", I have no idea since when that has been the case. What if I drew conclusions even though GitHub Discussions had been enabled for that repo just the day before I pulled the data?

In [11]:
# Columns kept for the analysis; everything else is dropped below.
COLS_OF_INTEREST = [
    "full_name_of_repo",
    "commit_sha",
    "path",
    "is_ccdc_event",
    "detected_channel",
    "created_at",
    "pushed_at",
    "updated_at",
    "date",
]

# Every column of the frame that is not explicitly of interest.
UNNECESSARY_COLS = [col for col in data.columns if col not in COLS_OF_INTEREST]

data.drop(columns=UNNECESSARY_COLS, inplace=True)

### Data conversion – if needed

In [12]:
def to_bool(series: pd.Series) -> pd.Series:
    """Translate textual ``true``/``false`` values into booleans.

    A Series that already has dtype ``bool`` is returned unchanged.
    Otherwise each value is converted to a string, stripped and lower-cased;
    ``"true"`` maps to ``True``, ``"false"`` to ``False`` and anything else
    to a missing value.
    """
    already_bool = series.dtype == bool
    if not already_bool:
        normalized = series.astype("string").str.strip().str.lower()
        return normalized.map({"true": True, "false": False})
    return series


# Normalize the event flag to booleans (unrecognized values become missing).
data["is_ccdc_event"] = to_bool(data["is_ccdc_event"])

# Channels become strings; a missing channel is represented as "".
data["detected_channel"] = data["detected_channel"].astype("string").fillna("")

DATETIME_COLS = ["created_at", "pushed_at", "updated_at", "date"]

# Parse timestamps as UTC; errors="coerce" is passed so that unparseable
# values do not abort the conversion.
for col in DATETIME_COLS:
    if col in data.columns:
        data[col] = pd.to_datetime(data[col], errors="coerce", utc=True)


### Channel aggregation – Introducing the subjects dataframe

Objective: One row = one subject

In [13]:
# A subject is identified by repository, commit and file path.
KEY_COLS = [
    "full_name_of_repo",
    "commit_sha",
    "path",
]

# Group the raw rows per subject; dropna=False is passed so that rows with a
# missing key value are still grouped.
gb = data.groupby(KEY_COLS, dropna=False)


def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse the channels of one group into a sorted tuple of unique names.

    Values that are not strings after conversion (i.e. missing values) and
    strings that are empty or whitespace-only are discarded.
    """
    distinct = {
        value
        for value in x.astype("string").tolist()
        if isinstance(value, str) and value.strip()
    }
    return tuple(sorted(distinct))


# One row per subject: take the first event flag and collect all channels.
agg = gb.agg(
    is_ccdc_event=("is_ccdc_event", "first"),
    detected_channels=("detected_channel", agg_channels),
)

# Carry over the remaining columns of interest, taking the first value per
# subject. "detected_channel" is skipped because it has already been
# aggregated into "detected_channels".
for col in COLS_OF_INTEREST:
    if col not in KEY_COLS and col not in agg.columns and col != "detected_channel":
        agg[col] = gb[col].first()

# NOTE: this rebinds the name `subjects`, which up to this point referred to
# the path helper function subjects(root) defined above.
subjects = agg.reset_index()

### Adding "first_activity" column

The first activity of a repo is the earliest timestamp known for it: either its earliest commit or its official creation date, whichever comes first.

In [14]:
# Per row take the earlier of commit date and repo creation date, then
# broadcast the minimum of those values within each repo to all of its rows.
subjects["first_activity"] = (
    subjects[["date", "created_at"]]
        .min(axis=1)
        .groupby(subjects["full_name_of_repo"], dropna=False)
        .transform("min")
)

### Adding "repo_age" column

In [15]:
# Age of the repo at commit time, expressed in (fractional) days: the time
# difference is divided by a one-day timedelta.
subjects["repo_age"] = (
    (subjects["date"] - subjects["first_activity"]).to_numpy()
    / np.timedelta64(1, "D")
)

# Order the subjects per repo by age and renumber the index accordingly.
subjects.sort_values(["full_name_of_repo", "repo_age"], inplace=True)
subjects.reset_index(drop=True, inplace=True)

### Adding "repo_age_group" column

In [16]:
AGE_GROUPS = [
    (0, 1, "0-1"),
    (1, 2, "1-2"),
    (2, 3, "2-3"),
    (3, 4, "3-4"),
    (4, 5, "4-5"),
    (5, 6, "5-6"),
    (6, 7, "6-7"),
    (7, 8, "7-8"),
    (8, 9, "8-9"),
    (9, 10, "9-10"),
    (10, 11, "10-11"),
    (11, 12, "11-12"),
    (12, 13, "12-13"),
    (13, 14, "13-14"),
    (14, 15, "14-15"),
    (15, 999, "15+"),
]


def assign_age_group(age_in_days: float) -> int | None:
    if pd.isna(age_in_days):
        return None
    for lo, hi, label in AGE_GROUPS:
        if lo * 365.25 <= age_in_days < hi * 365.25:
            return lo
    return None


# Bucket each subject by repo age; the stored value is the bucket's lower
# bound in years.
subjects["repo_age_group"] = subjects["repo_age"].apply(assign_age_group)

### Adding "year" column for temporal analysis

In [17]:
# Calendar year of the commit timestamp.
subjects["year"] = subjects["date"].dt.year

### Extracting a dedicated repos dataframe

In [18]:
# One row per repo, taking the first value of each repo-level timestamp.
# NOTE(review): unlike the earlier groupbys this one does not pass
# dropna=False; confirm that a missing repo name is not expected here.
repos = (
    subjects
        .groupby("full_name_of_repo", as_index=False)
        .agg(
            created_at=("created_at", "first"),
            pushed_at=("pushed_at", "first"),
            updated_at=("updated_at", "first"),
            first_activity=("first_activity", "first"),
        )
)

# The repo-level columns now live in `repos`; remove them from `subjects`.
subjects.drop(
    columns=["created_at", "pushed_at", "updated_at", "first_activity"],
    inplace=True,
)

### Adding "last_activity" and "age_today" columns to repos dataframe

In [19]:
# Last activity: the later of the pushed_at and updated_at timestamps.
repos["last_activity"] = repos[["pushed_at", "updated_at"]].max(axis=1)

# Time span between first and last activity.
repos["age_today"] = repos["last_activity"] - repos["first_activity"]

# Largest span first, with a fresh index.
repos.sort_values("age_today", ascending=False, inplace=True)
repos.reset_index(drop=True, inplace=True)

## Descriptive Statistics

### How many of this and that?

In [20]:
# No grouping: summarize all subjects at once.
group_keys = []
cfg = SummarizeConfig(
    include_metrics=ALL_METRICS,
    group_keys=tuple(group_keys),
)

# Turn the resulting Series into a one-row DataFrame.
summary = summarize_subjects_configurable(subjects, cfg).to_frame().T

summary = col_to_int(summary, INT_COLS_OF_SUMMARY)

In [21]:
summary

Unnamed: 0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
0,17694,364,17523,5,0.204928,33


### Earliest commit and latest commit

In [22]:
# Locate the rows holding the chronologically first and last commit. Repo,
# SHA and timestamp are all read from that same row so that the printed
# pieces always belong together.
earliest_commit = subjects["date"].idxmin()
earliest_commit_repo = subjects.loc[earliest_commit, "full_name_of_repo"]
earliest_commit_sha = subjects.loc[earliest_commit, "commit_sha"]
print(f"Earliest commit? {earliest_commit_sha}@{earliest_commit_repo} at {subjects.loc[earliest_commit, 'date']}")
# The latest commit is the one with the greatest date. (The greatest
# repo_age would instead pick the commit made when its repo was oldest.)
latest_commit = subjects["date"].idxmax()
latest_commit_repo = subjects.loc[latest_commit, "full_name_of_repo"]
latest_commit_sha = subjects.loc[latest_commit, "commit_sha"]
print(f"Latest commit? {latest_commit_sha}@{latest_commit_repo} at {subjects.loc[latest_commit, 'date']}")

Earliest commit? 77533b76fbc2f0fd72445f8f3afb5d5278d4f4aa@gctools-outilsgc/gcconnex at 2008-08-04 11:09:52+00:00
Latest commit? 8f00f28a3fe1629d6fae66507ad9af30f7698f58@twilio/twilio-java at 2025-12-30 15:55:36+00:00


### Subjects grouped by repo

In [23]:
group_keys = ["full_name_of_repo"]

# n_repos is left out here: the frame is grouped by repo.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The group key is moved into the index before grouping, so it is no longer
# a regular column of the frames handed to the summarizer.
repo_summaries = (
    subjects
        .set_index("full_name_of_repo")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

repo_summaries = col_to_int(repo_summaries, INT_COLS_OF_SUMMARY)
# Repos with the most subjects first.
repo_summaries.sort_values("n_subjects", ascending=False, inplace=True)

In [24]:
repo_summaries

Unnamed: 0_level_0,n_subjects,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
full_name_of_repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
spencermountain/compromise,570,565,2,0.150877,10
flexera-public/policy_templates,506,502,2,0.150198,9
plotly/plotly.js,367,365,2,0.130790,12
apache/shardingsphere,348,344,2,0.212644,15
danieleteti/delphimvcframework,331,331,1,0.190332,17
...,...,...,...,...,...
lessworks/translation,1,1,1,0.000000,1
mackmobile/Contador,1,1,1,0.000000,0
hwdtech/smartforms,1,1,1,0.000000,0
minio/minio-boshrelease,1,1,1,1.000000,2


### Subjects grouped by path

In [25]:
group_keys = ["path"]

# n_distinct_paths is left out here: the frame is grouped by path.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The group key is moved into the index before grouping, so it is no longer
# a regular column of the frames handed to the summarizer.
path_summaries = (
    subjects
        .set_index("path")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

path_summaries = col_to_int(path_summaries, INT_COLS_OF_SUMMARY)
# Paths with the most subjects first.
path_summaries.sort_values("n_subjects", ascending=False, inplace=True)

path_summaries

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,positive_rate,n_distinct_channels
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
README.md,16172,355,16172,0.195709,32
CONTRIBUTING.md,909,99,909,0.333333,30
readme.md,412,10,412,0.274272,19
README.txt,174,15,174,0.229885,14
contributing.md,27,3,27,0.185185,3


### Detected channels

In [26]:
# Flatten the per-subject channel tuples into one value per row, discard
# missing values and count how often each channel occurs.
channel_counts = (
    subjects["detected_channels"]
        .explode()
        .dropna()
        .value_counts()
)

In [27]:
channel_counts

detected_channels
form                  643
website               624
forum                 580
pull_request          574
fork                  562
issues                545
mail                  460
mailing_list          284
github_issues         256
github_wiki           242
ping_on_github        233
telegram              192
blog                  152
jira                  148
linkedin              134
slack                 126
facebook               95
twitter                89
rss_feed               86
github_discussions     68
irc                    67
discord                66
gitter                 57
stack_overflow         53
google_group           27
patreon                20
medium                 15
youtube                13
meetup                  5
reddit                  5
newsletter              5
skype                   4
zulip                   2
Name: count, dtype: int64

In [28]:
# Descriptive statistics of the channel counts as a one-row DataFrame.
channel_stats = value_counts_stats(channel_counts).to_frame().T
channel_stats = col_to_int(channel_stats, INT_COLS_OF_VC_STATS)

channel_stats

Unnamed: 0,n_categories,total_count,min_count,max_count,mean_count,median_count,q1_count,q3_count,iqr_count,top_1_count,top_1_share,top_5_share,top_10_share
0,33,6432,2,643,194.909091,95,27,256,229,643.0,0.099969,0.463775,0.741604


In [29]:
# Report the channels with the highest and the lowest count.
print(f"Most frequently detected channel? {channel_counts.idxmax()} with {channel_counts.max()} counts")
print(f"Least commonly recognized channel? {channel_counts.idxmin()} with {channel_counts.min()} counts")

Most frequently detected channel? form with 643 counts
Least commonly recognized channel? zulip with 2 counts


### Age (in days) of the repo at the time of the commit

In [30]:
# Box-plot figures for the repo age (in days) at commit time.
repo_age_stats = boxplot_stats(subjects["repo_age"])
print(repo_age_stats)

q1                 170.259450
median             733.159954
mean              1103.426051
q3                1758.364465
iqr               1588.105014
lower_whisker        0.000000
upper_whisker     4139.273380
n_outliers         288.000000
min                  0.000000
max               5859.094514
n                17694.000000
dtype: float64


### Age group of the repo at the time of the commit

In [31]:
# Box-plot figures for the age bucket (lower bound in years) at commit time.
repo_age_group_stats = boxplot_stats(subjects["repo_age_group"])
print(repo_age_group_stats)

q1                   0.000000
median               2.000000
mean                 2.606533
q3                   4.000000
iqr                  4.000000
lower_whisker        0.000000
upper_whisker       10.000000
n_outliers         343.000000
min                  0.000000
max                 15.000000
n                17694.000000
dtype: float64


### Year in which the commit took place

In [32]:
# Box-plot figures for the calendar year of the commit.
year_stats = boxplot_stats(subjects["year"])
print(year_stats)

q1                2015.000000
median            2017.000000
mean              2017.629875
q3                2020.000000
iqr                  5.000000
lower_whisker     2008.000000
upper_whisker     2025.000000
n_outliers           0.000000
min               2008.000000
max               2025.000000
n                17694.000000
dtype: float64


## Temporal Trends

### Yearly summary

In [33]:
group_keys = ["year"]

cfg = SummarizeConfig(
    include_metrics=ALL_METRICS,
    group_keys=tuple(group_keys),
)

# All metrics per commit year. The group key is moved into the index before
# grouping, so it is no longer a regular column of the frames handed to the
# summarizer.
yearly_summary = (
    subjects
        .set_index("year")
        .groupby(group_keys)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

yearly_summary = col_to_int(yearly_summary, INT_COLS_OF_SUMMARY)

In [34]:
yearly_summary

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,6,1,6,1,0.166667,3
2009,34,2,34,2,0.117647,3
2010,59,6,58,2,0.152542,11
2011,52,9,51,2,0.173077,9
2012,246,26,244,4,0.231707,15
2013,644,71,643,4,0.256211,20
2014,1459,123,1441,4,0.210418,27
2015,1941,175,1919,5,0.210716,23
2016,2935,219,2910,5,0.191482,29
2017,2571,242,2549,5,0.196033,29


## Project Lifecycle Analysis

In [35]:
group_keys = ["repo_age_group"]

cfg = SummarizeConfig(
    include_metrics=ALL_METRICS,
    group_keys=tuple(group_keys),
)

# All metrics per repo age bucket. The group key is moved into the index
# before grouping, so it is no longer a regular column of the frames handed
# to the summarizer.
summary_per_repo_age_group = (
    subjects
        .set_index("repo_age_group")
        .groupby(group_keys)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

summary_per_repo_age_group = col_to_int(summary_per_repo_age_group, INT_COLS_OF_SUMMARY)

In [36]:
summary_per_repo_age_group

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
repo_age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6397,339,6335,5,0.205878,32
1,2430,251,2421,4,0.212346,29
2,1995,216,1978,4,0.202506,26
3,1439,211,1423,5,0.1918,26
4,1214,173,1198,5,0.215815,27
5,1024,138,1020,5,0.230469,25
6,895,124,880,4,0.175419,26
7,674,114,666,5,0.183976,25
8,586,92,578,3,0.191126,25
9,377,82,364,3,0.238727,25


## Channel Evolution

### Function to get value counts for a group

In [37]:
def vc_detected_channels(
    g: pd.DataFrame,
    *,
    drop_empty: bool = True,
) -> pd.Series:
    """
    Count how often each channel occurs within one group g.

    g["detected_channels"] is expected to hold list-likes (e.g. tuples) of
    channel names; cells may also be empty or missing.

    Parameters
    ----------
    g : pd.DataFrame
        Group to evaluate; must contain the column "detected_channels".
    drop_empty : bool
        When True, channel names that are empty or whitespace-only are
        excluded from the counts.

    Returns
    -------
    pd.Series
        Counts per channel, named "count", with the index named "channel".
    """
    # One channel per row; missing entries are removed before the values are
    # converted to strings.
    channels = g["detected_channels"].explode().dropna().astype("string")

    if drop_empty:
        is_filled = channels.str.strip() != ""
        channels = channels[is_filled]

    counts = channels.value_counts(dropna=False)

    # Give the result the same labels regardless of the input.
    return counts.rename("count").rename_axis("channel")

### Channel counts per year

In [38]:
group_keys = ["year"]

# Channel counts within each year.
channels_per_year = (
    subjects
        .groupby(group_keys)[["detected_channels"]]
        .apply(vc_detected_channels)
)

# Wide layout built with unstack; missing combinations are filled with 0.
channels_matrix = channels_per_year.unstack(fill_value=0).reset_index()

# Long layout; the columns used below are "year", "channel" and "count".
channels_per_year = channels_per_year.reset_index()

# Descriptive statistics of the channel counts of each year.
stats = (
    channels_per_year
        .groupby(group_keys)["count"]
        .apply(value_counts_stats)
).unstack(fill_value=0)

# Number of channels to list per year at either end of the ranking.
n = 3

# The n most frequent channels of each year.
top_n_per_year = (
    channels_per_year
        .sort_values(["year", "count"], ascending=[True, False])
        .groupby("year", dropna=False, sort=False)
        .head(n)
        .reset_index(drop=True)
)

# The n least frequent channels of each year (same ordering, taken from the
# end of each group).
bottom_n_per_year = (
    channels_per_year
        .sort_values(["year", "count"], ascending=[True, False])
        .groupby("year", dropna=False, sort=False)
        .tail(n)
        .reset_index(drop=True)
)

In [39]:
channels_matrix

channel,year,blog,discord,facebook,fork,form,forum,github_discussions,github_issues,github_wiki,...,reddit,rss_feed,skype,slack,stack_overflow,telegram,twitter,website,youtube,zulip
0,2008,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,3,0,0
1,2009,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,3,0,0
2,2010,0,0,0,0,7,7,0,0,6,...,0,0,0,0,3,0,2,6,0,0
3,2011,0,1,0,0,12,10,0,0,2,...,0,0,0,0,0,0,0,3,0,0
4,2012,1,1,0,2,12,10,0,1,4,...,0,0,0,0,0,0,1,3,0,0
5,2013,7,4,0,18,32,36,4,11,1,...,0,2,0,0,0,0,9,39,0,0
6,2014,14,3,2,22,57,48,7,19,12,...,2,4,1,0,6,2,7,71,0,0
7,2015,20,0,7,85,76,74,7,23,20,...,0,12,0,12,4,0,6,56,0,0
8,2016,19,2,12,109,89,71,10,50,34,...,0,29,1,28,5,20,12,90,3,0
9,2017,11,3,4,66,103,74,7,29,31,...,1,11,0,9,12,6,10,69,8,0


In [40]:
stats

Unnamed: 0_level_0,n_categories,total_count,min_count,max_count,mean_count,median_count,q1_count,q3_count,iqr_count,top_1_count,top_1_share,top_5_share,top_10_share
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2008,3.0,5.0,1.0,3.0,1.666667,1.0,1.0,2.0,1.0,3.0,0.6,1.0,1.0
2009,3.0,5.0,1.0,3.0,1.666667,1.0,1.0,2.0,1.0,3.0,0.6,1.0,1.0
2010,11.0,42.0,1.0,7.0,3.818182,3.0,2.0,6.0,4.0,7.0,0.166667,0.714286,0.97619
2011,9.0,38.0,1.0,12.0,4.222222,3.0,2.0,3.0,1.0,12.0,0.315789,0.815789,1.0
2012,15.0,60.0,1.0,12.0,4.0,2.0,1.0,5.5,4.5,12.0,0.2,0.716667,0.916667
2013,20.0,297.0,1.0,39.0,14.85,9.5,6.25,21.5,15.25,39.0,0.131313,0.599327,0.818182
2014,27.0,545.0,1.0,71.0,20.185185,12.0,3.5,26.0,22.5,71.0,0.130275,0.550459,0.80367
2015,23.0,710.0,2.0,85.0,30.869565,20.0,7.0,58.5,51.5,85.0,0.119718,0.509859,0.807042
2016,29.0,985.0,1.0,109.0,33.965517,20.0,10.0,50.0,40.0,109.0,0.11066,0.481218,0.75533
2017,29.0,775.0,1.0,103.0,26.724138,12.0,6.0,35.0,29.0,103.0,0.132903,0.51871,0.781935


In [41]:
top_n_per_year

Unnamed: 0,year,channel,count
0,2008,website,3
1,2008,mail,1
2,2008,skype,1
3,2009,website,3
4,2009,mail,1
5,2009,skype,1
6,2010,form,7
7,2010,forum,7
8,2010,github_wiki,6
9,2011,form,12


In [42]:
bottom_n_per_year

Unnamed: 0,year,channel,count
0,2008,website,3
1,2008,mail,1
2,2008,skype,1
3,2009,website,3
4,2009,mail,1
5,2009,skype,1
6,2010,ping_on_github,2
7,2010,jira,1
8,2010,linkedin,1
9,2011,github_wiki,2


### Channel counts per repo age group

In [43]:
group_keys = ["repo_age_group"]

# Channel counts within each repo age bucket.
channels_per_ag = (
    subjects
        .groupby(group_keys)[["detected_channels"]]
        .apply(vc_detected_channels)
)

# Wide layout built with unstack, indexed by repo_age_group; missing
# combinations are filled with 0.
channels_matrix_per_ag = channels_per_ag.unstack(fill_value=0).reset_index()
channels_matrix_per_ag.set_index("repo_age_group", inplace=True)


# Long layout; the columns used below are "repo_age_group", "channel" and
# "count".
channels_per_ag = channels_per_ag.reset_index()

# Descriptive statistics of the channel counts of each age bucket.
stats_per_ag = (
    channels_per_ag
        .groupby(group_keys)["count"]
        .apply(value_counts_stats)
).unstack(fill_value=0)

# Number of channels to list per bucket at either end of the ranking.
n = 3

# The n most frequent channels of each age bucket.
top_n_per_ag = (
    channels_per_ag
        .sort_values(["repo_age_group", "count"], ascending=[True, False])
        .groupby("repo_age_group", dropna=False, sort=False)
        .head(n)
        .reset_index(drop=True)
)

# The n least frequent channels of each age bucket (same ordering, taken
# from the end of each group).
bottom_n_per_ag = (
    channels_per_ag
        .sort_values(["repo_age_group", "count"], ascending=[True, False])
        .groupby("repo_age_group", dropna=False, sort=False)
        .tail(n)
        .reset_index(drop=True)
)

In [44]:
channels_matrix_per_ag

channel,blog,discord,facebook,fork,form,forum,github_discussions,github_issues,github_wiki,gitter,...,reddit,rss_feed,skype,slack,stack_overflow,telegram,twitter,website,youtube,zulip
repo_age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48,17,45,186,232,165,21,76,78,16,...,3,23,3,30,23,20,28,216,8,0
1,22,16,17,72,84,88,7,28,29,8,...,0,36,1,17,4,13,6,67,2,0
2,17,11,8,63,69,67,3,28,21,10,...,0,3,0,18,3,6,11,93,1,0
3,5,5,8,53,54,57,4,22,23,6,...,0,2,0,5,7,8,6,60,0,0
4,4,6,7,41,47,48,6,19,15,4,...,0,3,0,18,1,11,6,37,0,0
5,10,3,0,23,37,31,3,15,11,5,...,0,2,0,17,2,4,8,37,0,1
6,4,0,3,35,33,33,4,12,17,2,...,0,0,0,5,1,35,2,22,1,0
7,6,1,1,30,30,29,3,14,13,0,...,0,8,0,5,0,34,4,21,0,1
8,12,2,0,14,21,23,2,12,7,1,...,1,0,0,4,2,55,5,26,0,0
9,5,3,1,21,19,14,2,8,10,0,...,0,7,0,3,2,5,4,16,1,0


In [45]:
stats_per_ag

Unnamed: 0_level_0,n_categories,total_count,min_count,max_count,mean_count,median_count,q1_count,q3_count,iqr_count,top_1_count,top_1_share,top_5_share,top_10_share
repo_age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,32.0,2051.0,3.0,232.0,64.09375,29.0,10.0,81.0,71.0,232.0,0.113116,0.484642,0.760117
1,29.0,888.0,1.0,88.0,30.62069,18.0,7.0,43.0,36.0,88.0,0.099099,0.453829,0.739865
2,26.0,712.0,1.0,93.0,27.384615,17.5,8.5,44.5,36.0,93.0,0.130618,0.502809,0.769663
3,26.0,523.0,1.0,60.0,20.115385,8.5,5.0,38.0,33.0,60.0,0.114723,0.525813,0.808795
4,27.0,467.0,1.0,49.0,17.296296,10.0,4.0,30.0,26.0,49.0,0.104925,0.501071,0.788009
5,25.0,346.0,1.0,37.0,13.84,9.0,3.0,23.0,20.0,37.0,0.106936,0.488439,0.786127
6,26.0,334.0,1.0,35.0,12.846154,10.0,3.0,21.0,18.0,35.0,0.10479,0.482036,0.766467
7,25.0,314.0,1.0,34.0,12.56,8.0,3.0,21.0,18.0,34.0,0.10828,0.461783,0.767516
8,25.0,263.0,1.0,55.0,10.52,5.0,2.0,15.0,13.0,55.0,0.209125,0.56654,0.828897
9,25.0,208.0,1.0,21.0,8.32,7.0,3.0,14.0,11.0,21.0,0.100962,0.423077,0.716346


In [46]:
top_n_per_ag

Unnamed: 0,repo_age_group,channel,count
0,0,form,232
1,0,website,216
2,0,fork,186
3,1,forum,88
4,1,form,84
5,1,pull_request,78
6,2,website,93
7,2,form,69
8,2,forum,67
9,3,website,60


In [47]:
bottom_n_per_ag

Unnamed: 0,repo_age_group,channel,count
0,0,reddit,3
1,0,medium,3
2,0,skype,3
3,1,youtube,2
4,1,skype,1
5,1,patreon,1
6,2,github_discussions,3
7,2,rss_feed,3
8,2,youtube,1
9,3,patreon,4


### Per repo

In [48]:
group_keys = ["full_name_of_repo"]

# Channel counts within each repo.
channels_per_repo = (
    subjects
        .groupby(group_keys)[["detected_channels"]]
        .apply(vc_detected_channels)
)

# Wide layout built with unstack, indexed by repo name; missing combinations
# are filled with 0.
channels_matrix_per_repo = channels_per_repo.unstack(fill_value=0).reset_index()
channels_matrix_per_repo.set_index("full_name_of_repo", inplace=True)


# Long layout; the columns used below are "full_name_of_repo" and "count".
channels_per_repo = channels_per_repo.reset_index()

# Descriptive statistics of the channel counts of each repo.
stats_per_repo = (
    channels_per_repo
        .groupby(group_keys)["count"]
        .apply(value_counts_stats)
).unstack(fill_value=0)



In [49]:
stats_per_repo.sort_values("n_categories", ascending=False)

Unnamed: 0_level_0,n_categories,total_count,min_count,max_count,mean_count,median_count,q1_count,q3_count,iqr_count,top_1_count,top_1_share,top_5_share,top_10_share
full_name_of_repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
danieleteti/delphimvcframework,17.0,175.0,1.0,48.0,10.294118,7.0,3.0,9.0,6.0,48.0,0.274286,0.708571,0.914286
navikt/modiapersonoversikt,17.0,38.0,2.0,6.0,2.235294,2.0,2.0,2.0,0.0,6.0,0.157895,0.368421,0.631579
theworkingmen/idb,17.0,39.0,2.0,5.0,2.294118,2.0,2.0,2.0,0.0,5.0,0.128205,0.384615,0.641026
bvaughn/personal-logger,17.0,35.0,2.0,3.0,2.058824,2.0,2.0,2.0,0.0,3.0,0.085714,0.314286,0.600000
userfrosting/UserFrosting,17.0,113.0,2.0,19.0,6.647059,5.0,3.0,7.0,4.0,19.0,0.168142,0.566372,0.814159
...,...,...,...,...,...,...,...,...,...,...,...,...,...
tokuhirom/Amon,1.0,1.0,1.0,1.0,1.000000,1.0,1.0,1.0,0.0,1.0,1.000000,1.000000,1.000000
toleda/audio_CloverALC,1.0,2.0,2.0,2.0,2.000000,2.0,2.0,2.0,0.0,2.0,1.000000,1.000000,1.000000
mapbox/vtquery,1.0,1.0,1.0,1.0,1.000000,1.0,1.0,1.0,0.0,1.0,1.000000,1.000000,1.000000
cooler-SAI/MopCore547,1.0,1.0,1.0,1.0,1.000000,1.0,1.0,1.0,0.0,1.0,1.000000,1.000000,1.000000
