# Data Analysis

## Preparing The Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
def data(root: Path) -> Path:
    """Return the ``data`` directory located directly under *root*."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the ``output`` directory inside the data directory of *root*."""
    return data(root).joinpath("output")


def raw_results(root: Path) -> Path:
    """Return the path of ``raw_results/raw_results.csv`` under the data directory."""
    return data(root).joinpath("raw_results", "raw_results.csv")


def subjects(root: Path) -> Path:
    """Return the path of ``raw_results/subjects.csv`` under the data directory.

    NOTE: a later cell rebinds the name ``subjects`` to a DataFrame, after
    which this helper can no longer be called by that name.
    """
    return data(root).joinpath("raw_results", "subjects.csv")


def commits(root: Path) -> Path:
    """Return the path of ``raw_results/commits.csv`` under the data directory."""
    return data(root).joinpath("raw_results", "commits.csv")


def truth(root: Path) -> Path:
    """Return the path of ``truth/truth.csv`` under the data directory."""
    return data(root).joinpath("truth", "truth.csv")


In [3]:
# Load the raw results relative to the current working directory.
# low_memory=False makes pandas read the file in one pass instead of chunks.
raw_results_df = pd.read_csv(raw_results(Path.cwd()), low_memory=False)

In [4]:
# Work on a fresh frame without the columns listed below; names that are
# absent from the raw data are skipped silently.
df = raw_results_df.drop(
    columns=[
        "full_name",
        "clone_url",
        "git_url",
        "teams_url",
        "sha",
        "url",
        "message",
    ],
    errors="ignore",
)

In [5]:
def to_bool(series: pd.Series) -> pd.Series:
    """Convert a column of "true"/"false" text into booleans.

    A series that already has ``bool`` dtype is returned unchanged. Otherwise
    values are stringified, stripped and lower-cased; ``"true"`` maps to
    ``True``, ``"false"`` to ``False`` and everything else (including missing
    values) to NaN.
    """
    if series.dtype == bool:
        return series
    s = series.astype("string").str.strip().str.lower()
    mapping = {"true": True, "false": False}
    return s.map(mapping)


df["is_ccdc_event"] = to_bool(df["is_ccdc_event"])

# Report unparseable labels AFTER the conversion: the NaNs this warning talks
# about are produced by to_bool, so checking beforehand could never see them.
n_unparsed = int(df["is_ccdc_event"].isna().sum())
if n_unparsed:
    print(f"WARNING: {n_unparsed} rows have non-parseable is_ccdc_event -> NaN")

In [6]:
# Normalise the channel column to pandas' string dtype, with "" for missing.
detected_channel = df["detected_channel"].astype("string")
df["detected_channel"] = detected_channel.fillna("")

In [7]:
# One group per (repo, commit, path); dropna=False keeps groups whose key
# contains a missing value.
key_cols = ["full_name_of_repo", "commit_sha", "path"]
gb = df.groupby(by=key_cols, dropna=False)

def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse a group's channel values into a sorted tuple of distinct names.

    Missing values and strings that are empty after stripping are ignored.
    """
    distinct: set[str] = set()
    for value in x.astype("string").tolist():
        if not isinstance(value, str):
            continue  # missing value
        if value.strip():
            distinct.add(value)
    return tuple(sorted(distinct))

# Collapse each (repo, commit, path) group into a single row.
agg = gb.agg(
    is_ccdc_event=("is_ccdc_event", "first"),
    date=("date", "first"),
    detected_channels=("detected_channel", agg_channels),
)

# Carry over these columns (first value per group) when the raw data has them.
carried_columns = (
    "id",
    "homepage",
    "created_at",
    "pushed_at",
    "updated_at",
    "has_discussions",
    "has_issues",
    "has_pages",
    "has_projects",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "stargazers_count",
    "subscribers_count",
    "size",
)
for column in carried_columns:
    if column not in df.columns:
        continue
    agg[column] = gb[column].first()

# Turn the group keys back into ordinary columns and continue with the result.
df = agg = agg.reset_index()

In [8]:
# Parse timestamp columns as timezone-aware UTC datetimes; values that cannot
# be parsed become NaT (errors="coerce").
date_cols = ["created_at", "pushed_at", "updated_at", "date"]

for column in date_cols:
    if column not in df.columns:
        continue
    df[column] = pd.to_datetime(df[column], utc=True, errors="coerce")

In [9]:
# Per row, the earlier of the commit date and the repo's created_at.
rowwise_min = df[["date", "created_at"]].min(axis="columns")

# A repo's birthday is the earliest such timestamp over all of its rows.
per_repo_min = rowwise_min.groupby(df["full_name_of_repo"], dropna=False)
df["birthday"] = per_repo_min.transform("min")

In [10]:
# Repo age at commit time, as fractional days since the repo's birthday.
age_at_commit = df["date"] - df["birthday"]
df["age_in_days"] = age_at_commit.to_numpy() / np.timedelta64(1, "D")

In [11]:
AGE_GROUPS_YEARS = [
    (0, 1, "0-1"),
    (1, 2, "1-2"),
    (2, 3, "2-3"),
    (3, 4, "3-4"),
    (4, 5, "4-5"),
    (5, 6, "5-6"),
    (6, 7, "6-7"),
    (7, 8, "7-8"),
    (8, 9, "8-9"),
    (9, 10, "9-10"),
    (10, 11, "10-11"),
    (11, 12, "11-12"),
    (12, 13, "12-13"),
    (13, 14, "13-14"),
    (14, 15, "14-15"),
    (15, 999, "15+"),
]


def assign_age_group_and_index(age_in_days: float) -> tuple[str, int | None]:
    if pd.isna(age_in_days):
        return "unknown", None
    for lo, hi, label in AGE_GROUPS_YEARS:
        if lo * 365.25 <= age_in_days < hi * 365.25:
            return label, lo
    return "unknown", None


# Classify every row, then spread the (label, index) tuples over two columns.
age_group_pairs = df["age_in_days"].apply(assign_age_group_and_index)
df[["repo_age_group_at_commit", "repo_age_group_index"]] = age_group_pairs.apply(
    pd.Series
)

# Bucket sizes (displayed as the cell output).
df["repo_age_group_at_commit"].value_counts(dropna=False)

repo_age_group_at_commit
0-1      6397
1-2      2430
2-3      1995
3-4      1439
4-5      1214
5-6      1024
6-7       895
7-8       674
8-9       586
9-10      377
10-11     320
11-12     163
12-13     120
13-14      42
14-15      11
15+         7
Name: count, dtype: int64

In [12]:
# Subject-level frame: a copy of df without the columns listed below (they are
# aggregated per repo in a later cell). Absent columns are skipped silently.
subjects = df.drop(
    columns=[
        "id",
        "homepage",
        "pushed_at",
        "updated_at",
        "has_discussions",
        "has_issues",
        "has_pages",
        "has_projects",
        "has_wiki",
        "forks_count",
        "open_issues_count",
        "stargazers_count",
        "subscribers_count",
        "size",
        "birthday",
    ],
    errors="ignore",
)

In [13]:
# Order subjects by repo, then chronologically, then by sha/path as
# tie-breakers (ascending is the default for every key).
subjects = subjects.sort_values(
    by=["full_name_of_repo", "date", "commit_sha", "path"],
).reset_index(drop=True)

# Use the position in the sorted frame as the subject id. It is taken from
# `subjects` itself rather than `df`: the two indexes only coincide as long as
# both frames carry the same default RangeIndex.
subjects["subject_id"] = subjects.index
subjects = subjects.set_index("subject_id")

subjects.head(30)

Unnamed: 0_level_0,full_name_of_repo,commit_sha,path,is_ccdc_event,date,detected_channels,created_at,age_in_days,repo_age_group_at_commit,repo_age_group_index
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,05bit/peewee-async,fc8edc1d35cbda8477ceb4dd672d2ed115b18635,README.md,False,2014-09-27 22:15:28+00:00,(),2014-09-27 22:15:28+00:00,0.0,0-1,0
1,05bit/peewee-async,03d7da284e47d3018e071995e2d9cd7a8625f41e,README.md,True,2014-09-28 13:44:56+00:00,"(issues,)",2014-09-27 22:15:28+00:00,0.645463,0-1,0
2,05bit/peewee-async,53130218e08eaacf34f3d68338bb008fdfeb6c72,README.md,False,2014-09-28 13:49:37+00:00,(),2014-09-27 22:15:28+00:00,0.648715,0-1,0
3,05bit/peewee-async,c90ae2ca4a4d6cdac078e9918c53e9ea0ddcb632,README.md,False,2014-09-29 10:33:36+00:00,(),2014-09-27 22:15:28+00:00,1.512593,0-1,0
4,05bit/peewee-async,4db3f204d4ee60a91146e27d3d82b9edfc1086c2,README.md,False,2014-10-11 09:08:29+00:00,(),2014-09-27 22:15:28+00:00,13.453484,0-1,0
5,05bit/peewee-async,f546a0d2c41b2c29f718ec4f6459c44eedd00983,README.md,False,2014-10-11 12:07:44+00:00,(),2014-09-27 22:15:28+00:00,13.577963,0-1,0
6,05bit/peewee-async,95506301a3c0279812ee6802e30f1741435c5aca,README.md,False,2014-10-11 12:11:48+00:00,(),2014-09-27 22:15:28+00:00,13.580787,0-1,0
7,05bit/peewee-async,eead5b75cf1ef4c06cfce3d09184230b4e64ffa8,README.md,False,2014-10-11 14:38:03+00:00,(),2014-09-27 22:15:28+00:00,13.68235,0-1,0
8,05bit/peewee-async,7bce99a67f17685aeec8f756908f8ad19a2cd82e,README.md,False,2014-10-11 15:09:02+00:00,(),2014-09-27 22:15:28+00:00,13.703866,0-1,0
9,05bit/peewee-async,0b25d439c214c32d531928a5114593a6db7762b8,README.md,False,2014-10-11 15:39:33+00:00,(),2014-09-27 22:15:28+00:00,13.725058,0-1,0


In [14]:
# One row per repository: subject counts, positive counts/rate, and the first
# value per repo of each carried-over column.
repos = df.groupby("full_name_of_repo").agg(
    id=("id", "first"),
    n_subjects=("commit_sha", "count"),
    n_ccdc_events=("is_ccdc_event", lambda s: int(s.fillna(False).sum())),
    ccdc_rate=("is_ccdc_event", lambda s: float(s.fillna(False).mean())),
    birthday=("birthday", "first"),
    pushed_at=("pushed_at", "first"),
    updated_at=("updated_at", "first"),
    homepage=("homepage", "first"),
    has_discussions=("has_discussions", "first"),
    has_issues=("has_issues", "first"),
    has_pages=("has_pages", "first"),
    has_projects=("has_projects", "first"),
    has_wiki=("has_wiki", "first"),
    forks_count=("forks_count", "first"),
    open_issues_count=("open_issues_count", "first"),
    stargazers_count=("stargazers_count", "first"),
    subscribers_count=("subscribers_count", "first"),
    size=("size", "first"),
).reset_index()

# Last activity is the later of pushed_at and updated_at.
repos["last_activity"] = repos[["pushed_at", "updated_at"]].max(axis=1)

repos["age"] = repos["last_activity"] - repos["birthday"]

# Sanity check: every subject must be counted in exactly one repo row. Raised
# explicitly (not via `assert`) so it also runs under `python -O`, and with the
# two numbers in the message so a mismatch can be diagnosed.
expected = len(subjects)
actual = int(repos["n_subjects"].sum())
if expected != actual:
    raise AssertionError(
        f"ERROR: repos accounts for {actual} subjects "
        f"but the subjects frame has {expected} rows"
    )

## Descriptive Statistics

TODO: Remove

In [15]:
def summarize_subjects(g: pd.DataFrame) -> pd.Series:
    """Summarise a frame of subjects into one Series of named statistics.

    Reads the columns ``full_name_of_repo``, ``commit_sha``, ``path``,
    ``is_ccdc_event`` and ``detected_channels`` (tuples of channel names).

    Missing ``is_ccdc_event`` labels are counted as negatives, matching
    ``m_channel_pos_neg_breakdown``. The most/least popular path and channel
    are NaN when there is nothing to count, matching the ``m_*`` metrics.
    """
    n_subjects = len(g)

    # Go through the nullable "boolean" dtype so that missing labels become
    # False; `~` on an object column with None/NaN would raise or miscount.
    positives = g["is_ccdc_event"].astype("boolean").fillna(False).astype(bool)
    negatives = ~positives
    n_positives = int(positives.sum())
    n_negatives = int(negatives.sum())
    positive_rate = (n_positives / n_subjects) if n_subjects else np.nan

    # One row per detected channel; subjects without any channel drop out.
    channels = g["detected_channels"].explode().dropna()
    has_channel = g["detected_channels"] != ()
    no_channel = g["detected_channels"] == ()

    path_vc = g["path"].value_counts()
    channels_vc = channels.value_counts()

    def _label(counts: pd.Series, pick_max: bool) -> object:
        # idxmax/idxmin raise on an empty Series, so guard explicitly.
        if counts.empty:
            return np.nan
        return counts.idxmax() if pick_max else counts.idxmin()

    return pd.Series(
        {
            "n_repos": g["full_name_of_repo"].nunique(),
            "n_commits": g["commit_sha"].nunique(),
            "n_distinct_paths": g["path"].nunique(),
            "n_subjects": n_subjects,
            "n_positives": n_positives,
            "n_negatives": n_negatives,
            "positive_rate": positive_rate,
            "n_distinct_channels": len(channels.unique()),
            "n_positives_with_channel": int((positives & has_channel).sum()),
            "n_positives_without_channel": int((positives & no_channel).sum()),
            "n_negatives_with_channel": int((negatives & has_channel).sum()),
            "n_negatives_without_channel": int((negatives & no_channel).sum()),
            "most_popular_path": _label(path_vc, True),
            "least_popular_path": _label(path_vc, False),
            "most_frequently_detected_channel": _label(channels_vc, True),
            "least_commonly_detected_channel": _label(channels_vc, False),
        }
    )

In [16]:
from collections.abc import Callable, Sequence
from dataclasses import dataclass
import pandas as pd
import numpy as np

# Type alias for a metric callable: it takes a DataFrame and returns any object.
MetricFn = Callable[[pd.DataFrame], object]

def _require_cols(g: pd.DataFrame, cols: Sequence[str]) -> None:
    """Raise ``KeyError`` naming every entry of *cols* that *g* lacks."""
    missing = []
    for name in cols:
        if name not in g.columns:
            missing.append(name)
    if not missing:
        return
    raise KeyError(f"Missing required columns: {missing}")

# --- metrics ---
def m_n_repos(g: pd.DataFrame) -> int:
    """Count the distinct non-missing ``full_name_of_repo`` values in *g*."""
    _require_cols(g, ["full_name_of_repo"])
    repo_names = g["full_name_of_repo"]
    return int(repo_names.nunique())

def m_n_commits(g: pd.DataFrame) -> int:
    """Count the distinct non-missing ``commit_sha`` values in *g*."""
    _require_cols(g, ["commit_sha"])
    shas = g["commit_sha"]
    return int(shas.nunique())

def m_n_distinct_paths(g: pd.DataFrame) -> int:
    """Count the distinct non-missing ``path`` values in *g*."""
    _require_cols(g, ["path"])
    paths = g["path"]
    return int(paths.nunique())

def m_n_subjects(g: pd.DataFrame) -> int:
    """Return the number of rows (subjects) in *g*."""
    n_rows = g.shape[0]
    return int(n_rows)

def m_positive_rate(g: pd.DataFrame) -> float:
    """Return the sum of ``is_ccdc_event`` divided by the row count.

    An empty frame yields NaN instead of dividing by zero.
    """
    _require_cols(g, ["is_ccdc_event"])
    n_rows = len(g)
    if not n_rows:
        return np.nan
    n_positive = g["is_ccdc_event"].sum()
    return float(n_positive / n_rows)

def m_n_distinct_channels(g: pd.DataFrame) -> int:
    """Count the distinct channel names across all ``detected_channels`` entries."""
    _require_cols(g, ["detected_channels"])
    # explode() gives one row per channel; entries without channels become NaN.
    exploded = g["detected_channels"].explode()
    return int(exploded.dropna().nunique())

def m_most_popular_path(g: pd.DataFrame) -> object:
    """Return the most frequent ``path`` value, or NaN if there is none."""
    _require_cols(g, ["path"])
    counts = g["path"].value_counts()
    if counts.empty:
        return np.nan
    return counts.idxmax()

def m_least_popular_path(g: pd.DataFrame) -> object:
    """Return the least frequent ``path`` value, or NaN if there is none."""
    _require_cols(g, ["path"])
    counts = g["path"].value_counts()
    if counts.empty:
        return np.nan
    return counts.idxmin()

def m_most_frequently_detected_channel(g: pd.DataFrame) -> object:
    """Return the channel detected most often, or NaN if none was detected."""
    _require_cols(g, ["detected_channels"])
    # One row per detected channel; entries without channels drop out.
    counts = g["detected_channels"].explode().dropna().value_counts()
    if counts.empty:
        return np.nan
    return counts.idxmax()

def m_least_commonly_detected_channel(g: pd.DataFrame) -> object:
    """Return the channel detected least often, or NaN if none was detected."""
    _require_cols(g, ["detected_channels"])
    # One row per detected channel; entries without channels drop out.
    counts = g["detected_channels"].explode().dropna().value_counts()
    if counts.empty:
        return np.nan
    return counts.idxmin()

def m_channel_pos_neg_breakdown(g: pd.DataFrame) -> dict:
    """Cross-tabulate positive/negative subjects against with/without channel.

    Missing ``is_ccdc_event`` values count as negatives. Returns a dict with
    four integer counts; a subject "has a channel" when its
    ``detected_channels`` entry differs from the empty tuple.
    """
    _require_cols(g, ["is_ccdc_event", "detected_channels"])

    is_positive = g["is_ccdc_event"].fillna(False).astype(bool)
    is_negative = ~is_positive
    with_channel = g["detected_channels"] != ()
    without_channel = g["detected_channels"] == ()

    breakdown = {}
    breakdown["n_positives_with_channel"] = int((is_positive & with_channel).sum())
    breakdown["n_positives_without_channel"] = int((is_positive & without_channel).sum())
    breakdown["n_negatives_with_channel"] = int((is_negative & with_channel).sum())
    breakdown["n_negatives_without_channel"] = int((is_negative & without_channel).sum())
    return breakdown

# Registry of available metrics: name -> (metric function, columns it needs).
METRICS: dict[str, tuple[MetricFn, tuple[str, ...]]] = {
    "n_repos": (m_n_repos, ("full_name_of_repo",)),
    "n_commits": (m_n_commits, ("commit_sha",)),
    "n_distinct_paths": (m_n_distinct_paths, ("path",)),
    "n_subjects": (m_n_subjects, ()),
    "positive_rate": (m_positive_rate, ("is_ccdc_event",)),
    "n_distinct_channels": (m_n_distinct_channels, ("detected_channels",)),
    "most_popular_path": (m_most_popular_path, ("path",)),
    "least_popular_path": (m_least_popular_path, ("path",)),
    "most_frequently_detected_channel": (
        m_most_frequently_detected_channel,
        ("detected_channels",),
    ),
    "least_commonly_detected_channel": (
        m_least_commonly_detected_channel,
        ("detected_channels",),
    ),
    # Returns a dict, so this one entry contributes four output values.
    "channel_pos_neg_breakdown": (
        m_channel_pos_neg_breakdown,
        ("is_ccdc_event", "detected_channels"),
    ),
}

@dataclass(frozen=True)
class SummarizeConfig:
    """Immutable settings read by ``summarize_subjects_configurable``."""
    # Names of the METRICS entries to compute, in output order.
    include_metrics: tuple[str, ...]
    # Columns the caller groups by.
    group_keys: tuple[str, ...]
    # When True, metrics whose registered required columns include a group key are rejected.
    forbid_access_to_group_keys: bool = True
    # When True (and group_keys is non-empty), group-key columns are dropped before metrics run.
    drop_group_keys_from_frame: bool = True
    # Additional columns that metrics are not allowed to require.
    deny_columns: tuple[str, ...] = ()

def summarize_subjects_configurable(g: pd.DataFrame, cfg: SummarizeConfig) -> pd.Series:
    """Compute the metrics selected in *cfg* for the frame *g*.

    Metrics are looked up in ``METRICS`` and evaluated in the order given by
    ``cfg.include_metrics``. A metric that returns a dict contributes one
    entry per key; any other return value is stored under the metric's name.

    Raises:
        KeyError: if a requested metric is not registered.
        ValueError: if a metric's registered required columns include a
            denied column (or a group key while those are forbidden).
    """
    frame = g
    if cfg.drop_group_keys_from_frame and cfg.group_keys:
        frame = g.drop(columns=list(cfg.group_keys), errors="ignore")

    forbidden = set(cfg.deny_columns)
    if cfg.forbid_access_to_group_keys:
        forbidden |= set(cfg.group_keys)

    summary: dict[str, object] = {}
    for name in cfg.include_metrics:
        entry = METRICS.get(name)
        if entry is None:
            raise KeyError(f"Unknown metric: {name}. Known: {sorted(METRICS)}")
        metric, required = entry

        illegal_required = [column for column in required if column in forbidden]
        if illegal_required:
            raise ValueError(
                f"Metric '{name}' requires forbidden columns {illegal_required}. "
                f"Forbidden: {sorted(forbidden)}"
            )

        value = metric(frame)
        if isinstance(value, dict):
            summary.update(value)
            continue
        summary[name] = value
    return pd.Series(summary)

### Overall: headline counts and rates

In [17]:
# No grouping: summarise the whole subjects frame at once.
group_keys = []

cfg = SummarizeConfig(
    group_keys=tuple(group_keys),
    include_metrics=(
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "n_subjects",
        "positive_rate",
        "n_distinct_channels",
        "most_popular_path",
        "least_popular_path",
        "most_frequently_detected_channel",
        "least_commonly_detected_channel",
        "channel_pos_neg_breakdown",
    ),
)

overall_summary = summarize_subjects_configurable(subjects, cfg)
print(overall_summary)

n_repos                                         364
n_commits                                     17523
n_distinct_paths                                  5
n_subjects                                    17694
positive_rate                              0.204928
n_distinct_channels                              33
most_popular_path                         README.md
least_popular_path                  contributing.md
most_frequently_detected_channel               form
least_commonly_detected_channel               zulip
n_positives_with_channel                       1691
n_positives_without_channel                    1935
n_negatives_with_channel                       1909
n_negatives_without_channel                   12159
dtype: object


In [18]:
# A repo counts as active when its last activity falls in the year 2025.
is_active = repos["last_activity"].dt.year == 2025
active_repos = repos.loc[is_active].copy()
n_active_repos = len(active_repos)
print(f"How many repos are still active? {n_active_repos}")

How many repos are still active? 143


### Outliers – overall

In [19]:
# Values are computed before formatting: reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
earliest_birthday = repos["birthday"].min()
latest_birthday = repos["birthday"].max()
print(f"The repo with the earliest birthday? {earliest_birthday}")
print(f"The repo with the latest birthday? {latest_birthday}")

oldest_repo = repos.loc[repos["age"].idxmax(), "full_name_of_repo"]
print(f"The oldest repo? {oldest_repo}")

earliest_commit = subjects["date"].idxmin()
earliest_commit_repo = subjects.loc[earliest_commit, "full_name_of_repo"]
earliest_commit_sha = subjects.loc[earliest_commit, "commit_sha"]
print(f"Earliest commit? {earliest_commit_sha}@{earliest_commit_repo}")

# NOTE(review): despite the `latest_commit` name and the "Oldest commit?"
# label, this selects the subject with the largest age_in_days, i.e. the
# commit made when its repo was oldest. Confirm which of the three is meant.
latest_commit = subjects["age_in_days"].idxmax()
latest_commit_repo = subjects.loc[latest_commit, "full_name_of_repo"]
latest_commit_sha = subjects.loc[latest_commit, "commit_sha"]
print(f"Oldest commit? {latest_commit_sha}@{latest_commit_repo}")

path_counts = subjects["path"].value_counts()
print(f"Most popular path? {path_counts.idxmax()} with {path_counts.max()} counts")
print(f"Least popular path? {path_counts.idxmin()} with {path_counts.min()} counts")

# One row per detected channel; subjects without any channel drop out.
channel_counts = (
    subjects["detected_channels"]
    .explode()
    .dropna()
    .value_counts()
)
print(f"Most frequently detected channel? {channel_counts.idxmax()} with {channel_counts.max()} counts")
print(f"Least commonly recognized channel? {channel_counts.idxmin()} with {channel_counts.min()} counts")

The repo with the earliest birthday? 2008-08-04 11:09:52+00:00
The repo with the latest birthday? 2022-11-12 18:36:31+00:00
The oldest repo? gctools-outilsgc/gcconnex
Earliest commit? 77533b76fbc2f0fd72445f8f3afb5d5278d4f4aa@gctools-outilsgc/gcconnex
Oldest commit? 8f00f28a3fe1629d6fae66507ad9af30f7698f58@twilio/twilio-java
Most popular path? README.md with 16172 counts
Least popular path? contributing.md with 27 counts
Most frequently detected channel? form with 643 counts
Least commonly recognized channel? zulip with 2 counts


### Across all repos, over the years…

In [20]:
# Subjects annotated with the calendar year of their commit date.
subjects_y = subjects.assign(year=subjects["date"].dt.year)

group_keys = ["year"]

cfg = SummarizeConfig(
    group_keys=tuple(group_keys),
    include_metrics=(
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "n_subjects",
        "positive_rate",
        "n_distinct_channels",
        "most_popular_path",
        "least_popular_path",
        "most_frequently_detected_channel",
        "least_commonly_detected_channel",
        "channel_pos_neg_breakdown",
    ),
)

# The year is moved into the index and grouped on from there.
subjects_by_year = subjects_y.set_index("year").groupby(group_keys, dropna=False)
yearly_summary = (
    subjects_by_year
    .apply(lambda year_frame: summarize_subjects_configurable(year_frame, cfg))
    .reset_index()
    .sort_values("year")
)

# Number of repos whose birthday falls in each year.
repos_with_born_year = repos.assign(born_year=repos["birthday"].dt.year)
repos_born_per_year = (
    repos_with_born_year
    .groupby("born_year")
    .size()
    .rename("n_repos_born")
    .reset_index()
    .rename(columns={"born_year": "year"})
)

# Years in which no repo was born get a count of 0 after the left join.
yearly_summary = yearly_summary.merge(repos_born_per_year, on="year", how="left")
yearly_summary["n_repos_born"] = yearly_summary["n_repos_born"].fillna(0).astype(int)

# Distribution data for plots: how many channels per positive?
# -> one row per positive subject, with its year and n_channels.
is_positive = subjects_y["is_ccdc_event"] == True
positive_subjects = subjects_y.loc[is_positive, ["year", "detected_channels"]]
channels_per_positive = (
    positive_subjects
    .assign(n_channels=lambda d: d["detected_channels"].map(len).astype("int64"))
    .drop(columns=["detected_channels"])
    .reset_index(drop=True)
)

yearly_summary

Unnamed: 0,year,n_repos,n_commits,n_distinct_paths,n_subjects,positive_rate,n_distinct_channels,most_popular_path,least_popular_path,most_frequently_detected_channel,least_commonly_detected_channel,n_positives_with_channel,n_positives_without_channel,n_negatives_with_channel,n_negatives_without_channel,n_repos_born
0,2008,1,6,1,6,0.166667,3,README.txt,README.txt,website,mail,1,0,2,3,1
1,2009,2,34,2,34,0.117647,3,README.md,README.txt,website,mail,1,3,3,27,4
2,2010,6,58,2,59,0.152542,11,README.md,README.txt,form,jira,8,1,15,35,6
3,2011,9,51,2,52,0.173077,9,README.md,README.txt,form,linkedin,5,4,13,30,4
4,2012,26,244,4,246,0.231707,15,README.md,CONTRIBUTING.md,form,google_group,18,39,15,174,28
5,2013,71,643,4,644,0.256211,20,README.md,CONTRIBUTING.md,website,github_wiki,90,75,67,412,44
6,2014,123,1441,4,1459,0.210418,27,README.md,readme.md,website,medium,146,161,169,983,72
7,2015,175,1919,5,1941,0.210716,23,README.md,contributing.md,fork,gitter,182,227,216,1316,66
8,2016,219,2910,5,2935,0.191482,29,README.md,contributing.md,fork,skype,265,297,296,2077,68
9,2017,242,2549,5,2571,0.196033,29,README.md,contributing.md,form,reddit,219,285,216,1851,53


### Repo specific boxplots

- Birthday
- Age – max(pushed_at, updated_at) - birthday
- How many commits?
- How many distinct paths?
- How many subjects?
- How many positives?
- How many negatives?
- What is the rate of positives?
- How many distinct channels?
- How many subjects without a channel?
- How many positives without a channel?
- How many negatives with a channel?
- How many channels per positive? Boxplot, Histogram

In [21]:
# Per-repo summary. "n_repos" is left out: it requires the group key column,
# which SummarizeConfig forbids by default.
group_keys = ["full_name_of_repo"]

cfg = SummarizeConfig(
    group_keys=tuple(group_keys),
    include_metrics=(
        "n_commits",
        "n_distinct_paths",
        "n_subjects",
        "positive_rate",
        "n_distinct_channels",
        "most_popular_path",
        "least_popular_path",
        "most_frequently_detected_channel",
        "least_commonly_detected_channel",
        "channel_pos_neg_breakdown",
    ),
)

# The repo name is moved into the index and grouped on from there.
subjects_by_repo = (
    subjects
    .set_index("full_name_of_repo")
    .groupby(group_keys, dropna=False)
)
repo_summary = (
    subjects_by_repo
    .apply(lambda repo_frame: summarize_subjects_configurable(repo_frame, cfg))
    .reset_index()
    .sort_values("full_name_of_repo")
)

repo_summary

Unnamed: 0,full_name_of_repo,n_commits,n_distinct_paths,n_subjects,positive_rate,n_distinct_channels,most_popular_path,least_popular_path,most_frequently_detected_channel,least_commonly_detected_channel,n_positives_with_channel,n_positives_without_channel,n_negatives_with_channel,n_negatives_without_channel
0,05bit/peewee-async,49,1,49,0.224490,4,README.md,README.md,form,ping_on_github,5,6,4,34
1,15HCB2-Wind/DoAn-PT-HTTT-HD,1,1,1,0.000000,0,README.md,README.md,,,0,0,0,1
2,4Catalyzer/graphql-validation-complexity,11,1,11,0.090909,0,README.md,README.md,,,0,1,0,10
3,4pr0n/ripme,20,1,20,0.400000,6,README.md,README.md,website,twitter,7,1,6,6
4,A3Wasteland/ArmA3_Wasteland.Altis,12,1,12,0.083333,0,README.md,README.md,,,0,1,0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,yola/yolapy,6,1,6,0.166667,1,README.md,README.md,mail,mail,0,1,1,4
360,zdavatz/oddb.org,32,2,33,0.151515,2,README.md,README.txt,mail,ping_on_github,2,3,3,25
361,zendesk/sunshine-conversations-web,57,1,57,0.543860,8,README.md,README.md,website,linkedin,13,18,1,25
362,zenhack/haskell-capnp,58,2,59,0.237288,5,README.md,CONTRIBUTING.md,form,pull_request,7,7,5,40


### Within a repo, over the years (by repo age group)…

In [22]:
# Summary per repo age group. The numeric index is grouped on alongside the
# label and dropped from the result afterwards.
group_keys = ["repo_age_group_index", "repo_age_group_at_commit"]

cfg = SummarizeConfig(
    group_keys=tuple(group_keys),
    include_metrics=(
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "n_subjects",
        "positive_rate",
        "n_distinct_channels",
        "most_popular_path",
        "least_popular_path",
        "most_frequently_detected_channel",
        "least_commonly_detected_channel",
        "channel_pos_neg_breakdown",
    ),
)

# Both keys are moved into the index and grouped on from there.
subjects_by_age_group = (
    subjects
    .set_index(group_keys)
    .groupby(group_keys, dropna=False)
)
repo_age_group_summary = (
    subjects_by_age_group
    .apply(lambda group_frame: summarize_subjects_configurable(group_frame, cfg))
    .reset_index()
    .drop(columns=["repo_age_group_index"])
)

repo_age_group_summary

Unnamed: 0,repo_age_group_at_commit,n_repos,n_commits,n_distinct_paths,n_subjects,positive_rate,n_distinct_channels,most_popular_path,least_popular_path,most_frequently_detected_channel,least_commonly_detected_channel,n_positives_with_channel,n_positives_without_channel,n_negatives_with_channel,n_negatives_without_channel
0,0-1,339,6335,5,6397,0.205878,32,README.md,contributing.md,form,reddit,562,755,627,4453
1,1-2,251,2421,4,2430,0.212346,29,README.md,README.txt,forum,skype,236,280,262,1652
2,2-3,216,1978,4,1995,0.202506,26,README.md,README.txt,website,youtube,198,206,206,1385
3,3-4,211,1423,5,1439,0.1918,26,README.md,README.txt,website,google_group,129,147,151,1012
4,4-5,173,1198,5,1214,0.215815,27,README.md,README.txt,issues,stack_overflow,132,130,123,829
5,5-6,138,1020,5,1024,0.230469,25,README.md,contributing.md,website,patreon,105,131,93,695
6,6-7,124,880,4,895,0.175419,26,README.md,README.txt,fork,medium,81,76,116,622
7,7-8,114,666,5,674,0.183976,25,README.md,contributing.md,telegram,facebook,59,65,113,437
8,8-9,92,578,3,586,0.191126,25,README.md,readme.md,telegram,medium,70,42,103,371
9,9-10,82,364,3,377,0.238727,25,README.md,readme.md,fork,facebook,47,43,47,240


### Further Path analysis

- For each and every distinct path incl. suffix:
    - overall
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - Most often detected channel
        - Rarest channel
        - How many channels per positive?
    - per repo
        - How many subjects? Boxplot
        - How many positives? Boxplot
        - How many negatives? Boxplot
        - What is the rate of positives? Boxplot
        - How many distinct channels? Boxplot
        - How many subjects without a channel? Boxplot
        - How many positives without a channel? Boxplot
        - How many negatives with a channel? Boxplot
        - How many channels per positive? Boxplot
    - per year
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - How many channels per positive?
    - per repo age group
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - How many channels per positive?

- for each path suffix (e.g., ".txt", ".md"):
    - per year:
        - How many subjects? – popularity of txt vs. md

### Further Channel analysis

- overall:
    - top N most often detected channels
    - top N rarest channels
- per year:
    - top N most often detected channels – race of the channels…
    - top N rarest channels
- per repo age group:
    - top N most often detected channels
    - top N rarest channels
- for each distinct channel:
    - first time
    - last time
    - How many repos?
    - How many subjects?
    - …

### Correlations?

- What indicates a higher ccdc_rate?
    - The year?
    - The age of the repo?
    - The number of channels in use??
    - The ccdc_rate itself???
    - The number of channels detected for that subject?