# Data Analysis

## Global Imports

In [1]:
import numpy as np
import pandas as pd

## Functions

### Data access functions

In [2]:
from pathlib import Path


def data(root: Path) -> Path:
    """Return the ``data`` directory located directly under *root*."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the ``data/output`` directory under *root*.

    The path is spelled out instead of delegating to the module-level
    ``data`` helper: this notebook later rebinds the name ``data`` to a
    DataFrame, after which ``data(root)`` would raise ``TypeError``.
    """
    return root / "data" / "output"


def raw_results(root: Path) -> Path:
    """Return the path of ``data/raw_results/raw_results.csv`` under *root*.

    The path is spelled out instead of delegating to the module-level
    ``data`` helper: this notebook later rebinds the name ``data`` to a
    DataFrame, after which ``data(root)`` would raise ``TypeError``.
    """
    return root / "data" / "raw_results" / "raw_results.csv"


def subjects(root: Path) -> Path:
    """Return the path of ``data/raw_results/subjects.csv`` under *root*.

    The path is spelled out instead of delegating to the module-level
    ``data`` helper: this notebook later rebinds the name ``data`` to a
    DataFrame, after which ``data(root)`` would raise ``TypeError``.

    NOTE: the notebook also rebinds the name ``subjects`` itself to a
    DataFrame further down, so this helper is only reachable by name
    before that cell has run.
    """
    return root / "data" / "raw_results" / "subjects.csv"


def commits(root: Path) -> Path:
    """Return the path of ``data/raw_results/commits.csv`` under *root*.

    The path is spelled out instead of delegating to the module-level
    ``data`` helper: this notebook later rebinds the name ``data`` to a
    DataFrame, after which ``data(root)`` would raise ``TypeError``.
    """
    return root / "data" / "raw_results" / "commits.csv"


def truth(root: Path) -> Path:
    """Return the path of ``data/truth/truth.csv`` under *root*.

    The path is spelled out instead of delegating to the module-level
    ``data`` helper: this notebook later rebinds the name ``data`` to a
    DataFrame, after which ``data(root)`` would raise ``TypeError``.
    """
    return root / "data" / "truth" / "truth.csv"


### Function to summarize DataFrame

In [3]:
from collections.abc import Callable, Sequence
from dataclasses import dataclass


# Signature shared by all metric functions: a DataFrame in, one summary value out.
MetricFn = Callable[[pd.DataFrame], object]


def _require_cols(g: pd.DataFrame, cols: Sequence[str]) -> None:
    missing = [c for c in cols if c not in g.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")


# ---------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------
def m_n_subjects(g: pd.DataFrame) -> int:
    """Return the number of rows in *g*."""
    n_rows, _ = g.shape
    return int(n_rows)


def m_n_repos(g: pd.DataFrame) -> int:
    """Return the number of distinct ``full_name_of_repo`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "full_name_of_repo"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_commits(g: pd.DataFrame) -> int:
    """Return the number of distinct ``commit_sha`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "commit_sha"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_distinct_paths(g: pd.DataFrame) -> int:
    """Return the number of distinct ``path`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "path"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_positive_rate(g: pd.DataFrame) -> float:
    """Return the sum of ``is_ccdc_event`` divided by the number of rows in *g*.

    Returns NaN for an empty frame. Raises ``KeyError`` (via
    ``_require_cols``) if the column is absent.

    NOTE(review): the denominator is the full row count, so rows with a
    missing flag presumably count toward it but not toward the numerator —
    confirm this is intended.
    """
    _require_cols(g, ["is_ccdc_event"])
    n_rows = len(g)
    if n_rows == 0:
        return np.nan
    return float(g["is_ccdc_event"].sum() / n_rows)


def m_n_distinct_channels(g: pd.DataFrame) -> int:
    """Return the number of distinct channels across ``detected_channels``.

    Each cell is exploded into one row per element, missing entries are
    discarded, and the remaining distinct values are counted. Raises
    ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    _require_cols(g, ["detected_channels"])
    flattened = g["detected_channels"].explode()
    present = flattened.dropna()
    return int(present.nunique())


# ---------------------------------------------------------------------
# Registry: metric_name -> (fn, required_columns)
# ---------------------------------------------------------------------
# Each entry pairs the metric function with the columns it is registered as
# needing; summarize_subjects_configurable checks the latter against its
# forbidden-column set before calling the function.
METRICS: dict[str, tuple[MetricFn, tuple[str, ...]]] = {
    # Row count; registered with no required column.
    "n_subjects": (m_n_subjects, ()),
    "n_repos": (m_n_repos, ("full_name_of_repo",)),
    "n_commits": (m_n_commits, ("commit_sha",)),
    "n_distinct_paths": (m_n_distinct_paths, ("path",)),
    "positive_rate": (m_positive_rate, ("is_ccdc_event",)),
    "n_distinct_channels": (
        m_n_distinct_channels,
        ("detected_channels",),
    ),
}


# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
@dataclass(frozen=True)
class SummarizeConfig:
    """Immutable settings consumed by ``summarize_subjects_configurable``."""

    # Names of the metrics to compute, in output order; each must be a key of METRICS.
    include_metrics: tuple[str, ...]
    # Columns the caller grouped by.
    group_keys: tuple[str, ...] = ()
    # When True, the group keys are added to the set of forbidden columns.
    forbid_access_to_group_keys: bool = True
    # When True and group_keys is non-empty, those columns are dropped from the
    # frame before any metric runs.
    drop_group_keys_from_frame: bool = True
    # Further columns that no requested metric may list as required.
    deny_columns: tuple[str, ...] = ()


# ---------------------------------------------------------------------
# Core summarization
# ---------------------------------------------------------------------
def summarize_subjects_configurable(
    g: pd.DataFrame,
    cfg: SummarizeConfig,
) -> pd.Series:
    """Compute the metrics requested in *cfg* for the frame *g*.

    Returns a Series with one entry per name in ``cfg.include_metrics``,
    in that order.

    Raises
    ------
    KeyError
        If a requested metric is not registered in ``METRICS``.
    ValueError
        If a requested metric is registered as requiring a column that is
        forbidden (``cfg.deny_columns``, plus the group keys when
        ``cfg.forbid_access_to_group_keys`` is set).
    """
    # Optionally hide the grouping columns from the metric functions.
    if cfg.drop_group_keys_from_frame and cfg.group_keys:
        frame = g.drop(columns=list(cfg.group_keys), errors="ignore")
    else:
        frame = g

    blocked = set(cfg.deny_columns)
    if cfg.forbid_access_to_group_keys:
        blocked.update(cfg.group_keys)

    results: dict[str, object] = {}

    for metric_name in cfg.include_metrics:
        entry = METRICS.get(metric_name)
        if entry is None:
            raise KeyError(
                f"Unknown metric: {metric_name}. "
                f"Known: {sorted(METRICS)}"
            )

        metric_fn, needed = entry

        clashes = [column for column in needed if column in blocked]
        if clashes:
            raise ValueError(
                f"Metric '{metric_name}' requires forbidden columns {clashes}. "
                f"Forbidden: {sorted(blocked)}"
            )

        results[metric_name] = metric_fn(frame)

    return pd.Series(results)

### Function to cast columns of DataFrame to Int64

In [4]:
from collections.abc import Iterable

def col_to_int(df: pd.DataFrame, int_cols: Iterable[str]) -> pd.DataFrame:
    """Cast the columns of *df* named in *int_cols* to nullable ``Int64``.

    Names that are not columns of *df* are ignored. The frame is modified
    in place and also returned.
    """
    present = [name for name in int_cols if name in df.columns]
    converted = df[present].astype("Int64")
    df[present] = converted
    return df

#### "Popular" Int64 columns

In [5]:
# Count-valued metrics of the summary frames; passed to col_to_int so they
# are cast to Int64 ("positive_rate" is a ratio and is deliberately absent).
INT_COLS_OF_SUMMARY = [
    "n_subjects",
    "n_repos",
    "n_commits",
    "n_distinct_paths",
    "n_distinct_channels",
]

# Columns of a value_counts_stats() frame that are cast to Int64 by col_to_int.
# NOTE(review): the quantile-based columns (median/q1/q3/iqr) can be fractional
# for other data, in which case the Int64 cast would presumably fail — confirm
# before reusing this list on a different dataset.
INT_COLS_OF_VC_STATS = [
    "n_categories",
    "total_count",
    "min_count",
    "max_count",
    "median_count",
    "q1_count",
    "q3_count",
    "iqr_count",
    # A single category's count is always a whole number, so show it as one.
    "top_1_count",
]

### Boxplot stats function

**Includes the arithmetic mean.**

In [6]:
def boxplot_stats(s: pd.Series) -> pd.Series:
    """Return the box-plot statistics (plus the mean) of *s*.

    Missing values are dropped first. The whiskers are the most extreme
    observations still inside the 1.5 * IQR fences; observations beyond a
    whisker are counted as outliers.
    """
    values = s.dropna()

    q1, median, q3 = (values.quantile(q) for q in (0.25, 0.50, 0.75))
    iqr = q3 - q1

    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    lower_whisker = values[values >= lower_fence].min()
    upper_whisker = values[values <= upper_fence].max()

    below = values < lower_whisker
    above = values > upper_whisker
    n_outliers = (below | above).sum()

    stats = {
        "q1": q1,
        "median": median,
        "mean": values.mean(),
        "q3": q3,
        "iqr": iqr,
        "lower_whisker": lower_whisker,
        "upper_whisker": upper_whisker,
        "n_outliers": n_outliers,
        "min": values.min(),
        "max": values.max(),
        "n": len(values),
    }
    return pd.Series(stats)


### Function to get descriptive statistics for a given value_counts() result

In [7]:
def value_counts_stats(vc: pd.Series) -> pd.Series:
    """
    Descriptive statistics for a value_counts() result.

    Parameters
    ----------
    vc : pd.Series
        Output of value_counts(): index = category, values = counts.
        The counts may arrive in any order; they are ranked internally.

    Returns
    -------
    pd.Series
        Descriptive statistics of the distribution of counts. An empty
        float64 Series is returned when *vc* has no non-missing counts.
    """
    vc = vc.dropna()

    if vc.empty:
        return pd.Series(dtype="float64")

    # The "top k" figures below are positional, so enforce descending order
    # here instead of trusting the caller (e.g. value_counts(sort=False)).
    ranked = vc.sort_values(ascending=False)

    total = ranked.sum()
    n_categories = len(ranked)

    q1 = ranked.quantile(0.25)
    q2 = ranked.quantile(0.50)
    q3 = ranked.quantile(0.75)
    iqr = q3 - q1

    top_1 = ranked.iloc[0]
    # Slicing past the end is safe: with fewer than k categories the slice
    # simply covers all of them, so the share becomes 1.
    top_5_sum = ranked.iloc[:5].sum()
    top_10_sum = ranked.iloc[:10].sum()

    return pd.Series({
        # structure
        "n_categories": n_categories,
        "total_count": total,

        # distribution of counts
        "min_count": ranked.min(),
        "max_count": ranked.max(),
        "mean_count": ranked.mean(),
        "median_count": q2,
        "q1_count": q1,
        "q3_count": q3,
        "iqr_count": iqr,

        # concentration / dominance
        "top_1_count": top_1,
        "top_1_share": top_1 / total,
        "top_5_share": top_5_sum / total,
        "top_10_share": top_10_sum / total,
    })

## Data Preparation

### Reading the raw data

In [8]:
# The current working directory is used as the project root, so the notebook
# expects data/raw_results/raw_results.csv relative to where it is run.
raw_data = pd.read_csv(
    raw_results(Path.cwd()),
    low_memory=False,
)

### Working with a copy of the raw data

In [9]:
# Keep raw_data untouched; all further preparation happens on this copy.
# NOTE: this rebinds the name `data`, shadowing the `data()` path helper
# defined in the data access cell.
data = raw_data.copy()

### Dropping unnecessary columns / keeping interesting ones

I have decided not to analyse the snapshot data of a repo, because it only reflects the state of the repo at the moment I gathered the data. If a repo "has GitHub Discussions", I have no idea since when that has been the case. I could end up drawing conclusions from a feature that was enabled only the day before I pulled the data.

In [10]:
# Columns kept for the analysis; everything else in the raw data is dropped.
COLS_OF_INTEREST = [
    "full_name_of_repo",
    "commit_sha",
    "path",
    "is_ccdc_event",
    "detected_channel",
    "created_at",
    "pushed_at",
    "updated_at",
    "date",
]

# Every column currently in the frame that is not explicitly of interest.
UNNECESSARY_COLS = [
    name for name in data.columns if name not in COLS_OF_INTEREST
]

data.drop(columns=UNNECESSARY_COLS, inplace=True)

### Data conversion – if needed

In [11]:
def to_bool(series: pd.Series) -> pd.Series:
    """Map textual ``true``/``false`` values of *series* to booleans.

    A Series that already has dtype ``bool`` is returned unchanged.
    Otherwise values are compared case-insensitively with surrounding
    whitespace removed; anything other than "true" or "false" has no
    entry in the lookup table and therefore maps to a missing value.
    """
    if series.dtype == bool:
        return series

    normalized = series.astype("string").str.strip().str.lower()
    return normalized.map({"true": True, "false": False})


# Normalise the event flag; textual values other than "true"/"false" become missing.
data["is_ccdc_event"] = to_bool(data["is_ccdc_event"])

# Represent "no channel detected" as an empty string instead of a missing value.
data["detected_channel"] = data["detected_channel"].astype("string").fillna("")

DATETIME_COLS = ["created_at", "pushed_at", "updated_at", "date"]

# Parse timestamps as UTC; errors="coerce" turns unparseable values into NaT
# instead of raising.
for col in DATETIME_COLS:
    if col in data.columns:
        data[col] = pd.to_datetime(data[col], errors="coerce", utc=True)


### Channel aggregation – Introducing the subjects dataframe

In [12]:
# Columns that together identify one row of the subjects frame built below.
KEY_COLS = [
    "full_name_of_repo",
    "commit_sha",
    "path",
]

# dropna=False keeps rows whose key columns contain missing values.
gb = data.groupby(KEY_COLS, dropna=False)


def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse the channel values of one group into a sorted, de-duplicated tuple.

    Missing values and strings that are empty or whitespace-only are left out.
    """
    distinct = {
        value
        for value in x.astype("string").tolist()
        if isinstance(value, str) and value.strip() != ""
    }
    return tuple(sorted(distinct))


# One row per key combination: take the group's event flag via "first" and
# collapse all of its detected channels into one sorted tuple.
agg = gb.agg(
    is_ccdc_event=("is_ccdc_event", "first"),
    detected_channels=("detected_channel", agg_channels),
)

# Carry over the remaining columns of interest (the timestamp columns),
# again using the "first" aggregation per group.
for col in COLS_OF_INTEREST:
    if col not in KEY_COLS and col not in agg.columns and col != "detected_channel":
        agg[col] = gb[col].first()

# NOTE: this rebinds the name `subjects`, shadowing the `subjects()` path
# helper defined in the data access cell.
subjects = agg.reset_index()

### Adding "first_activity" column

The first activity of a repo is the earliest timestamp on record for it: either its earliest commit or its official creation date, whichever comes first.

In [13]:
# Step 1: per row, the earlier of commit date and repo creation date.
# Step 2: per repo, the minimum of those values, broadcast back to every row
# of that repo via transform.
subjects["first_activity"] = (
    subjects[["date", "created_at"]]
        .min(axis=1)
        .groupby(subjects["full_name_of_repo"], dropna=False)
        .transform("min")
)

### Adding "repo_age" column (age of the repo in days at commit time)

In [14]:
# Time between the repo's first activity and this commit, expressed in
# (fractional) days by dividing the timedelta by one day.
subjects["repo_age"] = (
    (subjects["date"] - subjects["first_activity"]).to_numpy()
    / np.timedelta64(1, "D")
)

# Order each repo's rows by age and renumber the index from 0.
subjects.sort_values(["full_name_of_repo", "repo_age"], inplace=True)
subjects.reset_index(drop=True, inplace=True)

### Adding "repo_age_group" column

In [15]:
AGE_GROUPS = [
    (0, 1, "0-1"),
    (1, 2, "1-2"),
    (2, 3, "2-3"),
    (3, 4, "3-4"),
    (4, 5, "4-5"),
    (5, 6, "5-6"),
    (6, 7, "6-7"),
    (7, 8, "7-8"),
    (8, 9, "8-9"),
    (9, 10, "9-10"),
    (10, 11, "10-11"),
    (11, 12, "11-12"),
    (12, 13, "12-13"),
    (13, 14, "13-14"),
    (14, 15, "14-15"),
    (15, 999, "15+"),
]


def assign_age_group(age_in_days: float) -> int | None:
    if pd.isna(age_in_days):
        return None
    for lo, hi, label in AGE_GROUPS:
        if lo * 365.25 <= age_in_days < hi * 365.25:
            return lo
    return None


# Bucket every commit by the age (in whole years) its repo had at commit time.
subjects["repo_age_group"] = subjects["repo_age"].apply(assign_age_group)

### Adding "year" column for temporal analysis

In [16]:
# Calendar year of the commit timestamp.
subjects["year"] = subjects["date"].dt.year

### Extracting a dedicated repos dataframe

In [17]:
# One row per repo, carrying the repo-level timestamps.
# NOTE(review): this groupby uses the default dropna, whereas the earlier
# groupings on full_name_of_repo pass dropna=False — confirm that rows with a
# missing repo name are meant to be excluded here.
repos = (
    subjects
        .groupby("full_name_of_repo", as_index=False)
        .agg(
            created_at=("created_at", "first"),
            pushed_at=("pushed_at", "first"),
            updated_at=("updated_at", "first"),
            first_activity=("first_activity", "first"),
        )
)

# These columns now live in `repos`, so remove them from the per-commit frame.
subjects.drop(
    columns=["created_at", "pushed_at", "updated_at", "first_activity"],
    inplace=True,
)

### Adding "last_activity" and "age_today" columns to repos dataframe

In [18]:
# Last activity: the later of the pushed_at and updated_at timestamps.
repos["last_activity"] = repos[["pushed_at", "updated_at"]].max(axis=1)

# Span between a repo's first and last activity.
repos["age_today"] = repos["last_activity"] - repos["first_activity"]

# Longest-lived repos first.
repos.sort_values("age_today", ascending=False, inplace=True)
repos.reset_index(drop=True, inplace=True)

## Data Overview

### How many of this and that?

In [19]:
# No grouping: all metrics are computed once over the complete subjects frame.
group_keys = []
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# Turn the resulting Series into a one-row frame (one column per metric).
summary = summarize_subjects_configurable(subjects, cfg).to_frame().T

summary = col_to_int(summary, INT_COLS_OF_SUMMARY)

summary

Unnamed: 0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
0,17694,364,17523,5,0.204928,33


### Earliest commit and latest commit

In [20]:
# Earliest commit: the row with the smallest commit timestamp. The date is
# read from that same row so that sha, repo and date always belong together.
earliest_commit = subjects["date"].idxmin()
earliest_commit_repo = subjects.loc[earliest_commit, "full_name_of_repo"]
earliest_commit_sha = subjects.loc[earliest_commit, "commit_sha"]
earliest_commit_date = subjects.loc[earliest_commit, "date"]
print(f"Earliest commit? {earliest_commit_sha}@{earliest_commit_repo} at {earliest_commit_date}")

# Latest commit: the row with the largest commit timestamp. Selecting it via
# "repo_age" would instead pick the commit made at the greatest repo age,
# which need not be the most recent one.
latest_commit = subjects["date"].idxmax()
latest_commit_repo = subjects.loc[latest_commit, "full_name_of_repo"]
latest_commit_sha = subjects.loc[latest_commit, "commit_sha"]
latest_commit_date = subjects.loc[latest_commit, "date"]
print(f"Latest commit? {latest_commit_sha}@{latest_commit_repo} at {latest_commit_date}")

Earliest commit? 77533b76fbc2f0fd72445f8f3afb5d5278d4f4aa@gctools-outilsgc/gcconnex at 2008-08-04 11:09:52+00:00
Oldest commit? 8f00f28a3fe1629d6fae66507ad9af30f7698f58@twilio/twilio-java at 2025-12-30 15:55:36+00:00


### Subjects grouped by year

In [21]:
group_keys = ["year"]

cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# "year" is moved into the index before grouping on it by name, so it is not
# a regular column of the frame when the groups are summarized.
# dropna=True leaves out rows without a year.
over_the_years = (
    subjects
        .set_index("year")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

over_the_years = col_to_int(over_the_years, INT_COLS_OF_SUMMARY)

over_the_years

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008,6,1,6,1,0.166667,3
2009,34,2,34,2,0.117647,3
2010,59,6,58,2,0.152542,11
2011,52,9,51,2,0.173077,9
2012,246,26,244,4,0.231707,15
2013,644,71,643,4,0.256211,20
2014,1459,123,1441,4,0.210418,27
2015,1941,175,1919,5,0.210716,23
2016,2935,219,2910,5,0.191482,29
2017,2571,242,2549,5,0.196033,29


### Subjects grouped by repo

In [22]:
group_keys = ["full_name_of_repo"]

# "n_repos" is left out: it is registered as requiring the grouping column,
# which the default config forbids.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The grouping column is moved into the index before grouping on it by name.
repo_summaries = (
    subjects
        .set_index("full_name_of_repo")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

repo_summaries = col_to_int(repo_summaries, INT_COLS_OF_SUMMARY)
# Repos with the most subjects first.
repo_summaries.sort_values("n_subjects", ascending=False, inplace=True)

repo_summaries

Unnamed: 0_level_0,n_subjects,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
full_name_of_repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
spencermountain/compromise,570,565,2,0.150877,10
flexera-public/policy_templates,506,502,2,0.150198,9
plotly/plotly.js,367,365,2,0.130790,12
apache/shardingsphere,348,344,2,0.212644,15
danieleteti/delphimvcframework,331,331,1,0.190332,17
...,...,...,...,...,...
lessworks/translation,1,1,1,0.000000,1
mackmobile/Contador,1,1,1,0.000000,0
hwdtech/smartforms,1,1,1,0.000000,0
minio/minio-boshrelease,1,1,1,1.000000,2


### Subjects grouped by repo age group

In [23]:
group_keys = ["repo_age_group"]

cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The grouping column is moved into the index before grouping on it by name.
# dropna=True leaves out rows that were assigned no age group.
repo_age_group_summaries = (
    subjects
        .set_index("repo_age_group")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

repo_age_group_summaries = col_to_int(repo_age_group_summaries, INT_COLS_OF_SUMMARY)

repo_age_group_summaries

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,n_distinct_paths,positive_rate,n_distinct_channels
repo_age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6397,339,6335,5,0.205878,32
1,2430,251,2421,4,0.212346,29
2,1995,216,1978,4,0.202506,26
3,1439,211,1423,5,0.1918,26
4,1214,173,1198,5,0.215815,27
5,1024,138,1020,5,0.230469,25
6,895,124,880,4,0.175419,26
7,674,114,666,5,0.183976,25
8,586,92,578,3,0.191126,25
9,377,82,364,3,0.238727,25


### Subjects grouped by path

In [24]:
group_keys = ["path"]

# "n_distinct_paths" is left out: it is registered as requiring the grouping
# column, which the default config forbids.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The grouping column is moved into the index before grouping on it by name.
path_summaries = (
    subjects
        .set_index("path")
        .groupby(group_keys, dropna=True)
        .apply(lambda g: summarize_subjects_configurable(g, cfg))
)

path_summaries = col_to_int(path_summaries, INT_COLS_OF_SUMMARY)
# Paths with the most subjects first.
path_summaries.sort_values("n_subjects", ascending=False, inplace=True)

path_summaries

Unnamed: 0_level_0,n_subjects,n_repos,n_commits,positive_rate,n_distinct_channels
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
README.md,16172,355,16172,0.195709,32
CONTRIBUTING.md,909,99,909,0.333333,30
readme.md,412,10,412,0.274272,19
README.txt,174,15,174,0.229885,14
contributing.md,27,3,27,0.185185,3


### Detected channels

In [25]:
# One row per (subject, channel) via explode; subjects without any channel
# yield a missing value, which is dropped before counting.
channel_counts = (
    subjects["detected_channels"]
        .explode()
        .dropna()
        .value_counts()
)

channel_counts

detected_channels
form                  643
website               624
forum                 580
pull_request          574
fork                  562
issues                545
mail                  460
mailing_list          284
github_issues         256
github_wiki           242
ping_on_github        233
telegram              192
blog                  152
jira                  148
linkedin              134
slack                 126
facebook               95
twitter                89
rss_feed               86
github_discussions     68
irc                    67
discord                66
gitter                 57
stack_overflow         53
google_group           27
patreon                20
medium                 15
youtube                13
meetup                  5
reddit                  5
newsletter              5
skype                   4
zulip                   2
Name: count, dtype: int64

In [26]:
# Summarize the distribution of the per-channel counts as a one-row frame.
channel_stats = value_counts_stats(channel_counts).to_frame().T
channel_stats = col_to_int(channel_stats, INT_COLS_OF_VC_STATS)

channel_stats

Unnamed: 0,n_categories,total_count,min_count,max_count,mean_count,median_count,q1_count,q3_count,iqr_count,top_1_count,top_1_share,top_5_share,top_10_share
0,33,6432,2,643,194.909091,95,27,256,229,643.0,0.099969,0.463775,0.741604


In [27]:
# Channels with the highest and the lowest number of detections.
print(f"Most frequently detected channel? {channel_counts.idxmax()} with {channel_counts.max()} counts")
print(f"Least commonly recognized channel? {channel_counts.idxmin()} with {channel_counts.min()} counts")

Most frequently detected channel? form with 643 counts
Least commonly recognized channel? zulip with 2 counts


## Descriptive Statistics

### Age (in days) of the repo at the time of the commit

In [28]:
# Box-plot statistics of the repo age (in days) at commit time.
repo_age_stats = boxplot_stats(subjects["repo_age"])
print(repo_age_stats)

q1                 170.259450
median             733.159954
mean              1103.426051
q3                1758.364465
iqr               1588.105014
lower_whisker        0.000000
upper_whisker     4139.273380
n_outliers         288.000000
min                  0.000000
max               5859.094514
n                17694.000000
dtype: float64


### Age group of the repo at the time of the commit

In [29]:
# Box-plot statistics of the age group; the group value is the bucket's
# lower bound in years, so it is treated as a plain number here.
repo_age_group_stats = boxplot_stats(subjects["repo_age_group"])
print(repo_age_group_stats)

q1                   0.000000
median               2.000000
mean                 2.606533
q3                   4.000000
iqr                  4.000000
lower_whisker        0.000000
upper_whisker       10.000000
n_outliers         343.000000
min                  0.000000
max                 15.000000
n                17694.000000
dtype: float64


### Year in which the commit took place

In [30]:
# Box-plot statistics of the commit year.
year_stats = boxplot_stats(subjects["year"])
print(year_stats)

q1                2015.000000
median            2017.000000
mean              2017.629875
q3                2020.000000
iqr                  5.000000
lower_whisker     2008.000000
upper_whisker     2025.000000
n_outliers           0.000000
min               2008.000000
max               2025.000000
n                17694.000000
dtype: float64


## Temporal Trends

## Project Lifecycle Analysis

## Regression Model