# Data Analysis

## Global Imports

In [None]:
import numpy as np
import pandas as pd

## Functions

### Data access functions

In [None]:
from pathlib import Path


def data(root: Path) -> Path:
    """Return the ``data`` directory located directly under *root*."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the ``data/output`` directory under *root*.

    The path is built from *root* directly rather than via ``data(root)``:
    further down in this notebook the name ``data`` is rebound to a
    DataFrame, after which a call through ``data(...)`` raises ``TypeError``.
    """
    return root / "data" / "output"


def raw_results(root: Path) -> Path:
    """Return the path of ``data/raw_results/raw_results.csv`` under *root*.

    The path is built from *root* directly rather than via ``data(root)``:
    further down in this notebook the name ``data`` is rebound to a
    DataFrame, so re-running the loading cell would otherwise raise
    ``TypeError``.
    """
    return root / "data" / "raw_results" / "raw_results.csv"


def subjects(root: Path) -> Path:
    """Return the path of ``data/raw_results/subjects.csv`` under *root*.

    The path is built from *root* directly rather than via ``data(root)``,
    because the name ``data`` is rebound to a DataFrame later in this
    notebook.

    NOTE: the name ``subjects`` itself is also rebound to a DataFrame further
    down, so this helper is only reachable before that cell has run.
    """
    return root / "data" / "raw_results" / "subjects.csv"


def commits(root: Path) -> Path:
    """Return the path of ``data/raw_results/commits.csv`` under *root*.

    The path is built from *root* directly rather than via ``data(root)``:
    further down in this notebook the name ``data`` is rebound to a
    DataFrame, after which a call through ``data(...)`` raises ``TypeError``.
    """
    return root / "data" / "raw_results" / "commits.csv"


def truth(root: Path) -> Path:
    """Return the path of ``data/truth/truth.csv`` under *root*.

    The path is built from *root* directly rather than via ``data(root)``:
    further down in this notebook the name ``data`` is rebound to a
    DataFrame, after which a call through ``data(...)`` raises ``TypeError``.
    """
    return root / "data" / "truth" / "truth.csv"


### Function to summarize DataFrame

In [None]:
from collections.abc import Callable, Sequence
from dataclasses import dataclass


# Signature shared by the metric functions below: each receives a DataFrame
# and returns one summary value.
MetricFn = Callable[[pd.DataFrame], object]


def _require_cols(g: pd.DataFrame, cols: Sequence[str]) -> None:
    missing = [c for c in cols if c not in g.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")


# ---------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------
def m_n_subjects(g: pd.DataFrame) -> int:
    """Return the number of rows in *g* as a plain ``int``."""
    n_rows = g.shape[0]
    return int(n_rows)


def m_n_repos(g: pd.DataFrame) -> int:
    """Return the number of distinct ``full_name_of_repo`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "full_name_of_repo"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_commits(g: pd.DataFrame) -> int:
    """Return the number of distinct ``commit_sha`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "commit_sha"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_n_distinct_paths(g: pd.DataFrame) -> int:
    """Return the number of distinct ``path`` values in *g*.

    Raises ``KeyError`` (via ``_require_cols``) if the column is absent.
    """
    column = "path"
    _require_cols(g, [column])
    distinct = g[column].nunique()
    return int(distinct)


def m_positive_rate(g: pd.DataFrame) -> float:
    """Return the sum of ``is_ccdc_event`` divided by the number of rows.

    An empty frame yields ``NaN``. Raises ``KeyError`` (via
    ``_require_cols``) if the column is absent.
    """
    _require_cols(g, ["is_ccdc_event"])
    n_rows = len(g)
    if n_rows == 0:
        return np.nan
    positives = g["is_ccdc_event"].sum()
    return float(positives / n_rows)


def m_n_distinct_channels(g: pd.DataFrame) -> int:
    """Return the number of distinct entries across ``detected_channels``.

    The column is exploded so that each element of a list-like cell is
    counted on its own; missing values are ignored. Raises ``KeyError``
    (via ``_require_cols``) if the column is absent.
    """
    _require_cols(g, ["detected_channels"])
    flattened = g["detected_channels"].explode()
    present = flattened.dropna()
    return int(present.nunique())


# ---------------------------------------------------------------------
# Registry: metric_name -> (fn, required_columns)
# ---------------------------------------------------------------------
METRICS: dict[str, tuple[MetricFn, tuple[str, ...]]] = {
    metric_name: (metric_fn, needed_cols)
    for metric_name, metric_fn, needed_cols in (
        # (name, implementation, columns the implementation reads)
        ("n_subjects", m_n_subjects, ()),
        ("n_repos", m_n_repos, ("full_name_of_repo",)),
        ("n_commits", m_n_commits, ("commit_sha",)),
        ("n_distinct_paths", m_n_distinct_paths, ("path",)),
        ("positive_rate", m_positive_rate, ("is_ccdc_event",)),
        ("n_distinct_channels", m_n_distinct_channels, ("detected_channels",)),
    )
}


# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
@dataclass(frozen=True)
class SummarizeConfig:
    """Immutable settings consumed by ``summarize_subjects_configurable``."""

    # Names of the metrics to compute; each must be a key of METRICS.
    include_metrics: tuple[str, ...]
    # Columns treated as group keys by the two switches below.
    group_keys: tuple[str, ...] = ()
    # If True, a metric whose required columns include a group key is rejected.
    forbid_access_to_group_keys: bool = True
    # If True, group-key columns are dropped from the frame before metrics run.
    drop_group_keys_from_frame: bool = True
    # Further columns that no selected metric may require.
    deny_columns: tuple[str, ...] = ()


# ---------------------------------------------------------------------
# Core summarization
# ---------------------------------------------------------------------
def summarize_subjects_configurable(
    g: pd.DataFrame,
    cfg: SummarizeConfig,
) -> pd.Series:
    """Evaluate the metrics selected in *cfg* on the frame *g*.

    Metrics are processed in the order of ``cfg.include_metrics``; each one
    is validated immediately before it is computed.

    Returns
    -------
    pd.Series
        One entry per requested metric, indexed by metric name.

    Raises
    ------
    KeyError
        If a requested metric is not registered in ``METRICS``.
    ValueError
        If a requested metric requires a column that is forbidden, i.e.
        listed in ``cfg.deny_columns`` or (when
        ``cfg.forbid_access_to_group_keys`` is set) in ``cfg.group_keys``.
    """
    # Optionally hide the grouping columns from the metric functions.
    frame = g
    if cfg.drop_group_keys_from_frame and cfg.group_keys:
        frame = g.drop(columns=list(cfg.group_keys), errors="ignore")

    blocked = set(cfg.deny_columns)
    if cfg.forbid_access_to_group_keys:
        blocked.update(cfg.group_keys)

    results: dict[str, object] = {}

    for metric in cfg.include_metrics:
        entry = METRICS.get(metric)
        if entry is None:
            raise KeyError(
                f"Unknown metric: {metric}. "
                f"Known: {sorted(METRICS)}"
            )

        compute, needed = entry

        clashes = [column for column in needed if column in blocked]
        if clashes:
            raise ValueError(
                f"Metric '{metric}' requires forbidden columns {clashes}. "
                f"Forbidden: {sorted(blocked)}"
            )

        results[metric] = compute(frame)

    return pd.Series(results)

### Tuple with each and every available metric

In [None]:
# Every registered metric name, in registry order. Derived from METRICS so the
# tuple cannot drift out of sync when a metric is added or removed there.
ALL_METRICS = tuple(METRICS)

### Function to cast columns of DataFrame to Int64

In [None]:
from collections.abc import Iterable

def col_to_int(df: pd.DataFrame, int_cols: Iterable[str]) -> pd.DataFrame:
    """Cast the listed columns of *df* to the nullable ``Int64`` dtype.

    Names in *int_cols* that are not columns of *df* are skipped. The frame
    is modified in place and the same object is returned.
    """
    present = [name for name in int_cols if name in df.columns]
    converted = df[present].astype("Int64")
    df[present] = converted
    return df

#### "Popular" Int64 columns

In [None]:
# Summary metrics that are counts and are therefore cast to Int64.
INT_COLS_OF_SUMMARY = [
    "n_subjects", "n_repos", "n_commits",
    "n_distinct_paths", "n_distinct_channels",
]

# Columns of value_counts_stats() that always hold whole numbers and can
# therefore be cast to Int64.
# The quartile-based columns (median_count, q1_count, q3_count, iqr_count) are
# deliberately not listed: quantile() interpolates between counts, so these
# values are generally fractional, and astype("Int64") raises a TypeError on
# fractional floats.
INT_COLS_OF_VC_STATS = [
    "n_categories",
    "total_count",
    "min_count",
    "max_count",
]

### Boxplot stats function

**Includes the mean average.**

In [None]:
def boxplot_stats(s: pd.Series) -> pd.Series:
    """Return the box-plot summary of *s*, plus mean, min, max and size.

    Missing values are removed first. The whiskers are the most extreme
    observations still inside the fences ``q1 - 1.5 * iqr`` and
    ``q3 + 1.5 * iqr``; observations beyond the whiskers count as outliers.
    """
    values = s.dropna()

    first_q, second_q, third_q = (
        values.quantile(p) for p in (0.25, 0.50, 0.75)
    )
    spread = third_q - first_q

    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread
    whisker_lo = values.loc[values >= low_fence].min()
    whisker_hi = values.loc[values <= high_fence].max()

    outside = values.lt(whisker_lo) | values.gt(whisker_hi)

    summary = {
        "q1": first_q,
        "median": second_q,
        "mean": values.mean(),
        "q3": third_q,
        "iqr": spread,
        "lower_whisker": whisker_lo,
        "upper_whisker": whisker_hi,
        "n_outliers": outside.sum(),
        "min": values.min(),
        "max": values.max(),
        "n": len(values),
    }
    return pd.Series(summary)


### Function to get descriptive statistics / stats for a given value_counts() result

In [None]:
def value_counts_stats(vc: pd.Series) -> pd.Series:
    """
    Descriptive statistics for a value_counts() result.

    Parameters
    ----------
    vc : pd.Series
        Output of value_counts(): index = category, values = counts.
        Missing counts are dropped. The counts do not have to be sorted:
        the concentration figures are computed on a descending-sorted copy.

    Returns
    -------
    pd.Series
        Descriptive statistics of the distribution of counts. An empty
        float64 Series is returned when there are no counts.
    """
    vc = vc.dropna()

    if vc.empty:
        return pd.Series(dtype="float64")

    # Largest counts first, so the "top k" figures stay correct even when the
    # caller passes counts that are unsorted or sorted by category.
    ranked = vc.sort_values(ascending=False)

    total = ranked.sum()
    n_categories = len(ranked)

    q1 = ranked.quantile(0.25)
    q2 = ranked.quantile(0.50)
    q3 = ranked.quantile(0.75)
    iqr = q3 - q1

    top_1 = ranked.iloc[0]
    # Slicing past the end just yields fewer rows, so with fewer than 5 (or 10)
    # categories these sums equal the total without a separate length check.
    top_5_sum = ranked.iloc[:5].sum()
    top_10_sum = ranked.iloc[:10].sum()

    return pd.Series({
        # structure
        "n_categories": n_categories,
        "total_count": total,

        # distribution of counts
        "min_count": ranked.min(),
        "max_count": ranked.max(),
        "mean_count": ranked.mean(),
        "median_count": q2,
        "q1_count": q1,
        "q3_count": q3,
        "iqr_count": iqr,

        # concentration / dominance
        "top_1_count": top_1,
        "top_1_share": top_1 / total,
        "top_5_share": top_5_sum / total,
        "top_10_share": top_10_sum / total,
    })

## Data Preparation

### Reading the raw data

In [None]:
# Load the raw results CSV, located relative to the current working directory.
raw_data = pd.read_csv(filepath_or_buffer=raw_results(Path.cwd()), low_memory=False)

### Working with a copy of the raw data

In [None]:
# Independent (deep) copy, so the cells below never alter raw_data.
# NOTE: this rebinds the name `data`, which until now referred to the
# data(root) path helper defined near the top of the notebook.
data = raw_data.copy(deep=True)

### Dropping unnecessary columns / Keeping interesting ones

I have decided not to analyse the snapshot data of a repo. Why not? Because it only reflects the state of the repo at the point in time when I gathered the data. If a repo "has GitHub Discussions", I have no idea since when that has been the case. What if I drew conclusions from it when GitHub Discussions had actually been enabled for that repo just the day before I pulled the data?

In [None]:
# Columns the analysis keeps; everything else is removed from the frame.
COLS_OF_INTEREST = [
    "full_name_of_repo",
    "commit_sha",
    "path",
    "is_ccdc_event",
    "detected_channel",
    "created_at",
    "pushed_at",
    "updated_at",
    "date",
]

# Every column of the frame that is not listed above, in frame order.
UNNECESSARY_COLS = [col for col in data.columns if col not in COLS_OF_INTEREST]

data.drop(columns=UNNECESSARY_COLS, inplace=True)

### Data conversion – if needed

In [None]:
def to_bool(series: pd.Series) -> pd.Series:
    """Convert textual ``true``/``false`` values to Python booleans.

    A Series that already has ``bool`` dtype is returned unchanged. Any other
    Series is cast to string, trimmed and lower-cased before mapping; values
    other than ``"true"`` or ``"false"`` become missing.
    """
    if series.dtype != bool:
        normalized = series.astype("string").str.strip().str.lower()
        return normalized.map({"true": True, "false": False})
    return series


data["is_ccdc_event"] = to_bool(data["is_ccdc_event"])

# Channels become strings; a missing channel is represented by "".
channel_as_text = data["detected_channel"].astype("string")
data["detected_channel"] = channel_as_text.fillna("")

DATETIME_COLS = ["created_at", "pushed_at", "updated_at", "date"]

# Parse timestamps as UTC; unparseable values turn into NaT.
for col in DATETIME_COLS:
    if col not in data.columns:
        continue
    data[col] = pd.to_datetime(data[col], errors="coerce", utc=True)


### Channel aggregation – Introducing the subjects dataframe

Objective: One row = one subject

In [None]:
# A subject is identified by repository, commit and file path.
KEY_COLS = ["full_name_of_repo", "commit_sha", "path"]

# dropna=False: rows with a missing key component still form groups.
gb = data.groupby(by=KEY_COLS, dropna=False)


def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse the channel values of one group into a sorted, unique tuple.

    Missing values and strings that are blank after trimming are left out.
    Kept values are returned as they are (they are not trimmed themselves).
    """
    distinct = {
        value
        for value in x.astype("string")
        if isinstance(value, str) and value.strip()
    }
    return tuple(sorted(distinct))


# Per subject: the first event flag and the set of all detected channels.
agg = gb.agg(
    detected_channels=("detected_channel", agg_channels),
    is_ccdc_event=("is_ccdc_event", "first"),
)[["is_ccdc_event", "detected_channels"]]

# Carry over the remaining columns of interest, taking the first value per subject.
for col in COLS_OF_INTEREST:
    already_handled = (
        col in KEY_COLS
        or col in agg.columns
        or col == "detected_channel"
    )
    if already_handled:
        continue
    agg[col] = gb[col].first()

# NOTE: this rebinds the name `subjects`, which until now referred to the
# subjects(root) path helper defined near the top of the notebook.
subjects = agg.reset_index()

### Adding "first_activity" column

The first activity of a repo is the earliest timestamp known for it: either its earliest commit in the data or the date on which the repo was officially created, whichever comes first.

In [None]:
# Earlier of commit date and repo creation date, per subject ...
earliest_per_row = subjects[["date", "created_at"]].min(axis=1)

# ... reduced to the minimum per repo and broadcast back to every row.
subjects["first_activity"] = earliest_per_row.groupby(
    subjects["full_name_of_repo"], dropna=False
).transform("min")

### Adding "repo_age" column

In [None]:
# Time between the repo's first activity and the commit, expressed in days.
elapsed = subjects["date"] - subjects["first_activity"]
one_day = np.timedelta64(1, "D")
subjects["repo_age"] = elapsed.to_numpy() / one_day

# Order by repo and age; ignore_index renumbers the rows 0..n-1 afterwards.
subjects.sort_values(
    ["full_name_of_repo", "repo_age"],
    ignore_index=True,
    inplace=True,
)

### Adding "repo_age_group" column

In [None]:
AGE_GROUPS = [
    (0, 1, "0-1"),
    (1, 2, "1-2"),
    (2, 3, "2-3"),
    (3, 4, "3-4"),
    (4, 5, "4-5"),
    (5, 6, "5-6"),
    (6, 7, "6-7"),
    (7, 8, "7-8"),
    (8, 9, "8-9"),
    (9, 10, "9-10"),
    (10, 11, "10-11"),
    (11, 12, "11-12"),
    (12, 13, "12-13"),
    (13, 14, "13-14"),
    (14, 15, "14-15"),
    (15, 999, "15+"),
]


def assign_age_group(age_in_days: float) -> int | None:
    if pd.isna(age_in_days):
        return None
    for lo, hi, label in AGE_GROUPS:
        if lo * 365.25 <= age_in_days < hi * 365.25:
            return lo
    return None


# Bucket every subject by the repo's age at commit time. The stored value is
# the lower bound (in years) of the matching AGE_GROUPS entry; it is missing
# when the age is missing or matches no group.
subjects["repo_age_group"] = subjects["repo_age"].apply(assign_age_group)

### Adding "year" column for temporal analysis

In [None]:
# Calendar year of the commit timestamp; used as grouping key for the
# temporal analysis further down.
subjects["year"] = subjects["date"].dt.year

### Extracting a dedicated repos dataframe

In [None]:
# Columns that describe the repo rather than the individual subject.
REPO_LEVEL_COLS = ["created_at", "pushed_at", "updated_at", "first_activity"]

# One row per repo, taking the first value of each repo-level column.
repos = subjects.groupby("full_name_of_repo", as_index=False).agg(
    **{col: (col, "first") for col in REPO_LEVEL_COLS}
)

# These columns now live in `repos`, so remove them from the subjects.
subjects.drop(columns=REPO_LEVEL_COLS, inplace=True)

### Adding "last_activity" and "age_today" columns to repos dataframe

In [None]:
# Later of the push and update timestamps, per repo.
latest_touch = repos[["pushed_at", "updated_at"]].max(axis=1)
repos["last_activity"] = latest_touch

# Span between the first and the last recorded activity.
repos["age_today"] = repos["last_activity"].sub(repos["first_activity"])

# Longest-lived repos first; ignore_index renumbers the rows 0..n-1.
repos.sort_values("age_today", ascending=False, ignore_index=True, inplace=True)

## Descriptive Statistics

### How many of this and that?

In [None]:
# No grouping: every metric is computed once over all subjects.
group_keys = []
cfg = SummarizeConfig(include_metrics=ALL_METRICS, group_keys=tuple(group_keys))

# Turn the metric Series into a one-row frame and cast the count columns.
summary = col_to_int(
    summarize_subjects_configurable(subjects, cfg).to_frame().T,
    INT_COLS_OF_SUMMARY,
)

In [None]:
summary

### Earliest commit and latest commit

In [None]:
# Row labels of the chronologically first and last commit. Both are looked up
# via the commit timestamp, so the repo, SHA and date printed on one line
# always describe the same subject.
# The timestamps are bound to names first so that the f-strings need no nested
# double quotes, which only parse on Python 3.12 and newer.
earliest_commit = subjects["date"].idxmin()
earliest_commit_repo = subjects.loc[earliest_commit, "full_name_of_repo"]
earliest_commit_sha = subjects.loc[earliest_commit, "commit_sha"]
earliest_commit_date = subjects.loc[earliest_commit, "date"]
print(f"Earliest commit? {earliest_commit_sha}@{earliest_commit_repo} at {earliest_commit_date}")

# idxmax on "date", not on "repo_age": the commit made at the greatest repo age
# is not necessarily the most recent commit overall.
latest_commit = subjects["date"].idxmax()
latest_commit_repo = subjects.loc[latest_commit, "full_name_of_repo"]
latest_commit_sha = subjects.loc[latest_commit, "commit_sha"]
latest_commit_date = subjects.loc[latest_commit, "date"]
print(f"Latest commit? {latest_commit_sha}@{latest_commit_repo} at {latest_commit_date}")

### Subjects grouped by repo

In [None]:
group_keys = ["full_name_of_repo"]

# n_repos is left out: it would need the column that is being grouped on.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_commits",
        "n_distinct_paths",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The group key is moved into the index before grouping on it by name.
subjects_by_repo = subjects.set_index("full_name_of_repo").groupby(
    group_keys, dropna=True
)
repo_summaries = subjects_by_repo.apply(
    lambda grp: summarize_subjects_configurable(grp, cfg)
)

# Cast the count columns, then list the repos with the most subjects first.
repo_summaries = col_to_int(repo_summaries, INT_COLS_OF_SUMMARY).sort_values(
    "n_subjects", ascending=False
)

In [None]:
repo_summaries

### Subjects grouped by path

In [None]:
group_keys = ["path"]

# n_distinct_paths is left out: it would need the column that is being grouped on.
cfg = SummarizeConfig(
    include_metrics=(
        "n_subjects",
        "n_repos",
        "n_commits",
        "positive_rate",
        "n_distinct_channels",
    ),
    group_keys=tuple(group_keys),
)

# The group key is moved into the index before grouping on it by name.
subjects_by_path = subjects.set_index("path").groupby(group_keys, dropna=True)
path_summaries = subjects_by_path.apply(
    lambda grp: summarize_subjects_configurable(grp, cfg)
)

# Cast the count columns, then list the paths with the most subjects first.
path_summaries = col_to_int(path_summaries, INT_COLS_OF_SUMMARY).sort_values(
    "n_subjects", ascending=False
)

path_summaries

### Detected channels

In [None]:
# One row per (subject, channel); subjects without any channel drop out.
exploded_channels = subjects["detected_channels"].explode()

# Number of subjects in which each channel was detected.
channel_counts = exploded_channels.dropna().value_counts()

In [None]:
channel_counts

In [None]:
# One-row frame of the count statistics, with the integer columns cast to Int64.
channel_stats = col_to_int(
    value_counts_stats(channel_counts).to_frame().T,
    INT_COLS_OF_VC_STATS,
)

channel_stats

In [None]:
# Channel with the highest count.
top_channel = channel_counts.idxmax()
top_channel_count = channel_counts.max()
print(f"Most frequently detected channel? {top_channel} with {top_channel_count} counts")

# Channel with the lowest count.
rarest_channel = channel_counts.idxmin()
rarest_channel_count = channel_counts.min()
print(f"Least commonly recognized channel? {rarest_channel} with {rarest_channel_count} counts")

### Age (in days) of the repo at the time of the commit

In [None]:
# Box-plot summary of the repo age (in days) at commit time, over all subjects.
repo_age_stats = boxplot_stats(subjects["repo_age"])
print(repo_age_stats)

### Age group of the repo at the time of the commit

In [None]:
# Box-plot summary of the age group (lower bound in years) over all subjects.
repo_age_group_stats = boxplot_stats(subjects["repo_age_group"])
print(repo_age_group_stats)

### Year in which the commit took place

In [None]:
# Box-plot summary of the commit year over all subjects.
year_stats = boxplot_stats(subjects["year"])
print(year_stats)

## Temporal Trends

### Yearly summary

In [None]:
group_keys = ["year"]
cfg = SummarizeConfig(include_metrics=ALL_METRICS, group_keys=tuple(group_keys))

# The group key is moved into the index before grouping on it by name.
subjects_by_year = subjects.set_index("year").groupby(group_keys)

# One row of metrics per year, with the count columns cast to Int64.
yearly_summary = col_to_int(
    subjects_by_year.apply(lambda grp: summarize_subjects_configurable(grp, cfg)),
    INT_COLS_OF_SUMMARY,
)

In [None]:
yearly_summary

## Project Lifecycle Analysis

In [None]:
group_keys = ["repo_age_group"]
cfg = SummarizeConfig(include_metrics=ALL_METRICS, group_keys=tuple(group_keys))

# The group key is moved into the index before grouping on it by name.
subjects_by_age_group = subjects.set_index("repo_age_group").groupby(group_keys)

# One row of metrics per age group, with the count columns cast to Int64.
summary_per_repo_age_group = col_to_int(
    subjects_by_age_group.apply(
        lambda grp: summarize_subjects_configurable(grp, cfg)
    ),
    INT_COLS_OF_SUMMARY,
)

In [None]:
summary_per_repo_age_group

## Channel Evolution

### Function to get value counts for a group

In [None]:
def vc_detected_channels(
    g: pd.DataFrame,
    *,
    drop_empty: bool = True,
) -> pd.Series:
    """
    Count how often each channel occurs within one group.

    Reads ``g["detected_channels"]``, whose cells are expected to be
    list-likes of channel names (possibly empty) or NaN.

    Parameters
    ----------
    g : pd.DataFrame
        The rows of one group.
    drop_empty : bool
        If True, channels that are blank after trimming are not counted.

    Returns
    -------
    pd.Series
        Counts per channel; the Series is named "count" and its index
        is named "channel".
    """
    # Flatten to one channel per row. Empty list-likes and NaN cells both end
    # up as NaN after explode() and are discarded right away.
    channels = g["detected_channels"].explode().dropna().astype("string")

    if drop_empty:
        channels = channels[channels.str.strip().ne("")]

    counts = channels.value_counts(dropna=False)

    # Fixed labels, so that the results of different groups line up.
    return counts.rename("count").rename_axis("channel")

### Channel counts per year

In [None]:
group_keys = ["year"]

# Channel counts computed separately for every year.
channels_per_year = subjects.groupby(group_keys)[["detected_channels"]].apply(
    vc_detected_channels
)

# Wide view: one row per year, one column per channel, 0 where a channel is absent.
channels_matrix = channels_per_year.unstack(fill_value=0).reset_index()

# Long view: one row per (year, channel).
channels_per_year = channels_per_year.reset_index()

# Distribution statistics of the counts within each year.
stats = (
    channels_per_year.groupby(group_keys)["count"]
    .apply(value_counts_stats)
    .unstack(fill_value=0)
)

n = 3

# Both rankings share one ordering: by year, then by descending count.
ranked_per_year = channels_per_year.sort_values(
    ["year", "count"], ascending=[True, False]
)

top_n_per_year = (
    ranked_per_year.groupby("year", dropna=False, sort=False)
    .head(n)
    .reset_index(drop=True)
)

bottom_n_per_year = (
    ranked_per_year.groupby("year", dropna=False, sort=False)
    .tail(n)
    .reset_index(drop=True)
)

In [None]:
channels_matrix

In [None]:
stats

In [None]:
top_n_per_year

In [None]:
bottom_n_per_year

### Channel counts per repo age group

In [None]:
group_keys = ["repo_age_group"]

# Channel counts computed separately for every repo age group.
channels_per_ag = subjects.groupby(group_keys)[["detected_channels"]].apply(
    vc_detected_channels
)

# Wide view: one row per age group, one column per channel, 0 where absent.
channels_matrix_per_ag = (
    channels_per_ag.unstack(fill_value=0)
    .reset_index()
    .set_index("repo_age_group")
)

# Long view: one row per (age group, channel).
channels_per_ag = channels_per_ag.reset_index()

# Distribution statistics of the counts within each age group.
stats_per_ag = (
    channels_per_ag.groupby(group_keys)["count"]
    .apply(value_counts_stats)
    .unstack(fill_value=0)
)

n = 3

# Both rankings share one ordering: by age group, then by descending count.
ranked_per_ag = channels_per_ag.sort_values(
    ["repo_age_group", "count"], ascending=[True, False]
)

top_n_per_ag = (
    ranked_per_ag.groupby("repo_age_group", dropna=False, sort=False)
    .head(n)
    .reset_index(drop=True)
)

bottom_n_per_ag = (
    ranked_per_ag.groupby("repo_age_group", dropna=False, sort=False)
    .tail(n)
    .reset_index(drop=True)
)

In [None]:
channels_matrix_per_ag

In [None]:
stats_per_ag

In [None]:
top_n_per_ag

In [None]:
bottom_n_per_ag

### Per repo

In [None]:
group_keys = ["full_name_of_repo"]

# Channel counts computed separately for every repo.
channels_per_repo = subjects.groupby(group_keys)[["detected_channels"]].apply(
    vc_detected_channels
)

# Wide view: one row per repo, one column per channel, 0 where absent.
channels_matrix_per_repo = (
    channels_per_repo.unstack(fill_value=0)
    .reset_index()
    .set_index("full_name_of_repo")
)

# Long view: one row per (repo, channel).
channels_per_repo = channels_per_repo.reset_index()

# Distribution statistics of the counts within each repo.
stats_per_repo = (
    channels_per_repo.groupby(group_keys)["count"]
    .apply(value_counts_stats)
    .unstack(fill_value=0)
)



In [None]:
# Repos with the most distinct channels first (display only, nothing is reassigned).
stats_per_repo.sort_values(by="n_categories", ascending=False)