# Data Analysis

## Preparing The Data

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def data(root: Path) -> Path:
    """Return the ``data`` directory directly below *root*."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the ``output`` directory inside the data directory of *root*."""
    return data(root).joinpath("output")


def raw_results(root: Path) -> Path:
    """Return the path ``data/raw_results/raw_results.csv`` below *root*."""
    return data(root).joinpath("raw_results", "raw_results.csv")


def subjects(root: Path) -> Path:
    """Return the path ``data/raw_results/subjects.csv`` below *root*."""
    return data(root).joinpath("raw_results", "subjects.csv")


def commits(root: Path) -> Path:
    """Return the path ``data/raw_results/commits.csv`` below *root*."""
    return data(root).joinpath("raw_results", "commits.csv")


def truth(root: Path) -> Path:
    """Return the path ``data/truth/truth.csv`` below *root*."""
    return data(root).joinpath("truth", "truth.csv")


In [None]:
# Load the raw results CSV, resolved relative to the current working directory.
raw_results_path = raw_results(Path.cwd())
raw_results_df = pd.read_csv(raw_results_path, low_memory=False)

In [None]:
# Work on a copy so the freshly loaded frame stays untouched.
df = raw_results_df.copy()

# Columns that are not used by the analysis below; only those that are
# actually present in the frame are dropped.
unused_columns = [
    "full_name",
    "clone_url",
    "git_url",
    "teams_url",
    "created_at",
    "sha",
    "url",
    "message",
]
df = df.drop(columns=[name for name in unused_columns if name in df.columns])

In [None]:
def to_bool(series: pd.Series) -> pd.Series:
    """Convert a column of textual booleans into ``True`` / ``False`` values.

    A series whose dtype is already ``bool`` is returned unchanged. Otherwise
    every value is converted to a string, stripped of surrounding whitespace
    and lower-cased; ``"true"`` maps to ``True`` and ``"false"`` to ``False``.
    Anything else (including missing values) is not in the mapping and
    therefore ends up as a missing value in the result.
    """
    if series.dtype == bool:
        return series
    s = series.astype("string").str.strip().str.lower()
    mapping = {"true": True, "false": False}
    return s.map(mapping)


df["is_ccdc_event"] = to_bool(df["is_ccdc_event"])

# Check *after* the conversion: only then do values that could not be parsed
# show up as NaN, which is what the warning reports.
if df["is_ccdc_event"].isna().any():
    n = int(df["is_ccdc_event"].isna().sum())
    print(f"WARNING: {n} rows have non-parseable is_ccdc_event -> NaN")

In [None]:
# Store channels as pandas strings and represent "no channel" as "".
channel_text = df["detected_channel"].astype("string")
df["detected_channel"] = channel_text.fillna("")

In [None]:
# Rows are grouped by (repo, commit, path); rows with missing key values are
# kept as their own groups (dropna=False).
key_cols = ["full_name_of_repo", "commit_sha", "path"]
gb = df.groupby(by=key_cols, dropna=False)

def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse a group's channel values into a sorted tuple of distinct names.

    Missing values and strings that are empty or whitespace-only are skipped.
    The remaining values are de-duplicated and returned in sorted order; they
    are not stripped themselves.
    """
    distinct = {
        value
        for value in x.astype("string").tolist()
        if isinstance(value, str) and value.strip()
    }
    return tuple(sorted(distinct))

# One row per group: the group's first event flag and date, plus the sorted
# tuple of distinct channels detected within the group.
agg = gb.agg(
    **{
        "is_ccdc_event": ("is_ccdc_event", "first"),
        "date": ("date", "first"),
        "detected_channels": ("detected_channel", agg_channels),
    }
)

# Repository-level columns carried along with each group's first value;
# columns missing from the input frame are skipped.
repo_metadata_columns = (
    "id",
    "homepage",
    "pushed_at",
    "updated_at",
    "has_discussions",
    "has_issues",
    "has_pages",
    "has_projects",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "stargazers_count",
    "subscribers_count",
    "size",
)
for column in repo_metadata_columns:
    if column not in df.columns:
        continue
    agg[column] = gb[column].first()

# Turn the group keys back into ordinary columns and continue with the
# aggregated frame as the working frame.
df = agg = agg.reset_index()

In [None]:
date_cols = ["pushed_at", "updated_at", "date"]

# Parse the timestamp columns that are present as UTC datetimes; values that
# cannot be parsed are coerced instead of raising (errors="coerce").
for column in (name for name in date_cols if name in df.columns):
    df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)

In [None]:
key_cols = ["full_name_of_repo"]

# A repo's "birthday" is the earliest date among its rows, broadcast back to
# every row of that repo.
dates_by_repo = df.groupby(key_cols, dropna=False)["date"]
df["birthday"] = dates_by_repo.transform("min")

# Sanity check: the per-repo minimum date must equal the birthday stored on
# the repo's rows.
expected = df.groupby("full_name_of_repo", dropna=False)["date"].min()
actual = df.groupby("full_name_of_repo", dropna=False)["birthday"].first()

assert expected.equals(actual), "ERROR"

In [None]:
# Repo age at each row: time elapsed since the repo's birthday, expressed as
# a (fractional) number of days.
elapsed_since_birthday = df["date"] - df["birthday"]
one_day = np.timedelta64(1, "D")
df["age_in_days"] = elapsed_since_birthday.to_numpy() / one_day

In [None]:
# Repo age brackets as (lower bound in years, upper bound in years, label).
# The lower bound is inclusive, the upper bound exclusive.
AGE_GROUPS_YEARS = [
    (year, year + 1, f"{year}-{year + 1}") for year in range(15)
]
# Catch-all bracket for everything from 15 years upwards. Its label must match
# its lower bound ("15+"); "14+" would overlap with the "14-15" bracket.
AGE_GROUPS_YEARS.append((15, 999, "15+"))


def assign_age_group(age_in_days: float) -> str:
    """Return the label of the age bracket that contains *age_in_days*.

    Brackets are taken from ``AGE_GROUPS_YEARS``; a year counts as 365.25
    days. Missing ages and ages that fall into no bracket (for example
    negative values) yield ``"unknown"``.
    """
    if pd.isna(age_in_days):
        return "unknown"
    for lo, hi, label in AGE_GROUPS_YEARS:
        if lo * 365.25 <= age_in_days < hi * 365.25:
            return label
    return "unknown"

# Label every row with the age bracket of its repo at that point in time.
age_groups = df["age_in_days"].map(assign_age_group)
df["repo_age_group_at_commit"] = age_groups

# Cell output: number of rows per age bracket.
df["repo_age_group_at_commit"].value_counts(dropna=False)

In [None]:
# Subject-level table: a copy of the working frame without the repo-level
# columns.
# NOTE(review): this rebinds the module-level name `subjects`, which earlier
# in this file refers to the path helper function of the same name.
subjects = df.copy()

repo_level_columns = [
    "id",
    "homepage",
    "pushed_at",
    "updated_at",
    "has_discussions",
    "has_issues",
    "has_pages",
    "has_projects",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "stargazers_count",
    "subscribers_count",
    "size",
    "birthday",
]
subjects = subjects.drop(
    columns=[name for name in repo_level_columns if name in subjects.columns]
)

In [None]:
# Order subjects by repo, then date, commit and path (all ascending) and
# renumber the rows from zero.
subject_sort_keys = ["full_name_of_repo", "date", "commit_sha", "path"]
subjects = subjects.sort_values(by=subject_sort_keys, ascending=True)
subjects = subjects.reset_index(drop=True)

subjects.head(30)

In [None]:
# Repo-level columns that are simply taken over via the "first" aggregation.
first_value_columns = [
    "birthday",
    "pushed_at",
    "updated_at",
    "homepage",
    "has_discussions",
    "has_issues",
    "has_pages",
    "has_projects",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "stargazers_count",
    "subscribers_count",
    "size",
]

# Output column -> (input column, aggregation). Missing event flags are
# treated as False when counting events and computing the event rate.
repo_aggregations = {
    "id": ("id", "first"),
    "n_subjects": ("commit_sha", "count"),
    "n_ccdc_events": ("is_ccdc_event", lambda s: int(s.fillna(False).sum())),
    "ccdc_rate": ("is_ccdc_event", lambda s: float(s.fillna(False).mean())),
    **{name: (name, "first") for name in first_value_columns},
}

# One row per repository.
repos = df.groupby("full_name_of_repo").agg(**repo_aggregations).reset_index()

# Sanity check: the per-repo subject counts must add up to the number of rows
# in the subjects table.
expected = len(subjects)
actual = repos["n_subjects"].sum()
assert expected == actual, "ERROR"

## Descriptive Statistics

## Overall, how many?

- How many repos?
- How many repos are still active? (max(pushed_at, updated_at) in 2025?)
- How many commits?
- How many distinct paths?
- How many subjects?
- How many positives?
- How many negatives?
- What is the rate of positives?
- How many distinct channels?
- How many subjects without a channel?
- How many positives without a channel?
- How many negatives with a channel?

## Outliers – overall

- First repo – earliest birthday
- Latest repo – latest birthday
- Oldest repo – the repo with the largest `subjects["age_in_days"]`
- First commit
- Latest commit
- Most popular path
- Least popular path
- Most often detected channel
- Rarest channel

## Across each and every repo, over the years…

- For each year, since the first commit:
    - How many repos (were born in that year)?
    - How many commits?
    - How many distinct paths?
    - How many subjects?
    - How many positives?
    - How many negatives?
    - What is the rate of positives?
    - How many distinct channels?
    - How many subjects without a channel?
    - How many positives without a channel?
    - How many negatives with a channel?
    - Most popular path
    - Least popular path
    - Most often detected channel
    - Rarest channel
    - How many channels per positive? Boxplot, Histogram

## Repo specific boxplots

- Birthday
- Age – max(pushed_at, updated_at) - birthday
- How many commits?
- How many distinct paths?
- How many subjects?
- How many positives?
- How many negatives?
- What is the rate of positives?
- How many distinct channels?
- How many subjects without a channel?
- How many positives without a channel?
- How many negatives with a channel?
- How many channels per positive? Boxplot, Histogram

## Within a repo, over the years…

- For each repo age group:
    - How many repos?
    - How many commits?
    - How many distinct paths?
    - How many subjects?
    - How many positives?
    - How many negatives?
    - What is the rate of positives?
    - How many distinct channels?
    - How many subjects without a channel?
    - How many positives without a channel?
    - How many negatives with a channel?
    - Most popular path
    - Least popular path
    - Most often detected channel
    - Rarest channel
    - How many channels per positive? Boxplot, Histogram

## Further Path analysis

- For each and every distinct path incl. suffix:
    - overall
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - Most often detected channel
        - Rarest channel
        - How many channels per positive?
    - per repo
        - How many subjects? Boxplot
        - How many positives? Boxplot
        - How many negatives? Boxplot
        - What is the rate of positives? Boxplot
        - How many distinct channels? Boxplot
        - How many subjects without a channel? Boxplot
        - How many positives without a channel? Boxplot
        - How many negatives with a channel? Boxplot
        - How many channels per positive? Boxplot
    - per year
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - How many channels per positive?
    - per repo age group
        - How many repos?
        - How many subjects?
        - How many positives?
        - How many negatives?
        - What is the rate of positives?
        - How many distinct channels?
        - How many subjects without a channel?
        - How many positives without a channel?
        - How many negatives with a channel?
        - How many channels per positive?

- For each path suffix (e.g., ".txt", ".md"):
    - per year:
        - How many subjects? – popularity of txt vs. md

## Further Channel analysis

- overall:
    - top N most often detected channels
    - top N rarest channels
- per year:
    - top N most often detected channels – race of the channels…
    - top N rarest channels
- per repo age group:
    - top N most often detected channels
    - top N rarest channels
- for each distinct channel:
    - first time
    - last time
    - How many repos?
    - How many subjects?
    - …

In [None]:
# Order repositories by birthday, earliest first, and preview the result.
repos = repos.sort_values(by="birthday", ascending=True)

repos.head(30)