# Data Analysis

## Preparing The Data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def data(root: Path) -> Path:
    """Return the ``data`` directory located directly under ``root``."""
    return root.joinpath("data")


def output(root: Path) -> Path:
    """Return the ``output`` directory inside the data directory of ``root``."""
    return data(root).joinpath("output")


def raw_results(root: Path) -> Path:
    """Return the path of ``raw_results/raw_results.csv`` inside the data directory."""
    return data(root).joinpath("raw_results", "raw_results.csv")


def subjects(root: Path) -> Path:
    """Return the path of ``raw_results/subjects.csv`` inside the data directory.

    NOTE(review): a later cell assigns a DataFrame to the name ``subjects``,
    which shadows this function for the rest of the kernel session; call it
    before that cell runs, or rename one of the two.
    """
    return data(root).joinpath("raw_results", "subjects.csv")


def commits(root: Path) -> Path:
    """Return the path of ``raw_results/commits.csv`` inside the data directory."""
    return data(root).joinpath("raw_results", "commits.csv")


def truth(root: Path) -> Path:
    """Return the path of ``truth/truth.csv`` inside the data directory."""
    return data(root).joinpath("truth", "truth.csv")


In [3]:
# Load the raw results CSV. The location is resolved relative to the current
# working directory, so the kernel has to be started from the directory that
# contains `data/`.
# `low_memory=False` makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype columns from chunk-wise type inference.
raw_results_df = pd.read_csv(
    raw_results(Path.cwd()),
    low_memory=False,
)

In [4]:
# Working frame: the raw results minus the columns listed below, which the
# rest of this notebook does not read.
#
# `errors="ignore"` skips any listed name that is not present, so one `drop`
# call replaces the former per-column loop (which rebuilt the whole frame once
# for every column it removed). The trailing `.copy()` guarantees that `df`
# shares no data with `raw_results_df`, so later in-place column assignments
# cannot leak back into the raw frame.
df = raw_results_df.drop(
    columns=[
        "full_name",
        "clone_url",
        "git_url",
        "teams_url",
        "created_at",
        "sha",
        "url",
        "message",
    ],
    errors="ignore",
).copy()

In [5]:
def to_bool(series: pd.Series) -> pd.Series:
    """Convert a column of textual booleans into ``True`` / ``False`` values.

    A series that already has ``bool`` dtype is returned unchanged. Otherwise
    every value is converted to a string, stripped of surrounding whitespace
    and lower-cased; ``"true"`` maps to ``True`` and ``"false"`` to ``False``.
    Anything else (including missing values) has no entry in the mapping and
    therefore becomes NaN in the result.
    """
    if series.dtype == bool:
        return series
    s = series.astype("string").str.strip().str.lower()
    mapping = {"true": True, "false": False}
    return s.map(mapping)


df["is_ccdc_event"] = to_bool(df["is_ccdc_event"])

# Report unparseable values AFTER the conversion: only at this point have
# values other than "true"/"false" been turned into NaN. Checking before the
# conversion (as this cell used to) only saw values that were already missing
# in the raw data and silently missed every non-parseable string.
if df["is_ccdc_event"].isna().any():
    n = int(df["is_ccdc_event"].isna().sum())
    print(f"WARNING: {n} rows have non-parseable is_ccdc_event -> NaN")

In [6]:
# Normalise `detected_channel` to pandas' string dtype and replace missing
# entries with the empty string, so later steps can treat "no channel" as "".
df["detected_channel"] = df["detected_channel"].astype("string").fillna("")

In [7]:
# One output row per (repository, commit, file path) combination.
key_cols = ["full_name_of_repo", "commit_sha", "path"]

# `dropna=False` keeps rows whose key columns contain missing values as their
# own groups instead of silently discarding them.
gb = df.groupby(key_cols, dropna=False)

def agg_channels(x: pd.Series) -> tuple[str, ...]:
    """Collapse a group's channel values into a sorted tuple of distinct names.

    Missing entries and entries that are empty or whitespace-only are skipped.
    The surviving values are de-duplicated and returned in ascending order;
    they are kept exactly as they appear (no stripping is applied to them).
    """
    distinct: set[str] = set()
    for value in x.astype("string").tolist():
        # Missing values come back as non-str sentinels after the cast.
        if not isinstance(value, str):
            continue
        if value.strip() == "":
            continue
        distinct.add(value)
    return tuple(sorted(distinct))

# Collapse each (repo, commit, path) group into a single row: keep the first
# event flag and date seen in the group, and merge all detected channels.
agg = gb.agg(
    is_ccdc_event=pd.NamedAgg(column="is_ccdc_event", aggfunc="first"),
    date=pd.NamedAgg(column="date", aggfunc="first"),
    detected_channels=pd.NamedAgg(column="detected_channel", aggfunc=agg_channels),
)

# Carry the listed metadata columns along (first value per group), skipping
# any that the input frame does not have.
for column in (
    "id",
    "homepage",
    "pushed_at",
    "updated_at",
    "has_discussions",
    "has_issues",
    "has_pages",
    "has_projects",
    "has_wiki",
    "forks_count",
    "open_issues_count",
    "stargazers_count",
    "subscribers_count",
    "size",
):
    if column not in df.columns:
        continue
    agg[column] = gb[column].first()

# Turn the group keys back into ordinary columns, then continue with the
# aggregated frame as the working frame (`df` and `agg` are the same object).
agg = agg.reset_index()
df = agg

In [8]:
# Columns holding timestamps; each one that exists is parsed into timezone-aware
# UTC datetimes. `errors="coerce"` turns unparseable values into NaT instead of
# raising.
date_cols = ["pushed_at", "updated_at", "date"]

for column in date_cols:
    if column not in df.columns:
        continue
    df[column] = pd.to_datetime(df[column], utc=True, errors="coerce")

In [9]:
# A repository's "birthday" is the smallest `date` among its rows; `transform`
# broadcasts that per-repository minimum back onto every row.
key_cols = ["full_name_of_repo"]
df["birthday"] = df.groupby(key_cols, dropna=False)["date"].transform("min")

# Sanity check: the birthday stored on each repository's rows must equal the
# per-repository minimum date computed independently.
expected = df.groupby("full_name_of_repo", dropna=False)["date"].min()
actual = df.groupby("full_name_of_repo", dropna=False)["birthday"].first()

assert expected.equals(actual), "ERROR"

In [10]:
# Age of the repository at each row's `date`, as a float number of days since
# the repository's birthday. Dividing the timedelta array by a one-day
# timedelta yields plain floats (fractions of a day are kept; rows with a
# missing date or birthday become NaN).
df["age_in_days"] = (
    (df["date"] - df["birthday"]).to_numpy()
    / np.timedelta64(1, "D")
)

In [11]:
# Number of days used for one year when converting the year-based bucket
# bounds below into days (365.25 = average year length including leap days).
DAYS_PER_YEAR = 365.25

# Age buckets as (lower bound in years, upper bound in years, label).
# The lower bound is inclusive, the upper bound exclusive. The last entry is
# the open-ended bucket; 999 years acts as "no practical upper limit".
AGE_GROUPS_YEARS = [
    (0, 1, "0-1"),
    (1, 2, "1-2"),
    (2, 3, "2-3"),
    (3, 4, "3-4"),
    (4, 5, "4-5"),
    (5, 6, "5-6"),
    (6, 7, "6-7"),
    (7, 8, "7-8"),
    (8, 9, "8-9"),
    (9, 10, "9-10"),
    (10, 11, "10-11"),
    (11, 12, "11-12"),
    (12, 13, "12-13"),
    (13, 14, "13-14"),
    (14, 15, "14-15"),
    # Fixed label: this bucket starts at 15 years, so it is "15+" (it used to
    # be mislabelled "14+", which overlapped in meaning with "14-15").
    (15, 999, "15+"),
]

def assign_age_group(age_in_days: float) -> str:
    """Map a repository age in days to the label of its age bucket.

    Parameters
    ----------
    age_in_days:
        Age in days; may be NaN.

    Returns
    -------
    str
        The label of the first bucket in ``AGE_GROUPS_YEARS`` whose
        ``[lo, hi)`` range (converted to days) contains the age, or
        ``"unknown"`` when the age is missing or falls outside every bucket
        (for example a negative age).
    """
    if pd.isna(age_in_days):
        return "unknown"
    for lo, hi, label in AGE_GROUPS_YEARS:
        if lo * DAYS_PER_YEAR <= age_in_days < hi * DAYS_PER_YEAR:
            return label
    return "unknown"

# Label every row with the age bucket of its repository at that row's date.
df["repo_age_group_at_commit"] = df["age_in_days"].map(assign_age_group)

# Show how many rows fall into each bucket (missing labels would be counted too).
df["repo_age_group_at_commit"].value_counts(dropna=False)

repo_age_group_at_commit
0-1      6759
1-2      2609
2-3      2077
3-4      1389
4-5      1179
5-6       868
6-7       795
7-8       674
8-9       514
9-10      336
10-11     253
11-12     120
12-13     105
13-14      13
14-15       3
Name: count, dtype: int64

In [12]:
# Per-subject table: one row per (repo, commit, path) without the
# repository-level metadata, which is aggregated separately per repository.
#
# `errors="ignore"` skips any listed column that is absent, so one `drop` call
# replaces the former per-column loop (which rebuilt the frame once per removed
# column). `.copy()` keeps `subjects` independent of `df`.
#
# NOTE(review): this assignment rebinds the name `subjects`, which the path
# helper cell defined as a function; that helper is no longer callable after
# this cell has run.
subjects = df.drop(
    columns=[
        "id",
        "homepage",
        "pushed_at",
        "updated_at",
        "has_discussions",
        "has_issues",
        "has_pages",
        "has_projects",
        "has_wiki",
        "forks_count",
        "open_issues_count",
        "stargazers_count",
        "subscribers_count",
        "size",
        "birthday",
    ],
    errors="ignore",
).copy()

In [13]:
# Order the subjects by repository, then date, with commit and path as
# tie-breakers (all ascending), and renumber the rows 0..n-1.
subjects = subjects.sort_values(
    by=["full_name_of_repo", "date", "commit_sha", "path"],
    ascending=True,
    ignore_index=True,
)

# Preview the first rows of the sorted table.
subjects.head(30)

Unnamed: 0,full_name_of_repo,commit_sha,path,is_ccdc_event,date,detected_channels,age_in_days,repo_age_group_at_commit
0,05bit/peewee-async,fc8edc1d35cbda8477ceb4dd672d2ed115b18635,README.md,False,2014-09-27 22:15:28+00:00,(),0.0,0-1
1,05bit/peewee-async,03d7da284e47d3018e071995e2d9cd7a8625f41e,README.md,True,2014-09-28 13:44:56+00:00,"(issues,)",0.645463,0-1
2,05bit/peewee-async,53130218e08eaacf34f3d68338bb008fdfeb6c72,README.md,False,2014-09-28 13:49:37+00:00,(),0.648715,0-1
3,05bit/peewee-async,c90ae2ca4a4d6cdac078e9918c53e9ea0ddcb632,README.md,False,2014-09-29 10:33:36+00:00,(),1.512593,0-1
4,05bit/peewee-async,4db3f204d4ee60a91146e27d3d82b9edfc1086c2,README.md,False,2014-10-11 09:08:29+00:00,(),13.453484,0-1
5,05bit/peewee-async,f546a0d2c41b2c29f718ec4f6459c44eedd00983,README.md,False,2014-10-11 12:07:44+00:00,(),13.577963,0-1
6,05bit/peewee-async,95506301a3c0279812ee6802e30f1741435c5aca,README.md,False,2014-10-11 12:11:48+00:00,(),13.580787,0-1
7,05bit/peewee-async,eead5b75cf1ef4c06cfce3d09184230b4e64ffa8,README.md,False,2014-10-11 14:38:03+00:00,(),13.68235,0-1
8,05bit/peewee-async,7bce99a67f17685aeec8f756908f8ad19a2cd82e,README.md,False,2014-10-11 15:09:02+00:00,(),13.703866,0-1
9,05bit/peewee-async,0b25d439c214c32d531928a5114593a6db7762b8,README.md,False,2014-10-11 15:39:33+00:00,(),13.725058,0-1


In [14]:
# Per-repository summary: subject counts, CCDC event count and rate (missing
# flags count as False), plus the first value of each repository-level
# metadata column.
repos = (
    df.groupby("full_name_of_repo")
    .agg(
        **{
            "id": ("id", "first"),
            "n_subjects": ("commit_sha", "count"),
            "n_ccdc_events": ("is_ccdc_event", lambda s: int(s.fillna(False).sum())),
            "ccdc_rate": ("is_ccdc_event", lambda s: float(s.fillna(False).mean())),
            # Remaining columns are simply carried over from the first row of
            # each repository, in this order.
            **{
                name: (name, "first")
                for name in (
                    "birthday",
                    "pushed_at",
                    "updated_at",
                    "homepage",
                    "has_discussions",
                    "has_issues",
                    "has_pages",
                    "has_projects",
                    "has_wiki",
                    "forks_count",
                    "open_issues_count",
                    "stargazers_count",
                    "subscribers_count",
                    "size",
                )
            },
        }
    )
    .reset_index()
)

# Sanity check: the per-repository subject counts must add up to the number of
# rows in the subjects table.
expected = len(subjects)
actual = repos["n_subjects"].sum()
assert expected == actual, "ERROR"

## Descriptive Statistics

At this point, we start analyzing the data…

In [15]:
repos = repos.sort_values("birthday")

repos.head(30)

Unnamed: 0,full_name_of_repo,id,n_subjects,n_ccdc_events,ccdc_rate,birthday,pushed_at,updated_at,homepage,has_discussions,has_issues,has_pages,has_projects,has_wiki,forks_count,open_issues_count,stargazers_count,subscribers_count,size
169,gctools-outilsgc/gcconnex,22766545,102,31,0.303922,2008-08-04 11:09:52+00:00,2025-10-24 17:50:40+00:00,2026-01-12 18:35:33+00:00,,False,True,False,True,True,43,317,49,22,121949
122,couchrest/couchrest_model,732196,81,10,0.123457,2009-01-11 10:27:22+00:00,2024-05-09 12:24:18+00:00,2025-06-29 02:50:55+00:00,,False,True,False,True,True,115,52,302,14,4758
241,mooz/js2-mode,563066,53,6,0.113208,2010-03-15 12:46:13+00:00,2024-12-05 01:40:12+00:00,2026-01-13 11:16:08+00:00,,False,True,False,True,True,180,63,1335,43,2316
302,senchalabs/jQTouch,745502,47,3,0.06383,2010-06-30 05:23:34+00:00,2021-02-18 21:43:34+00:00,2026-01-08 19:30:22+00:00,http://www.jqtouch.com/,False,True,True,True,True,585,61,2775,120,54399
206,kiegroup/jbpm,930571,9,1,0.111111,2010-09-23 10:59:18+00:00,2025-12-17 14:20:21+00:00,2026-01-11 14:41:59+00:00,http://www.jbpm.org,False,False,False,True,True,1212,47,1712,186,336696
360,zdavatz/oddb.org,7040784,33,5,0.151515,2010-12-21 08:34:07+00:00,2026-01-12 18:04:13+00:00,2026-01-12 18:04:17+00:00,https://ch.oddb.org,False,True,False,True,True,9,14,10,4,113712
306,simonmichael/hledger,9301414,155,68,0.43871,2011-04-18 17:32:39+00:00,2026-01-13 05:27:20+00:00,2026-01-13 08:50:27+00:00,https://hledger.org,False,True,False,False,False,352,340,3701,41,112458
188,humanoid-path-planner/hpp-core,18137993,5,2,0.4,2011-07-19 19:09:29+00:00,2026-01-12 17:31:18+00:00,2026-01-12 17:31:22+00:00,https://humanoid-path-planner.github.io/hpp-doc/,False,True,False,True,True,33,7,43,10,6200
223,localeapp/localeapp,2222347,39,12,0.307692,2011-09-06 13:49:21+00:00,2023-10-17 03:14:05+00:00,2025-10-24 10:16:43+00:00,http://www.localeapp.com,False,True,True,True,True,52,11,239,9,814
272,plasma-umass/doppio,3434927,86,4,0.046512,2012-02-13 23:12:52+00:00,2022-12-06 23:14:10+00:00,2025-12-16 11:37:57+00:00,http://plasma-umass.github.io/doppio-demo,False,True,True,True,True,176,55,2182,71,77363
