In [None]:
# Phase 2 â€” RQ2 (Self-Healing): Review Effort Prediction

This notebook is **self-healing**: if your review-comments target is missing, it reconstructs it from the raw files.

**What it does**
1. **Robust load** with a local cache (fixes timeouts from synced drives).
2. **Rebuild `n_review_comments`** by mapping review rows to PRs (via `commit_id â†’ sha â†’ pr_id`, and as a fallback using `pull_request_url`).
3. **Type-safe merge** into your modeling table.
4. Feature engineering (titles, churn, repo stats, simple task type).
5. **RQ2a**: Predict number of review comments.
6. **RQ2b**: Predict time-to-merge (hours).
7. Pooled, group-wise (agent vs human), and balanced (downsample agent to human).

> If your paths differ, change `BASE_PATH` in the next cell.


In [14]:
from pathlib import Path

BASE_PATH = Path("/Users/kartik/Desktop/aidev-phase2")  # <--- EDIT if needed
DATA = BASE_PATH / "data"
OUT  = BASE_PATH / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

print("BASE_PATH:", BASE_PATH)
print("DATA:", DATA)
print("OUT:", OUT)


BASE_PATH: /Users/kartik/Desktop/aidev-phase2
DATA: /Users/kartik/Desktop/aidev-phase2/data
OUT: /Users/kartik/Desktop/aidev-phase2/outputs


In [15]:
import os, shutil
import pandas as pd
from pathlib import Path

CACHE = Path("/tmp/aidev_phase2_cache")
CACHE.mkdir(parents=True, exist_ok=True)

def first_existing(paths):
    for p in paths:
        if p.exists():
            return p
    return None

def cached_chunk_read_csv(src_path: Path, usecols, chunksize=500_000):
    """
    Copy to local cache if needed, then read in chunks keeping only `usecols`.
    """
    local_copy = CACHE / src_path.name
    if (not local_copy.exists()) or (os.path.getmtime(local_copy) < os.path.getmtime(src_path)):
        print(f"Copying to local cache: {local_copy}")
        shutil.copy2(src_path, local_copy)
    else:
        print(f"Using cached file: {local_copy}")

    chunks = []
    try:
        for ck in pd.read_csv(local_copy, low_memory=True, chunksize=chunksize,
                              usecols=lambda c: c in usecols):
            chunks.append(ck)
    except TypeError:
        for ck in pd.read_csv(local_copy, low_memory=True, chunksize=chunksize):
            chunks.append(ck[usecols])

    df = pd.concat(chunks, ignore_index=True)
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
    return df


In [16]:
import pandas as pd

fixed_csv = OUT / "modeling_table_fixed.csv"
combo_csv = OUT / "modeling_table_combined.csv"

if fixed_csv.exists():
    df = pd.read_csv(fixed_csv, low_memory=False)
    src_used = fixed_csv
    print("Loaded FIXED:", fixed_csv)
elif combo_csv.exists():
    df = pd.read_csv(combo_csv, low_memory=False)
    src_used = combo_csv
    print("Loaded COMBINED:", combo_csv)
else:
    raise FileNotFoundError("Missing outputs/modeling_table_fixed.csv and modeling_table_combined.csv")

print("df shape:", df.shape)
print(df.head(2))


Loaded FIXED: /Users/kartik/Desktop/aidev-phase2/outputs/modeling_table_fixed.csv
df shape: (939409, 19)
   pr_id      repo_id                                              title  \
0   1688  843988367.0  `metta code` --> `metta clip` and additional p...   
1     41  992063465.0  feat: Comprehensive ruff error resolution with...   

                                                body   state  \
0  Remove unused `root_key` variable to fix ruff ...  closed   
1  ## ðŸŽ¯ Mission Accomplished: 100% Ruff Error Res...    open   

             created_at             merged_at             closed_at  \
0  2025-07-25T18:15:36Z  2025-07-25T19:17:23Z  2025-07-25T19:17:23Z   
1  2025-07-25T18:17:57Z                   NaN                   NaN   

         agent  loc_added  loc_deleted  files_touched  stars  forks  y_accept  \
0  Claude_Code        0.0          0.0            0.0   72.0   32.0       1.0   
1  Claude_Code        0.0          0.0            0.0    0.0    0.0       NaN   

   hours_to_

In [18]:
# --- Normalize key columns correctly (Series -> .str.strip) ---
for c in ["commit_id", "original_commit_id", "pull_request_url"]:
    if c in rev.columns:
        # robust: coerce to pandas 'string' dtype, strip whitespace; keep NaN as <NA>
        rev[c] = rev[c].astype("string").str.strip()

diff = diff.dropna().drop_duplicates()

# sha could be mixed/float; force to string and strip
diff["sha"] = diff["sha"].astype("string").str.strip()


In [19]:
# Map commit_id -> pr_id via sha
rev_join = rev.merge(diff, left_on="commit_id", right_on="sha", how="left", suffixes=("","_sha1"))

# Fallback: original_commit_id
if "original_commit_id" in rev.columns:
    rev_join = rev_join.merge(diff, left_on="original_commit_id", right_on="sha", how="left", suffixes=("","_sha2"))

# Prefer commit match; fallback to original_commit_id match
pr_from_sha = rev_join["pr_id"].copy()
if "pr_id_sha2" in rev_join.columns:
    pr_from_sha = pr_from_sha.fillna(rev_join["pr_id_sha2"])

# Fallback: extract /pull/<num> from pull_request_url
import numpy as np, re, pandas as pd
prnum_from_url = pd.Series(index=rev_join.index, dtype="Int64")
if "pull_request_url" in rev_join.columns:
    def extract_prnum(url):
        m = re.search(r"/pull/(\\d+)", str(url))
        return int(m.group(1)) if m else None
    prnum_from_url = pd.Series([extract_prnum(u) for u in rev_join["pull_request_url"]], dtype="Int64")

# Final reconstructed pr_id (Int64)
rev_join["pr_id_rec"] = pd.to_numeric(pr_from_sha, errors="coerce").astype("Int64")
rev_join.loc[rev_join["pr_id_rec"].isna(), "pr_id_rec"] = prnum_from_url
print("Reconstructed PR IDs for review rows:", rev_join["pr_id_rec"].notna().sum(), "of", len(rev_join))

# Aggregate comments per PR
if "created_at" in rev_join.columns:
    agg_rev = (
        rev_join.dropna(subset=["pr_id_rec"])
                .groupby("pr_id_rec")
                .agg(n_review_comments=("id","count"),
                     first_comment_at=("created_at","min"))
                .reset_index()
                .rename(columns={"pr_id_rec":"pr_id"})
    )
else:
    agg_rev = (
        rev_join.dropna(subset=["pr_id_rec"])
                .groupby("pr_id_rec")
                .agg(n_review_comments=("id","count"))
                .reset_index()
                .rename(columns={"pr_id_rec":"pr_id"})
    )
    agg_rev["first_comment_at"] = pd.NaT

# Merge with consistent dtype
df['pr_id']       = pd.to_numeric(df['pr_id'], errors='coerce').astype('Int64')
agg_rev['pr_id']  = pd.to_numeric(agg_rev['pr_id'], errors='coerce').astype('Int64')

df = df.drop(columns=[c for c in ["n_review_comments","first_comment_at"] if c in df.columns], errors="ignore")
df = df.merge(agg_rev, on="pr_id", how="left", validate="m:1")
df["n_review_comments"] = pd.to_numeric(df["n_review_comments"], errors="coerce").fillna(0)

print("Non-zero review PRs after merge:", (df["n_review_comments"] > 0).sum())
print(df["n_review_comments"].describe())


Reconstructed PR IDs for review rows: 19169 of 19670
Non-zero review PRs after merge: 0
count    939409.0
mean          0.0
std           0.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max           0.0
Name: n_review_comments, dtype: float64
