In [2]:
# RQ1 – Predicting Pull Request Acceptance from Submission-Time Features

This notebook implements the **RQ1 pipeline** for the project:

> **RQ1:** How well can early PR signals (e.g., description clarity, diff size, files changed) predict acceptance or rejection?

It is designed to be:
- **Fully reproducible**
- **Heavily documented**
- **Easy to read for new contributors**

We:

1. Load a **row-per-PR feature table** (precomputed from the AIDev dataset).
2. Select **submission-time features only** (no leakage).
3. Train and evaluate **5 ML models**:
   - Logistic Regression
   - Random Forest
   - Gradient Boosting
   - Extra Trees
   - MLP (Simple neural network)
4. Add **baselines**:
   - Majority classifier
   - Simple heuristic rule
5. Run **5-fold stratified cross-validation** and compute:
   - Accuracy, Precision, Recall, F1, ROC–AUC
6. Produce a **confusion matrix** for the best model and extract false positives / false negatives for qualitative analysis.


SyntaxError: invalid character 'â€“' (U+2013) (844362078.py, line 26)

In [3]:
# ============================================================
# 0. Imports and Global Configuration
# ============================================================

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn utilities
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.utils import resample

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier
)
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
# Regression metrics used by reg_report
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Notebook-wide seed constant; also used to seed NumPy's legacy global RNG.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)


## 1. Load Feature Table

We assume that Phase 2 already produced a **feature table** with:

- One row per pull request (PR)
- One column for the **binary acceptance label** (e.g., `label_merged` where 1 = merged, 0 = closed)
- Multiple columns for **submission-time features**, such as:
  - Text features: body length, title length, presence of URLs, etc.
  - Diff features: lines added, lines deleted, files touched, churn ratio, etc.
  - Repo features: stars, forks, language, etc.
  - Temporal features: weekday, hour of creation, etc.

👉 **You must edit the file path and column names below to match your actual dataset.**


In [4]:
# ============================================================
# 1. Load the feature table
# ============================================================

# TODO: change this to your actual CSV or Parquet file
DATA_PATH = Path("data/aiddev_pr_features.csv")  # <- EDIT THIS

# Fail early with an actionable message instead of a bare pandas traceback
# when the placeholder path has not been edited yet.
if not DATA_PATH.exists():
    raise FileNotFoundError(
        f"Feature table not found at '{DATA_PATH.resolve()}'. "
        "Edit DATA_PATH to point at your CSV or Parquet feature table."
    )

# Choose the reader from the file extension so that the Parquet option
# mentioned in the TODO works; every other extension is read as CSV.
if DATA_PATH.suffix.lower() in {".parquet", ".pq"}:
    df = pd.read_parquet(DATA_PATH)
else:
    df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/aiddev_pr_features.csv'

In [18]:
#2) Utility: train/test split + metrics

In [3]:
def split_xy(X, y, test_size=0.3, stratify=True, seed=42):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split``. The split is stratified on
    ``y`` only when ``stratify`` is truthy AND ``y`` holds exactly two
    distinct values; in every other case no stratification is requested.

    Parameters
    ----------
    X, y : passed straight through to ``train_test_split``.
    test_size : forwarded as ``test_size`` (default 0.3).
    stratify : switch for the binary-only stratification described above.
    seed : forwarded as ``random_state``.

    Returns
    -------
    Whatever ``train_test_split(X, y, ...)`` returns; callers in this
    notebook unpack it as ``X_tr, X_te, y_tr, y_te``.
    """
    strata = None
    if stratify and len(np.unique(y)) == 2:
        strata = y
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=seed,
        stratify=strata,
    )

def cls_report(y_true, y_pred, y_prob=None):
    """Collect classification metrics into a dict.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels.
    y_prob : optional scores handed to ``roc_auc_score``; when ``None`` the
        ``'auc'`` entry is omitted.

    Returns
    -------
    dict with keys ``'acc'`` and ``'f1'`` (in that order), plus ``'auc'``
    when ``y_prob`` is given.
    """
    metrics = {
        'acc': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }
    if y_prob is None:
        return metrics
    metrics['auc'] = roc_auc_score(y_true, y_prob)
    return metrics

def reg_report(y_true, y_pred):
    """Collect regression error metrics into a dict.

    Returns
    -------
    dict with ``'mae'`` (result of ``mean_absolute_error``) and ``'rmse'``
    (square root of the result of ``mean_squared_error``).
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    return {
        'mae': abs_err,
        'rmse': np.sqrt(sq_err),
    }


In [19]:
#3) RQ1 — Acceptance (Merged vs Closed)

In [20]:
#3a) Pooled

In [4]:
# Hold out a test partition of the pooled RQ1 data (split_xy defaults).
X_tr, X_te, y_tr, y_te = split_xy(X_rq1, y_rq1)

# Logistic regression; class_weight='balanced' helps with class imbalance
# within the pooled data.
lr = LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=None)
lr.fit(X_tr, y_tr)
p_lr = lr.predict(X_te)
# Second predict_proba column is used as the score passed to roc_auc_score.
proba_lr = lr.predict_proba(X_te)[:, 1]
pooled_lr = cls_report(y_te, p_lr, proba_lr)

# Random forest, evaluated on the same split.
rf = RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced')
rf.fit(X_tr, y_tr)
p_rf = rf.predict(X_te)
proba_rf = rf.predict_proba(X_te)[:, 1]
pooled_rf = cls_report(y_te, p_rf, proba_rf)

# One row per model, tagged with the group name so it can be concatenated
# with the per-group tables later in the notebook.
pooled_rq1 = pd.DataFrame([
    {'group': 'pooled', 'model': 'LogReg', **pooled_lr},
    {'group': 'pooled', 'model': 'RF', **pooled_rf},
])
pooled_rq1


Unnamed: 0,group,model,acc,f1,auc
0,pooled,LogReg,0.771806,0.866579,0.61702
1,pooled,RF,0.821682,0.900686,0.557339


In [21]:
#3b) Per-group (agent / human)

In [5]:
def run_group_cls(mask):
    """Train and score LogReg and RF on the rows of one contributor group.

    Reads the notebook globals ``df_rq1``, ``X_rq1`` and ``y_rq1``.

    Parameters
    ----------
    mask : object supporting ``.loc[df_rq1.index]`` whose result is used to
        filter ``df_rq1`` (presumably a boolean Series — verify at caller).

    Returns
    -------
    ``None`` when fewer than 300 rows are selected; otherwise a DataFrame
    with one row per model (``'LogReg'`` then ``'RF'``) holding ``'model'``
    plus the keys produced by ``cls_report``.
    """
    group_idx = df_rq1[mask.loc[df_rq1.index]].index
    if len(group_idx) < 300:
        return None  # skip tiny groups

    X_group = X_rq1.loc[group_idx]
    y_group = y_rq1.loc[group_idx]
    X_tr, X_te, y_tr, y_te = split_xy(X_group, y_group)

    # Same estimator settings as the pooled run; fitted in this order.
    candidates = [
        ('LogReg', LogisticRegression(max_iter=1000, class_weight='balanced')),
        ('RF', RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced')),
    ]

    rows = []
    for label, clf in candidates:
        clf.fit(X_tr, y_tr)
        y_hat = clf.predict(X_te)
        y_score = clf.predict_proba(X_te)[:, 1]
        rows.append(dict(model=label, **cls_report(y_te, y_hat, y_score)))
    return pd.DataFrame(rows)

# Per-group metrics; run_group_cls returns None for groups under its size cutoff.
agent_rows = run_group_cls(is_agent)
human_rows = run_group_cls(is_human)

# Tag each available table with its group name as the first column.
for group_name, group_rows in (('agent', agent_rows), ('human', human_rows)):
    if group_rows is not None:
        group_rows.insert(0, 'group', group_name)

# Stack pooled + per-group results, persist them, and display the table.
rq1_groups = pd.concat(
    [pooled_rq1, agent_rows, human_rows],
    ignore_index=True,
)
rq1_groups.to_csv(OUT/"rq1_metrics_groups.csv", index=False)
rq1_groups


Unnamed: 0,group,model,acc,f1,auc
0,pooled,LogReg,0.771806,0.866579,0.61702
1,pooled,RF,0.821682,0.900686,0.557339
2,agent,LogReg,0.771423,0.866336,0.61738
3,agent,RF,0.821989,0.900885,0.557868
4,human,LogReg,0.615176,0.735666,0.57723
5,human,RF,0.78374,0.873694,0.588314


In [22]:
#3c) Balanced comparison (downsample agents to human count)

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# ---- 3c) Balanced (downsample agents to human count) — with imputation ----
# Assemble contributor type, label and features into one frame.
# All three pieces are put on the same fresh 0..n-1 index before the
# column-wise concat: pd.concat(axis=1) aligns on index labels, so mixing
# df_rq1's original index with a reset X_rq1 index would misalign rows and
# pad them with NaN whenever df_rq1 is not already 0..n-1.
# NOTE(review): this assumes df_rq1, y_rq1 and X_rq1 are in the same row
# order (the original code already relied on that for `y`) — confirm.
rq1_df = df_rq1[['contrib_type']].reset_index(drop=True)
rq1_df['y'] = y_rq1.values
rq1_df = pd.concat([rq1_df, X_rq1.reset_index(drop=True)], axis=1)

# Feature columns come from X_rq1 (the frame concatenated above), not from
# the unrelated global `X`, so the selection always matches rq1_df.
feature_names = list(X_rq1.columns)

human_subset = rq1_df[rq1_df['contrib_type']=='human']
agent_subset = rq1_df[rq1_df['contrib_type']=='agent']

if len(human_subset) > 300 and len(agent_subset) > 300:
    # Downsample agents (without replacement) to the number of human rows.
    agent_down = resample(agent_subset, n_samples=len(human_subset), replace=False, random_state=42)
    balanced = pd.concat([human_subset, agent_down], ignore_index=True)

    # Features/labels
    Xb = balanced[feature_names].copy()
    yb = balanced['y'].astype(int)

    # replace inf -> NaN, then impute
    Xb = Xb.replace([np.inf, -np.inf], np.nan)

    # split
    X_tr, X_te, y_tr, y_te = split_xy(Xb, yb)

    # Pipelines with imputer (median fitted on the training partition only).
    # NOTE(review): unlike the pooled / per-group runs, these estimators do
    # not set class_weight='balanced'; metrics are therefore not strictly
    # comparable across tables — confirm this is intended.
    lr_pipe = make_pipeline(SimpleImputer(strategy='median'),
                            LogisticRegression(max_iter=1000))
    rf_pipe = make_pipeline(SimpleImputer(strategy='median'),
                            RandomForestClassifier(n_estimators=400, random_state=42))

    # fit/predict
    lr_pipe.fit(X_tr, y_tr)
    p = lr_pipe.predict(X_te)
    proba = lr_pipe.predict_proba(X_te)[:, 1]
    bal_lr = cls_report(y_te, p, proba)

    rf_pipe.fit(X_tr, y_tr)
    p = rf_pipe.predict(X_te)
    proba = rf_pipe.predict_proba(X_te)[:, 1]
    bal_rf = cls_report(y_te, p, proba)

    rq1_bal = pd.DataFrame([
        dict(group='balanced(agent=human)', model='LogReg', **bal_lr),
        dict(group='balanced(agent=human)', model='RF', **bal_rf)
    ])
    rq1_bal.to_csv(OUT/"rq1_metrics_balanced.csv", index=False)
    display(rq1_bal)
else:
    print("Not enough human or agent samples for balanced comparison.")


Unnamed: 0,group,model,acc,f1,auc
0,balanced(agent=human),LogReg,0.8729,0.932137,0.570903
1,balanced(agent=human),RF,0.86748,0.928955,0.609358


In [23]:
#3d) Feature importance (pooled RF)

In [8]:
# Fit a forest on all pooled RQ1 rows (no hold-out) purely to read its
# feature_importances_.
rf_full = RandomForestClassifier(n_estimators=600, random_state=42, class_weight='balanced')
rf_full.fit(X_rq1, y_rq1)

# Keep the 20 largest importances, labelled by feature column name.
imps = (
    pd.Series(rf_full.feature_importances_, index=X_rq1.columns)
    .sort_values(ascending=False)
    .head(20)
)
imps.to_csv(OUT/"feature_importance_acceptance_top20.csv")

# Bar chart is written to disk and then closed, so nothing renders inline.
ax = imps.plot(kind='bar', figsize=(10,4))
ax.set_title("Top-20 Feature Importances (Acceptance)")
plt.tight_layout()
plt.savefig(OUT/"feature_importance_acceptance_top20.png", dpi=200)
plt.close()


In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
)

# ------------------------------------------------------------------
# 1) Define feature columns and targets (adapt to your DataFrame)
# ------------------------------------------------------------------

# Every column of `df` that is not one of the label columns is a feature.
label_cols = ["label_merged", "review_comments", "time_to_merge_hours"]
feature_cols = df.columns[~df.columns.isin(label_cols)].tolist()

X = df.loc[:, feature_cols]

# Targets: one classification label (RQ1) and two regression targets (RQ2).
y_rq1 = df["label_merged"].astype(int)          # 0 = closed, 1 = merged
y_rq2_comments = df["review_comments"].astype(float)
y_rq2_ttm = df["time_to_merge_hours"].astype(float)

# Simple version: treat every feature as numeric. `numeric_features` is the
# same list object as `feature_cols`; fill `categorical_features` if needed.
numeric_features, categorical_features = feature_cols, []

# Scale the numeric columns; any column not listed in a transformer is dropped.
numeric_step = ("num", StandardScaler(), numeric_features)
preprocessor = ColumnTransformer(
    transformers=[
        numeric_step,
        # ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop",
)


NameError: name 'df' is not defined