In [1]:
import pandas as pd

# Load the dataset that the rest of the notebook works on; the path is relative
# to the notebook's working directory.
# NOTE(review): output further down shows an "Unnamed: 0" column in this data, which
# suggests the CSV was written with its index. `index_col=0` would avoid that, but the
# correlation cell drops "Unnamed: 0" by name, so confirm before changing either.
full_data = pd.read_csv("everything100percat.csv")

In [2]:

def calibrate_thresholds(
    df: pd.DataFrame,
    baseline_mask: pd.Series,
    features_params: dict
) -> dict:
    """
    Compute trigger thresholds for each feature based on a baseline window.

    For every feature the threshold is the requested percentile of that
    feature's values over the baseline rows. The direction only decides the
    default percentile (and is passed through for the voting step); the
    quantile computation itself is the same for both directions.

    Args:
      df             : full DataFrame with features
      baseline_mask  : boolean Series marking baseline rows (e.g., pre‑ChatGPT)
      features_params: dict mapping feature names to:
                       {
                         "direction": "greater" or "less",
                         "percentile": e.g. 95 (for greater) or 5 (for less);
                                       optional, defaults to 95 / 5
                       }
    Returns:
      thresholds: dict mapping feature to {"threshold": float, "direction": str}

    Raises:
      ValueError: if a direction is not "greater"/"less" or a percentile is
                  outside [0, 100].
      KeyError  : if a feature is not a column of ``df`` or its params lack
                  "direction".
    """
    # Default percentile per direction; also serves as the set of valid directions.
    default_percentile = {"greater": 95, "less": 5}

    baseline_df = df.loc[baseline_mask]
    thresholds = {}
    for feat, params in features_params.items():
        direction = params["direction"]
        if direction not in default_percentile:
            # An unknown direction would otherwise produce a rule that can never fire.
            raise ValueError(
                f"Feature {feat!r}: direction must be 'greater' or 'less', got {direction!r}"
            )

        percentile = params.get("percentile", default_percentile[direction])
        if not 0 <= percentile <= 100:
            raise ValueError(
                f"Feature {feat!r}: percentile must be within [0, 100], got {percentile!r}"
            )

        if feat not in baseline_df.columns:
            raise KeyError(f"Feature {feat!r} is not a column of the DataFrame")

        # Series.quantile expects a fraction in [0, 1]; NaNs in the baseline are skipped.
        thr = baseline_df[feat].quantile(percentile / 100)
        thresholds[feat] = {"threshold": thr, "direction": direction}
    return thresholds

def rule_based_flag(
    row: pd.Series,
    thresholds: dict,
    min_votes: int
) -> bool:
    """
    Decide whether a single revision is flagged, by counting triggered rules.

    A rule contributes one vote when the row's value for that feature lies
    strictly beyond the rule's threshold in the rule's direction. Features
    that are absent from the row or hold a missing value contribute nothing.

    Args:
      row        : one DataFrame row of features
      thresholds : dict from calibrate_thresholds()
      min_votes  : minimum number of triggered rules to label as AI

    Returns:
      True if AI‑assisted (votes >= min_votes), else False
    """
    def fires(feature, rule):
        # Absent or missing observations never trigger a rule.
        observed = row.get(feature, None)
        if observed is None or pd.isna(observed):
            return False
        kind = rule["direction"]
        if kind == "greater":
            return observed > rule["threshold"]
        if kind == "less":
            return observed < rule["threshold"]
        # Any other direction is ignored.
        return False

    tally = sum(1 for feature, rule in thresholds.items() if fires(feature, rule))
    return tally >= min_votes

def apply_rule_based_flags(
    df: pd.DataFrame,
    thresholds: dict,
    min_votes: int
) -> pd.Series:
    """
    Vectorized application: returns a boolean Series of AI flags.

    Each rule is evaluated once per column instead of once per row. The voting
    semantics are the per-row ones: a rule votes for a row when the row's value
    is strictly beyond the threshold in the rule's direction; rows with a
    missing value, features that are not columns of ``df``, and rules whose
    direction is neither "greater" nor "less" contribute no vote.

    Args:
      df         : DataFrame with one row per revision
      thresholds : dict mapping feature to {"threshold": ..., "direction": ...}
      min_votes  : minimum number of triggered rules to label as AI

    Returns:
      Boolean Series aligned to ``df.index`` (empty and bool-typed when ``df``
      has no rows); True where votes >= min_votes.
    """
    votes = pd.Series(0, index=df.index, dtype="int64")
    for feat, params in thresholds.items():
        if feat not in df.columns:
            continue
        direction = params["direction"]
        if direction not in ("greater", "less"):
            continue

        column = df[feat]
        # Compare only the non-missing entries so None/NaN can never raise or vote.
        present = column.notna().to_numpy()
        observed = column[present]
        if direction == "greater":
            triggered = observed > params["threshold"]
        else:
            triggered = observed < params["threshold"]

        # Scatter the comparison results back by position (robust to a non-unique index).
        fired = pd.Series(False, index=df.index)
        fired[present] = triggered.to_numpy(dtype=bool)
        votes = votes + fired.astype("int64")
    return votes >= min_votes

# --- Example Usage ---

# 1) Define which features to include and how to calibrate.
#    "greater" rules fire above the baseline percentile, "less" rules below it.
features_params = {
    "lexical_spike_delta": {"direction": "greater", "percentile": 95},
    "perplexity":          {"direction": "less",    "percentile": 5},
    "citation_delta":      {"direction": "less",    "percentile": 5},
    "burstiness":          {"direction": "less",    "percentile": 5},
    "nTTR":                {"direction": "less",    "percentile": 5},
}

# 2) Identify baseline window (e.g., pre‑ChatGPT).
#    The timestamps come out of the CSV as text, so parse them and compare
#    chronologically instead of relying on lexicographic string ordering.
#    Values that cannot be parsed become NaT and fall outside the baseline.
BASELINE_CUTOFF = pd.Timestamp("2022-11-01", tz="UTC")
revision_times = pd.to_datetime(
    full_data["timestamp"], utc=True, errors="coerce", format="ISO8601"
)
baseline_mask = revision_times < BASELINE_CUTOFF
# Quantiles of an empty baseline are NaN, which would silently disable every rule.
assert baseline_mask.any(), "Baseline window is empty; thresholds cannot be calibrated"

# 3) Calibrate thresholds from baseline
thresholds = calibrate_thresholds(full_data, baseline_mask, features_params)

# 4) Choose minimum votes (e.g., 2 out of 5 signals)
min_votes = 2

# 5) Flag revisions
full_data["ai_flag"] = apply_rule_based_flags(full_data, thresholds, min_votes)

# Inspect thresholds and AI‐flagged count
print("Thresholds per feature:", thresholds)
print("AI‐flagged revisions:", full_data["ai_flag"].sum())



Thresholds per feature: {'lexical_spike_delta': {'threshold': np.float64(0.03160831639765665), 'direction': 'greater'}, 'perplexity': {'threshold': np.float64(219.4595947265625), 'direction': 'less'}, 'citation_delta': {'threshold': np.float64(0.0006215535705276602), 'direction': 'less'}, 'burstiness': {'threshold': np.float64(0.0584613878072789), 'direction': 'less'}, 'nTTR': {'threshold': np.float64(5.700198591836499), 'direction': 'less'}}
AI‐flagged revisions: 654


In [3]:
# Save the results
# Writes the current `full_data` frame to CSV in the working directory;
# index=False keeps the row index out of the file.
full_data.to_csv("everything100percat_with_ai_flags.csv", index=False)

In [2]:
# Reload the flagged dataset from disk; this replaces the in-memory `full_data`.
# NOTE(review): this cell's execution count (In [2]) is out of sequence with the
# cells around it — re-run the notebook top to bottom to confirm the outputs.
full_data = pd.read_csv("everything100percat_with_ai_flags.csv")
# Check the first few rows
print(full_data.head())


   Unnamed: 0                snapshot_ts      rev_id  \
0           0  2023-06-30 00:00:00+00:00  1160763005   
1           1  2023-07-31 00:00:00+00:00  1166013433   
2           2  2023-08-31 00:00:00+00:00  1171485250   
3           3  2023-09-30 00:00:00+00:00  1177319025   
4           4  2023-10-31 00:00:00+00:00  1181899435   

                   timestamp                                   user  is_bot  \
0  2023-06-18 16:30:41+00:00  2601:483:C301:7360:BC05:287F:176:F15F   False   
1  2023-07-18 21:17:52+00:00                             TompaDompa   False   
2  2023-08-21 11:13:21+00:00                           79.41.96.200   False   
3  2023-09-27 04:45:15+00:00                         187.254.98.237   False   
4  2023-10-25 22:06:39+00:00                              Idulatria   False   

                                             content  article_id  \
0  {{Use dmy dates|date=July 2021}}\n{{Short desc...       20951   
1  {{Use dmy dates|date=July 2021}}\n{{Short desc...

In [4]:
# Columns left out of the correlation (the index artifact, timestamps, raw text
# and other non-feature fields).
non_feature_columns = [
    'Unnamed: 0',
    'snapshot_ts',
    'timestamp',       # <- drop this!
    'content',
    'user',
    'title',
    'root',
    'stratum',
    'plain_text',
    'upos_props',
]

# Correlate every remaining column with the flag expressed as 0/1.
ai_flag_numeric = full_data['ai_flag'].astype(int)
candidate_features = full_data.drop(columns=non_feature_columns)
corrs = candidate_features.corrwith(ai_flag_numeric)

# Show the ten columns most positively correlated with the flag.
top_corrs = corrs.sort_values(ascending=False).head(10)
print(top_corrs)


ai_flag                1.000000
word_density           0.210328
lexical_spike_delta    0.187886
p_t                    0.187886
fog                    0.183399
perplexity             0.066466
chars_per_sent         0.062179
mean_dep_depth         0.035625
rev_id                 0.004840
is_bot                -0.005568
dtype: float64
