In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

In [4]:
# Load the feature table that the normalization cells below take as their input frame.
full_data = pd.read_csv("everything100percat.csv")


In [32]:
# Inspect the column dtypes of the loaded frame.
print(full_data.dtypes)

Unnamed: 0               int64
snapshot_ts             object
rev_id                   int64
timestamp               object
user                    object
is_bot                    bool
content                 object
article_id               int64
title                   object
root                    object
stratum                 object
plain_text              object
p_t                    float64
lexical_spike_delta    float64
perplexity             float64
burstiness             float64
upos_props              object
mean_dep_depth         float64
clause_ratio           float64
voice_ratio            float64
fre                    float64
fog                    float64
chars_per_sent         float64
sents_per_para         float64
nTTR                   float64
word_density           float64
avg_line_len           float64
citation_delta         float64
dtype: object


In [8]:
# === Cell: Normalization utilities (place near your other utils) ===


def compute_baseline_stats(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Build per-category baseline mean/std from rows strictly before baseline_end_date.

    Args:
        df: Frame holding the category column, the timestamp column and the features.
            It is not modified.
        feature_cols: Names of the numeric feature columns to summarise (non-empty).
        category_col: Column whose values define the groups.
        timestamp_col: Column with revision timestamps. Strings, tz-naive datetimes
            (interpreted as UTC) and tz-aware datetimes are all accepted; values
            that cannot be parsed become NaT and never count as baseline rows.
        baseline_end_date: Exclusive upper bound of the baseline window (UTC).
        min_baseline_rows_per_category: Categories with fewer baseline rows than
            this are left out of the result, to avoid junk std=0 / NaN artefacts.

    Returns:
        DataFrame indexed by category with flat columns `<feature>_mean` and
        `<feature>_std`, in the order of `feature_cols`.

    Raises:
        ValueError: If `feature_cols` is empty.
    """
    feature_cols = list(feature_cols)
    if not feature_cols:
        raise ValueError("feature_cols must contain at least one column name")

    # Always normalise to UTC-aware datetimes. A dtype test based on
    # np.issubdtype raises on tz-aware pandas dtypes, and a tz-naive column
    # cannot be compared against a tz-aware cutoff, so parse unconditionally.
    timestamps = pd.to_datetime(df[timestamp_col], utc=True, errors='coerce')
    cutoff = pd.to_datetime(baseline_end_date, utc=True)

    # Positional mask so the selection does not depend on index alignment.
    baseline_df = df[(timestamps < cutoff).to_numpy()]

    # Guard: keep only categories with enough baseline observations.
    rows_per_category = baseline_df.groupby(category_col).size()
    valid_cats = rows_per_category[rows_per_category >= min_baseline_rows_per_category].index
    baseline_df = baseline_df[baseline_df[category_col].isin(valid_cats)]

    # Per-category mean/std, with the (feature, stat) MultiIndex flattened
    # to '<feature>_<stat>'.
    stats = baseline_df.groupby(category_col)[feature_cols].agg(['mean', 'std'])
    stats.columns = ['_'.join(col) for col in stats.columns]
    return stats


def normalize_features_by_baseline(
        df: pd.DataFrame,
        feature_cols,
        baseline_stats: pd.DataFrame,
        category_col: str = 'root'
) -> pd.DataFrame:
    """
    Merge precomputed baseline stats and add `<feature>_zscore` columns.

    Args:
        df: Frame with `category_col` and every column in `feature_cols`.
            It is not modified.
        feature_cols: Feature columns to convert to z-scores.
        baseline_stats: Per-category stats with `<feature>_mean` and
            `<feature>_std` columns, keyed by `category_col` (as index level
            name or as a column).
        category_col: Column used to look up each row's baseline.

    Returns:
        A new frame with the columns of `df` plus one `<feature>_zscore` column
        per feature. Rows whose category has no baseline, or whose baseline std
        is 0/NaN, get a z-score of 0.0. The index is the fresh RangeIndex
        produced by the merge.

    Raises:
        KeyError: If `baseline_stats` lacks a mean/std column for a feature.
    """
    feature_cols = list(feature_cols)

    # Bring the stats in under private temporary names. That way they cannot
    # collide with columns already present in `df`, and the clean-up below
    # removes exactly what was merged instead of every column that happens
    # to end in '_mean' / '_std'.
    tmp_prefix = '__baseline__'
    rename_map = {
        f'{col}_{stat}': f'{tmp_prefix}{col}_{stat}'
        for col in feature_cols
        for stat in ('mean', 'std')
    }
    missing = [name for name in rename_map if name not in baseline_stats.columns]
    if missing:
        raise KeyError(f"baseline_stats is missing required columns: {missing}")

    keep = [c for c in baseline_stats.columns if c in rename_map or c == category_col]
    stats = baseline_stats[keep].rename(columns=rename_map)
    out = df.merge(stats, on=category_col, how='left')

    for col in feature_cols:
        values = out[col].to_numpy(dtype=float, na_value=np.nan)
        mean_vals = out[f'{tmp_prefix}{col}_mean'].to_numpy(dtype=float, na_value=np.nan)
        std_vals = out[f'{tmp_prefix}{col}_std'].to_numpy(dtype=float, na_value=np.nan)

        # (value - mean) / std, but only where a usable std exists;
        # everywhere else the pre-filled 0.0 is kept.
        usable = ~np.isnan(std_vals) & (std_vals != 0)
        zscores = np.zeros(len(out), dtype=float)
        np.divide(values - mean_vals, std_vals, out=zscores, where=usable)
        out[f'{col}_zscore'] = zscores

    # Remove only the temporary stats columns added by the merge.
    return out.drop(columns=list(rename_map.values()))


def load_or_build_baseline(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        cache_path: str | Path = './_baseline_stats.pkl',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Try to load baseline stats from disk; otherwise compute and cache.
    """
    cache_path = Path(cache_path)
    if cache_path.exists():
        with open(cache_path, 'rb') as f:
            stats = pickle.load(f)
    else:
        stats = compute_baseline_stats(
            df=df,
            feature_cols=feature_cols,
            category_col=category_col,
            timestamp_col=timestamp_col,
            baseline_end_date=baseline_end_date,
            min_baseline_rows_per_category=min_baseline_rows_per_category,
        )
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(stats, f)
    return stats


In [9]:
# === Cell: Normalize current feature set ===

# Raw feature columns that each receive a per-category z-score column.
FEATURES_TO_NORMALIZE = [
    "p_t", "lexical_spike_delta", "perplexity", "burstiness",
    "mean_dep_depth", "clause_ratio", "voice_ratio",
    "fre", "fog",
    "chars_per_sent", "sents_per_para", "avg_line_len",
    "nTTR", "word_density", "citation_delta",
]

# Rows before this date form the pre-ChatGPT baseline window.
BASELINE_END = '2022-11-01'

# Step 1: per-category baseline stats, read from the pickle cache or built and cached.
baseline_stats = load_or_build_baseline(
    full_data,
    FEATURES_TO_NORMALIZE,
    category_col='root',
    timestamp_col='timestamp',
    baseline_end_date=BASELINE_END,
    cache_path='./_baseline_stats.pkl',
    min_baseline_rows_per_category=10,
)

# Step 2: z-score every row of the panel (baseline and later rows alike).
features_df_z = normalize_features_by_baseline(
    full_data,
    FEATURES_TO_NORMALIZE,
    baseline_stats=baseline_stats,
    category_col='root',
)

# Step 3 (optional): a slim view for voting, made of the z-score columns plus
# whichever identifying columns are available in the normalized frame.
Z_FEATURES = [name + '_zscore' for name in FEATURES_TO_NORMALIZE]
if {'title', 'rev_id', 'ai_flag'}.issubset(features_df_z.columns):
    features_for_voting = features_df_z[
        ['timestamp', 'root', 'title', 'rev_id', 'ai_flag'] + Z_FEATURES
    ].copy()
else:
    features_for_voting = features_df_z[['timestamp', 'root'] + Z_FEATURES].copy()


In [10]:
# List the columns of the normalized frame.
features_df_z.columns

Index(['Unnamed: 0', 'snapshot_ts', 'rev_id', 'timestamp', 'user', 'is_bot',
       'content', 'article_id', 'title', 'root', 'stratum', 'plain_text',
       'p_t', 'lexical_spike_delta', 'perplexity', 'burstiness', 'upos_props',
       'mean_dep_depth', 'clause_ratio', 'voice_ratio', 'fre', 'fog',
       'chars_per_sent', 'sents_per_para', 'nTTR', 'word_density',
       'avg_line_len', 'citation_delta', 'p_t_zscore',
       'lexical_spike_delta_zscore', 'perplexity_zscore', 'burstiness_zscore',
       'mean_dep_depth_zscore', 'clause_ratio_zscore', 'voice_ratio_zscore',
       'fre_zscore', 'fog_zscore', 'chars_per_sent_zscore',
       'sents_per_para_zscore', 'avg_line_len_zscore', 'nTTR_zscore',
       'word_density_zscore', 'citation_delta_zscore'],
      dtype='object')

In [12]:
import pandas as pd
import numpy as np

# Assume 'features_df_z' is the DataFrame that has been processed by your
# normalize_features_by_baseline() function.

# --- 1. Z-score columns that take part in the voting system ---
# One '<feature>_zscore' name for each of the 15 normalized features.
zscore_cols = [
    f'{feature}_zscore'
    for feature in (
        'p_t', 'lexical_spike_delta', 'perplexity', 'burstiness',
        'mean_dep_depth', 'clause_ratio', 'voice_ratio',
        'fre', 'fog',
        'chars_per_sent', 'sents_per_para',
        'nTTR', 'word_density', 'avg_line_len',
        'citation_delta',
    )
]

# --- 2. Define the new voting system function based on a Z-Score Cutoff ---
def apply_voting_system_zscore(df, z_score_cutoff=2.5, min_votes=3, zscore_cols=None):
    """
    Flag revisions whose features deviate strongly from their baseline.

    A feature casts a "vote" for a revision when the absolute value of its
    z-score is strictly greater than `z_score_cutoff`. The absolute value is
    used so that unusually low and unusually high values both count. NaN
    z-scores never vote.

    Args:
        df (pd.DataFrame): Frame containing the z-score columns. It is not
            modified; a new frame is returned.
        z_score_cutoff (float): Threshold on |z| above which a feature votes.
        min_votes (int): Minimum number of votes required to flag a revision
            as potentially AI-generated.
        zscore_cols (iterable of str, optional): Columns that take part in the
            vote. When omitted, every column of `df` whose name ends in
            '_zscore' is used, so the function does not depend on a notebook
            global that other cells may redefine.

    Returns:
        pd.DataFrame: A copy of `df` with two added (or replaced) columns:
            'ai_vote_count' (int) and 'ai_flag' (bool).

    Raises:
        ValueError: If there are no z-score columns to vote with.
    """
    if zscore_cols is None:
        zscore_cols = [col for col in df.columns if str(col).endswith('_zscore')]
    else:
        zscore_cols = list(zscore_cols)
    if not zscore_cols:
        raise ValueError("No z-score columns available for voting.")

    print(f"Applying voting system with Z-Score cutoff: {z_score_cutoff} and minimum votes: {min_votes}")

    # One vote per feature whose |z| exceeds the cutoff, summed per row.
    vote_count = df[zscore_cols].abs().gt(z_score_cutoff).sum(axis=1)

    # assign() returns a new frame, leaving the caller's frame untouched so
    # re-running this step never compounds state across cells.
    result = df.assign(
        ai_vote_count=vote_count,
        ai_flag=vote_count >= min_votes,
    )

    print("Voting system application complete.")
    return result

# --- 3. Apply the new voting system ---
# Two knobs control how strict the flagging is.
Z_SCORE_THRESHOLD = 3.5  # a feature votes when its |z-score| is above this value
MINIMUM_VOTES = 2        # a revision is flagged once at least this many features vote

final_df = apply_voting_system_zscore(
    features_df_z,
    min_votes=MINIMUM_VOTES,
    z_score_cutoff=Z_SCORE_THRESHOLD,
)

# --- 4. Display the results ---
print("\n--- Voting System Results ---")
print(final_df.loc[:, ['rev_id', 'timestamp', 'root', 'ai_vote_count', 'ai_flag']].head())

print("\n--- Distribution of Votes ---")
print(final_df['ai_vote_count'].value_counts().sort_index())

# Share of all revisions that reached the vote threshold.
total_count = len(final_df)
flagged_count = final_df['ai_flag'].sum()
print(f"\nTotal revisions flagged as potentially AI-generated: {flagged_count} out of {total_count} ({flagged_count/total_count:.2%})")


Applying voting system with Z-Score cutoff: 3.5 and minimum votes: 2
Voting system application complete.

--- Voting System Results ---
       rev_id                 timestamp     root  ai_vote_count  ai_flag
0  1160763005 2023-06-18 16:30:41+00:00  History              3     True
1  1166013433 2023-07-18 21:17:52+00:00  History              3     True
2  1171485250 2023-08-21 11:13:21+00:00  History              3     True
3  1177319025 2023-09-27 04:45:15+00:00  History              3     True
4  1181899435 2023-10-25 22:06:39+00:00  History              3     True

--- Distribution of Votes ---
ai_vote_count
0    14352
1      867
2      399
3      147
4       40
5        5
6       22
Name: count, dtype: int64

Total revisions flagged as potentially AI-generated: 613 out of 15832 (3.87%)


In [None]:
# Show the frame returned by the voting step as this cell's output.
final_df

In [None]:
# Save the normalized frame with its vote columns to
# normalized_everything100percat_with_ai_votes.csv (row index not written).
final_df.to_csv("normalized_everything100percat_with_ai_votes.csv", index=False)

In [6]:
# Reload the saved results from disk. No `parse_dates` is passed, so
# `timestamp` is not parsed as datetime on load.
final_df = pd.read_csv("normalized_everything100percat_with_ai_votes.csv")

In [25]:
import pandas as pd
import numpy as np

# Assume 'features_df_z' is the DataFrame that has been processed by your
# normalize_features_by_baseline() function and contains all z-score columns.

# --- 1. Isolate the Baseline Data for Calibration ---
# Only pre-cutoff rows are used, i.e. the data we assume to be "human".
baseline_end_date = '2022-11-01'

# Parse timestamps into a separate UTC-aware Series instead of overwriting
# features_df_z['timestamp']; this keeps the shared frame unchanged (the cell
# stays re-runnable) and matches the UTC handling of the normalization code.
calibration_ts = pd.to_datetime(features_df_z['timestamp'], utc=True, errors='coerce')
is_baseline = calibration_ts < pd.Timestamp(baseline_end_date, tz='UTC')
baseline_df = features_df_z.loc[is_baseline].assign(timestamp=calibration_ts[is_baseline])

print(f"Calibrating thresholds using {len(baseline_df)} baseline revisions (pre-{baseline_end_date}).")

if baseline_df.empty:
    # Without baseline rows every rate below would be a division by zero.
    raise ValueError(f"No rows found before {baseline_end_date}; cannot calibrate thresholds.")

# --- 2. Define the Grid of Parameters to Test ---
z_score_cutoffs = [2.0, 2.5, 3.0, 3.5]
min_votes_options = [2, 3, 4, 5, 6]

# The list of z-score columns to use for voting
zscore_cols = [col for col in features_df_z.columns if col.endswith('_zscore')]

# --- 3. Perform the Grid Search ---
# |z| does not depend on either grid parameter, so compute it once.
baseline_abs_z = baseline_df[zscore_cols].abs()

results = []

for z_cutoff in z_score_cutoffs:
    # Votes depend only on the cutoff, so count them once per cutoff rather
    # than once per (cutoff, min_votes) pair.
    vote_count = baseline_abs_z.gt(z_cutoff).sum(axis=1)

    row = {'z_score_cutoff': z_cutoff}
    for min_v in min_votes_options:
        # Number of baseline revisions that would be flagged
        flagged_count = (vote_count >= min_v).sum()

        # Percentage of the baseline that was flagged (Baseline Flag Rate)
        baseline_flag_rate = (flagged_count / len(baseline_df)) * 100

        # Store the result, formatted as a percentage string
        row[f'min_votes_{min_v}'] = f"{baseline_flag_rate:.2f}%"

    results.append(row)

# --- 4. Display the Results in a Clear Table ---
calibration_results_df = pd.DataFrame(results).set_index('z_score_cutoff')

print("\n--- Calibration Results (Baseline Flag Rate %) ---")
print("This table shows the percentage of pre-ChatGPT revisions that would be flagged for each parameter combination.")
print(calibration_results_df)



Calibrating thresholds using 6118 baseline revisions (pre-2022-11-01).

--- Calibration Results (Baseline Flag Rate %) ---
This table shows the percentage of pre-ChatGPT revisions that would be flagged for each parameter combination.
               min_votes_2 min_votes_3 min_votes_4 min_votes_5 min_votes_6
z_score_cutoff                                                            
2.0                 19.81%       7.45%       3.24%       1.63%       0.74%
2.5                 10.95%       3.61%       1.81%       0.92%       0.33%
3.0                  5.72%       2.35%       1.00%       0.29%       0.15%
3.5                  3.60%       1.27%       0.51%       0.15%       0.13%


# Sanity Check
(check if normalization was done properly)


In [24]:
import pandas as pd
import numpy as np

def verify_wiki_normalization(
    normalized_df: pd.DataFrame,
    raw_feature: str,
    category_to_check: str,
    category_col: str = 'root',
    timestamp_col: str = 'timestamp',
    baseline_end_date: str = '2022-11-01',
    min_baseline_rows_per_category: int = 10,
):
    """
    Spot-check the `<raw_feature>_zscore` column of one category by recomputing it.

    The baseline mean/std are recomputed from the category's rows strictly
    before `baseline_end_date`, z-scores are derived for all rows of the
    category, and the result is compared (np.allclose, NaNs treated as 0)
    with the stored z-score column. The outcome is printed; nothing is
    returned and `normalized_df` is not modified.

    Args:
        normalized_df (pd.DataFrame): The full DataFrame after normalization.
        raw_feature (str): The name of the raw feature to check (e.g., 'fre').
        category_to_check (str): The specific category to verify (e.g., 'History').
        category_col (str): The name of the category column.
        timestamp_col (str): The name of the timestamp column. Strings and
            tz-naive datetimes are interpreted as UTC.
        baseline_end_date (str): The date defining the end of the baseline period.
        min_baseline_rows_per_category (int): Minimum baseline rows a category
            needed to receive baseline stats during normalization. Categories
            below this are expected to have all-zero z-scores, so pass the
            same value that was used when the stats were built.
    """
    zscore_feature = f"{raw_feature}_zscore"

    print(f"--- Verifying Z-Score for feature: '{raw_feature}' in Category: '{category_to_check}' ---")

    # Work on a parsed copy of the timestamp column only; the caller's frame
    # is left untouched and no full-frame copy is needed.
    timestamps = normalized_df[timestamp_col]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        print(f"Converting '{timestamp_col}' to datetime...")
    # Also normalises tz-naive datetimes, which could not be compared with
    # the tz-aware cutoff below.
    timestamps = pd.to_datetime(timestamps, utc=True, errors='coerce')

    # 1. Isolate the baseline rows for THIS SPECIFIC CATEGORY
    in_category = normalized_df[category_col] == category_to_check
    before_cutoff = timestamps < pd.Timestamp(baseline_end_date, tz='UTC')
    baseline_df = normalized_df[in_category & before_cutoff]

    if len(baseline_df) < 2: # Need at least 2 points to calculate std dev
        print(f"Warning: Not enough baseline data ({len(baseline_df)} rows) found for '{category_to_check}' to verify. Skipping.")
        print("-" * 70)
        return

    # 2. Manually calculate the mean and std dev for this category's baseline
    baseline_mean = baseline_df[raw_feature].mean()
    baseline_std = baseline_df[raw_feature].std()

    print(f"Manual Baseline Calculation for '{category_to_check}':")
    print(f"  - Mean of '{raw_feature}': {baseline_mean:.6f}")
    print(f"  - Std Dev of '{raw_feature}': {baseline_std:.6f}\n")

    # 3. Manually compute the expected z-scores for ALL data in this category
    category_df = normalized_df[in_category]

    # Categories with too few baseline rows get no baseline stats, so their
    # stored z-scores are 0; expecting non-zero values there would report a
    # false failure.
    has_enough_rows = len(baseline_df) >= min_baseline_rows_per_category
    if not has_enough_rows:
        print(
            f"Note: only {len(baseline_df)} baseline rows (< {min_baseline_rows_per_category}); "
            f"expecting all-zero z-scores for '{category_to_check}'."
        )

    if has_enough_rows and pd.notna(baseline_std) and baseline_std != 0:
        manually_computed_zscores = (category_df[raw_feature] - baseline_mean) / baseline_std
    else:
        # No usable baseline (too few rows, std 0 or NaN): expected result is 0
        manually_computed_zscores = pd.Series(0.0, index=category_df.index)

    manually_computed_zscores = manually_computed_zscores.fillna(0)

    # 4. Get the z-scores stored in the normalized frame for this category
    function_zscores = category_df[zscore_feature].fillna(0)

    # 5. Compare the two results
    are_they_equal = np.allclose(manually_computed_zscores, function_zscores)

    if are_they_equal:
        print(f"✅ SUCCESS: The function's z-scores for '{category_to_check}' match the manual calculation.")
    else:
        print(f"❌ FAILURE: The z-scores for '{category_to_check}' do not match.")
        diff_df = pd.DataFrame({
            'function_z': function_zscores,
            'manual_z': manually_computed_zscores,
            'difference': function_zscores - manually_computed_zscores
        })
        print("Showing first 5 rows with significant differences:")
        # `display` is provided by IPython/Jupyter; fall back to print so the
        # function also works in a plain Python session.
        try:
            show = display
        except NameError:
            show = print
        show(diff_df[diff_df['difference'].abs() > 1e-9].head())

    print("-" * 70)


# --- EXAMPLE USAGE ---
# Expects `final_df` (the normalized frame with z-score columns) to exist.

# Spot-check 'fre' in the 'History' category.
verify_wiki_normalization(
    category_to_check='History',
    raw_feature='fre',
    normalized_df=final_df,
)

# Spot-check 'perplexity' in 'Computing', but only when that category is present.
if final_df['root'].eq('Computing').any():
    verify_wiki_normalization(
        category_to_check='Computing',
        raw_feature='perplexity',
        normalized_df=final_df,
    )


--- Verifying Z-Score for feature: 'fre' in Category: 'History' ---
Manual Baseline Calculation for 'History':
  - Mean of 'fre': 27.680064
  - Std Dev of 'fre': 19.238637

✅ SUCCESS: The function's z-scores for 'History' match the manual calculation.
----------------------------------------------------------------------
--- Verifying Z-Score for feature: 'perplexity' in Category: 'Computing' ---
Manual Baseline Calculation for 'Computing':
  - Mean of 'perplexity': 485.400160
  - Std Dev of 'perplexity': 276.798486

✅ SUCCESS: The function's z-scores for 'Computing' match the manual calculation.
----------------------------------------------------------------------


In [22]:
# Distinct values of the `root` category column, in order of first appearance.
final_df['root'].unique()

array(['History', 'Politics', 'Technology', 'Computing', 'Biology',
       'Chemistry', 'Physics', 'Film', 'Music', 'Science', 'Engineering',
       'Elections', 'Political parties', 'Popular culture', 'Television',
       'Political history', 'History by country', 'Military history',
       'Medicine', 'Video games'], dtype=object)