In [27]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

In [28]:
# Load the revision-level feature table from its CSV export.
full_data = pd.read_csv(Path("everything100percat.csv"))


In [32]:
# Inspect the dtype of every loaded column. In the recorded output the
# 'timestamp' and 'snapshot_ts' columns are plain object columns, i.e. they
# are not parsed as datetimes at load time.
print(full_data.dtypes)

Unnamed: 0               int64
snapshot_ts             object
rev_id                   int64
timestamp               object
user                    object
is_bot                    bool
content                 object
article_id               int64
title                   object
root                    object
stratum                 object
plain_text              object
p_t                    float64
lexical_spike_delta    float64
perplexity             float64
burstiness             float64
upos_props              object
mean_dep_depth         float64
clause_ratio           float64
voice_ratio            float64
fre                    float64
fog                    float64
chars_per_sent         float64
sents_per_para         float64
nTTR                   float64
word_density           float64
avg_line_len           float64
citation_delta         float64
dtype: object


In [29]:
# === Cell: Normalization utilities (place near your other utils) ===


def compute_baseline_stats(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Build per-category baseline mean/std for each feature.

    Only rows whose timestamp is strictly before ``baseline_end_date``
    (interpreted as UTC) contribute. Categories with fewer than
    ``min_baseline_rows_per_category`` baseline rows are left out entirely.

    Args:
        df: Input frame; it is not modified.
        feature_cols: Iterable of numeric column names to summarise.
        category_col: Column to group by.
        timestamp_col: Column holding the row timestamp. Strings, tz-naive
            datetimes (treated as UTC) and tz-aware datetimes are all accepted;
            unparseable values become NaT and never count as baseline rows.
        baseline_end_date: Exclusive upper bound of the baseline window.
        min_baseline_rows_per_category: Minimum number of baseline rows a
            category needs in order to appear in the result.

    Returns:
        DataFrame indexed by ``category_col`` with flattened columns
        ``<feature>_mean`` and ``<feature>_std``.
    """
    feature_cols = list(feature_cols)

    # Normalise on a separate Series rather than sniffing the dtype:
    # np.issubdtype() raises TypeError for pandas' tz-aware dtype, and a
    # tz-naive column cannot be compared with the tz-aware cutoff below.
    timestamps = pd.to_datetime(df[timestamp_col], utc=True, errors='coerce')
    cutoff = pd.Timestamp(baseline_end_date, tz='UTC')

    # NaT compares False, so rows with unparseable timestamps are excluded.
    in_baseline = (timestamps < cutoff).to_numpy()
    baseline_df = df[in_baseline]

    # Drop categories with too few baseline observations.
    rows_per_category = baseline_df.groupby(category_col).size()
    valid_cats = rows_per_category[rows_per_category >= min_baseline_rows_per_category].index
    baseline_df = baseline_df[baseline_df[category_col].isin(valid_cats)]

    # Per-category mean/std, then flatten (feature, stat) -> "feature_stat".
    stats = baseline_df.groupby(category_col)[feature_cols].agg(['mean', 'std'])
    stats.columns = ['_'.join(col) for col in stats.columns]
    return stats


def normalize_features_by_baseline(
        df: pd.DataFrame,
        feature_cols,
        baseline_stats: pd.DataFrame,
        category_col: str = 'root'
) -> pd.DataFrame:
    """
    Add a ``<feature>_zscore`` column for every feature, using per-category
    baseline statistics.

    ``z = (value - baseline_mean) / baseline_std``. Rows whose category has no
    baseline stats, or whose baseline std is NaN or 0, get a z-score of 0.0.
    A NaN feature value with a usable baseline still yields NaN.

    Args:
        df: Frame holding ``category_col`` and the feature columns; it is not
            modified.
        feature_cols: Iterable of feature column names to normalise.
        baseline_stats: Frame with ``<feature>_mean`` / ``<feature>_std``
            columns, keyed by category either through its index or through a
            ``category_col`` column. Category labels must be unique.
        category_col: Name of the category column in ``df``.

    Returns:
        A new frame with a fresh 0..n-1 row index containing every column of
        ``df`` plus the z-score columns. Columns of ``df`` that merely happen
        to end in ``_mean`` / ``_std`` are preserved.

    Raises:
        KeyError: If ``baseline_stats`` lacks the mean/std column of a
            requested feature (for example a stale cached stats file).
    """
    feature_cols = list(feature_cols)

    # Accept stats keyed by index (groupby output) or by an explicit column.
    stats = baseline_stats
    if category_col in stats.columns:
        stats = stats.set_index(category_col)

    needed = [f'{col}_{stat}' for col in feature_cols for stat in ('mean', 'std')]
    missing = [name for name in needed if name not in stats.columns]
    if missing:
        raise KeyError(f'baseline_stats is missing columns: {missing}')

    out = df.reset_index(drop=True)

    # Align stats to the rows by category instead of merging them in. This
    # avoids column-name collisions with df and leaves nothing to drop later.
    # Unknown or NaN categories align to all-NaN stats.
    aligned = stats[needed].reindex(out[category_col].to_numpy())

    for col in feature_cols:
        mean_vals = aligned[f'{col}_mean'].to_numpy(dtype=float)
        std_vals = aligned[f'{col}_std'].to_numpy(dtype=float)
        values = out[col].to_numpy(dtype=float, na_value=np.nan)

        # Divide only where the baseline is usable; everything else stays 0.0.
        usable = ~np.isnan(std_vals) & (std_vals != 0)
        zscores = np.zeros(len(out), dtype=float)
        np.divide(values - mean_vals, std_vals, out=zscores, where=usable)
        out[f'{col}_zscore'] = zscores

    return out


def load_or_build_baseline(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        cache_path: str | Path = './_baseline_stats.pkl',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Try to load baseline stats from disk; otherwise compute and cache.
    """
    cache_path = Path(cache_path)
    if cache_path.exists():
        with open(cache_path, 'rb') as f:
            stats = pickle.load(f)
    else:
        stats = compute_baseline_stats(
            df=df,
            feature_cols=feature_cols,
            category_col=category_col,
            timestamp_col=timestamp_col,
            baseline_end_date=baseline_end_date,
            min_baseline_rows_per_category=min_baseline_rows_per_category,
        )
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(stats, f)
    return stats


In [4]:
# === Cell: Normalize current feature set ===

# Features to z-score against the per-category baseline (15 in total).
FEATURES_TO_NORMALIZE = [
    "p_t",
    "lexical_spike_delta",
    "perplexity",
    "burstiness",
    "mean_dep_depth",
    "clause_ratio",
    "voice_ratio",
    "fre",
    "fog",
    "chars_per_sent",
    "sents_per_para",
    "avg_line_len",
    "nTTR",
    "word_density",
    "citation_delta"
]

BASELINE_END = '2022-11-01'  # end of pre-ChatGPT window

# 1) Build/load per-category baseline stats once
baseline_stats = load_or_build_baseline(
    df=full_data,
    feature_cols=FEATURES_TO_NORMALIZE,
    category_col='root',
    timestamp_col='timestamp',
    baseline_end_date=BASELINE_END,
    cache_path='./_baseline_stats.pkl',
    min_baseline_rows_per_category=10,
)

# 2) Produce z-scores for the whole panel (pre and post)
features_df_z = normalize_features_by_baseline(
    df=full_data,
    feature_cols=FEATURES_TO_NORMALIZE,
    baseline_stats=baseline_stats,
    category_col='root',
)

# 3) (Optional) keep only z-score views for voting.
# Identifier columns are kept individually when present. 'ai_flag' is only
# created by the later voting cell, so requiring all three together would
# drop 'title' and 'rev_id' on every fresh top-to-bottom run.
Z_FEATURES = [f'{c}_zscore' for c in FEATURES_TO_NORMALIZE]
optional_id_cols = [c for c in ['title', 'rev_id', 'ai_flag'] if c in features_df_z.columns]
features_for_voting = features_df_z[['timestamp', 'root'] + optional_id_cols + Z_FEATURES].copy()


In [30]:
# List the columns of the normalized frame: the loaded columns plus one
# '<feature>_zscore' column per normalized feature.
# NOTE(review): the recorded output also lists 'vote_sum' and 'ai_vote', which
# no visible cell creates — confirm where they come from before relying on them.
features_df_z.columns

Index(['Unnamed: 0', 'snapshot_ts', 'rev_id', 'timestamp', 'user', 'is_bot',
       'content', 'article_id', 'title', 'root', 'stratum', 'plain_text',
       'p_t', 'lexical_spike_delta', 'perplexity', 'burstiness', 'upos_props',
       'mean_dep_depth', 'clause_ratio', 'voice_ratio', 'fre', 'fog',
       'chars_per_sent', 'sents_per_para', 'nTTR', 'word_density',
       'avg_line_len', 'citation_delta', 'p_t_zscore',
       'lexical_spike_delta_zscore', 'perplexity_zscore', 'burstiness_zscore',
       'mean_dep_depth_zscore', 'clause_ratio_zscore', 'voice_ratio_zscore',
       'fre_zscore', 'fog_zscore', 'chars_per_sent_zscore',
       'sents_per_para_zscore', 'avg_line_len_zscore', 'nTTR_zscore',
       'word_density_zscore', 'citation_delta_zscore', 'vote_sum', 'ai_vote'],
      dtype='object')

In [33]:
import pandas as pd
import numpy as np

# Assume 'features_df_z' is the DataFrame that has been processed by your
# normalize_features_by_baseline() function.

# --- 1. Z-score columns that take part in the voting system ---
# One '<feature>_zscore' column for each of the 15 normalized features.
zscore_cols = [
    f'{feature}_zscore'
    for feature in (
        'p_t',
        'lexical_spike_delta',
        'perplexity',
        'burstiness',
        'mean_dep_depth',
        'clause_ratio',
        'voice_ratio',
        'fre',
        'fog',
        'chars_per_sent',
        'sents_per_para',
        'nTTR',
        'word_density',
        'avg_line_len',
        'citation_delta',
    )
]

# --- 2. Define the new voting system function based on a Z-Score Cutoff ---
def apply_voting_system_zscore(df, z_score_cutoff=2.5, min_votes=3, zscore_columns=None):
    """
    Flag revisions by counting how many z-scored features are anomalous.

    A feature casts a "vote" for a row when the absolute value of its z-score
    is strictly greater than ``z_score_cutoff``. NaN z-scores never vote.

    Args:
        df (pd.DataFrame): Frame containing the z-score columns. It is
            modified in place: the two result columns are added to it.
        z_score_cutoff (float): Absolute z-score threshold for a vote.
        min_votes (int): Minimum number of votes required to flag a revision
            as potentially AI-generated.
        zscore_columns (iterable of str, optional): Z-score columns that take
            part in the vote. Defaults to the module-level ``zscore_cols``.

    Returns:
        pd.DataFrame: The same ``df`` object with two new columns,
        'ai_vote_count' (int) and 'ai_flag' (bool).

    Raises:
        KeyError: If any voting column is missing from ``df``.
    """
    if zscore_columns is None:
        # Backward-compatible default: the list defined at notebook level.
        zscore_columns = zscore_cols
    vote_cols = list(zscore_columns)

    absent = [c for c in vote_cols if c not in df.columns]
    if absent:
        raise KeyError(f"Missing z-score columns for voting: {absent}")

    print(f"Applying voting system with Z-Score cutoff: {z_score_cutoff} and minimum votes: {min_votes}")

    # abs() because both tails are anomalous: an unusually low value (e.g.
    # perplexity) counts the same as an unusually high one (e.g. nTTR).
    df['ai_vote_count'] = df[vote_cols].abs().gt(z_score_cutoff).sum(axis=1)

    # Flag the revision once the vote count reaches the minimum threshold.
    df['ai_flag'] = df['ai_vote_count'] >= min_votes

    print("Voting system application complete.")
    return df

# --- 3. Run the vote ---
# Both knobs live here so they are easy to tune between runs.
Z_SCORE_THRESHOLD = 2.5
MINIMUM_VOTES = 4  # flag a revision when at least 4 of the 15 features are anomalous

final_df = apply_voting_system_zscore(
    features_df_z, z_score_cutoff=Z_SCORE_THRESHOLD, min_votes=MINIMUM_VOTES
)

# --- 4. Report ---
print("\n--- Voting System Results ---")
print(final_df.head()[['rev_id', 'timestamp', 'root', 'ai_vote_count', 'ai_flag']])

print("\n--- Distribution of Votes ---")
print(final_df['ai_vote_count'].value_counts().sort_index())

# Share of revisions that reached the vote threshold.
total_count = len(final_df)
flagged_count = final_df['ai_flag'].sum()
print(f"\nTotal revisions flagged as potentially AI-generated: {flagged_count} out of {total_count} ({flagged_count/total_count:.2%})")


Applying voting system with Z-Score cutoff: 2.5 and minimum votes: 4
Voting system application complete.

--- Voting System Results ---
       rev_id                  timestamp     root  ai_vote_count  ai_flag
0  1160763005  2023-06-18 16:30:41+00:00  History              3    False
1  1166013433  2023-07-18 21:17:52+00:00  History              3    False
2  1171485250  2023-08-21 11:13:21+00:00  History              3    False
3  1177319025  2023-09-27 04:45:15+00:00  History              3    False
4  1181899435  2023-10-25 22:06:39+00:00  History              3    False

--- Distribution of Votes ---
ai_vote_count
0    12009
1     1935
2     1275
3      362
4      123
5       78
6       46
7        3
9        1
Name: count, dtype: int64

Total revisions flagged as potentially AI-generated: 251 out of 15832 (1.59%)


In [35]:
# Display the frame returned by the voting step; the recorded output shows
# pandas' truncated view (first/last rows, elided middle columns).
final_df

Unnamed: 0.1,Unnamed: 0,snapshot_ts,rev_id,timestamp,user,is_bot,content,article_id,title,root,...,chars_per_sent_zscore,sents_per_para_zscore,avg_line_len_zscore,nTTR_zscore,word_density_zscore,citation_delta_zscore,vote_sum,ai_vote,ai_vote_count,ai_flag
0,0,2023-06-30 00:00:00+00:00,1160763005,2023-06-18 16:30:41+00:00,2601:483:C301:7360:BC05:287F:176:F15F,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,0.543670,-0.965933,-0.837440,-0.782697,0.110041,-1.315955,4.0,1,3,False
1,1,2023-07-31 00:00:00+00:00,1166013433,2023-07-18 21:17:52+00:00,TompaDompa,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,0.464213,-0.958110,-0.840029,-0.698040,0.122510,-1.313476,4.0,1,3,False
2,2,2023-08-31 00:00:00+00:00,1171485250,2023-08-21 11:13:21+00:00,79.41.96.200,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,0.475988,-0.958110,-0.837633,-0.698040,0.116385,-1.315791,4.0,1,3,False
3,3,2023-09-30 00:00:00+00:00,1177319025,2023-09-27 04:45:15+00:00,187.254.98.237,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,0.543078,-0.965933,-0.837556,-0.698040,0.116189,-1.246176,4.0,1,3,False
4,4,2023-10-31 00:00:00+00:00,1181899435,2023-10-25 22:06:39+00:00,Idulatria,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,0.476368,-0.958110,-0.837556,-0.698040,0.116189,-1.246176,4.0,1,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,15827,2023-12-31 00:00:00+00:00,1190712502,2023-12-19 11:44:12+00:00,Mika1h,False,{{more citations needed|date=April 2023}}\n{{I...,8654,Pac-Man All-Stars,Video games,...,-1.121026,-0.807131,-0.983865,1.306311,3.336936,0.040132,3.0,1,2,False
15828,15828,2024-01-31 00:00:00+00:00,1197552616,2024-01-21 00:52:22+00:00,Venky64,False,{{more citations needed|date=April 2023}}\n{{I...,8654,Pac-Man All-Stars,Video games,...,-1.147850,-0.807131,-0.986427,0.998153,3.260247,0.033969,3.0,1,1,False
15829,15829,2023-08-31 00:00:00+00:00,1172064757,2023-08-24 18:34:01+00:00,CurlyWi,False,{{Infobox video game\n| collapsible = \n| ital...,10527,Idol Showdown,Video games,...,-0.920569,-0.558323,-0.754252,-0.533352,0.632742,-0.545341,0.0,0,0,False
15830,15830,2023-10-31 00:00:00+00:00,1181427482,2023-10-23 00:04:28+00:00,Radja Dwm,False,{{Infobox video game\n| collapsible = \n| ital...,10527,Idol Showdown,Video games,...,-0.863354,-0.500906,-0.694054,-0.279023,0.431931,-0.763382,0.0,0,0,False


In [None]:
# Write the normalized features plus the vote columns to CSV, without the row index.
final_df.to_csv(Path("normalized_everything100percat_with_ai_votes.csv"), index=False)