In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

In [3]:
full_data = pd.read_csv("everything100percat.csv")


In [24]:
full_data.shape

(15832, 28)

In [2]:
# === Cell: Normalization utilities (place near your other utils) ===


def compute_baseline_stats(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Build per-category baseline means/std from rows strictly before baseline_end_date.
    Filters out categories with too few baseline rows to avoid junk std=0 artefacts.
    """
    # Ensure datetime
    if not np.issubdtype(df[timestamp_col].dtype, np.datetime64):
        df = df.copy()
        df[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True, errors='coerce')

    baseline_df = df[df[timestamp_col] < pd.Timestamp(baseline_end_date, tz='UTC')].copy()

    # (Optional) guard: drop categories with too few baseline observations
    valid_cats = (
        baseline_df.groupby(category_col)[feature_cols[0]]
        .size()
        .loc[lambda s: s >= min_baseline_rows_per_category]
        .index
    )
    baseline_df = baseline_df[baseline_df[category_col].isin(valid_cats)]

    # Compute stats
    stats = baseline_df.groupby(category_col)[feature_cols].agg(['mean', 'std'])
    stats.columns = ['_'.join(col) for col in stats.columns]  # flatten
    return stats


def normalize_features_by_baseline(
        df: pd.DataFrame,
        feature_cols,
        baseline_stats: pd.DataFrame,
        category_col: str = 'root'
) -> pd.DataFrame:
    """
    Merge precomputed baseline stats and add <feature>_zscore columns.
    Categories without baseline end up with 0 z-scores to avoid NaNs in voting.
    """
    out = df.merge(baseline_stats, on=category_col, how='left')

    for col in feature_cols:
        mean_col = f'{col}_mean'
        std_col = f'{col}_std'
        z_col = f'{col}_zscore'

        mean_vals = out[mean_col]
        std_vals = out[std_col]

        # (value - mean)/std, safe divide; if no stats or std=0 → 0.0
        out[z_col] = np.divide(
            out[col] - mean_vals,
            std_vals,
            out=np.zeros(len(out), dtype=float),
            where=(std_vals.notna() & (std_vals != 0))
        )

    # drop the merged mean/std columns
    out.drop(columns=[c for c in out.columns if any(c.endswith(sfx) for sfx in ('_mean', '_std'))], inplace=True)
    return out


def load_or_build_baseline(
        df: pd.DataFrame,
        feature_cols,
        category_col: str = 'root',
        timestamp_col: str = 'timestamp',
        baseline_end_date: str = '2022-11-01',
        cache_path: str | Path = './_baseline_stats.pkl',
        min_baseline_rows_per_category: int = 10,
) -> pd.DataFrame:
    """
    Try to load baseline stats from disk; otherwise compute and cache.
    """
    cache_path = Path(cache_path)
    if cache_path.exists():
        with open(cache_path, 'rb') as f:
            stats = pickle.load(f)
    else:
        stats = compute_baseline_stats(
            df=df,
            feature_cols=feature_cols,
            category_col=category_col,
            timestamp_col=timestamp_col,
            baseline_end_date=baseline_end_date,
            min_baseline_rows_per_category=min_baseline_rows_per_category,
        )
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(stats, f)
    return stats


In [4]:
# === Cell: Normalize current feature set ===

# Which features to normalize (extend to your full list of 14)
FEATURES_TO_NORMALIZE = [
    "p_t",
    "lexical_spike_delta",
    "perplexity",
    "burstiness",
    "mean_dep_depth",
    "clause_ratio",
    "voice_ratio",
    "fre",
    "fog",
    "chars_per_sent",
    "sents_per_para",
    "avg_line_len",
    "nTTR",
    "word_density",
    "citation_delta"
]

BASELINE_END = '2022-11-01'  # end of pre-ChatGPT window

# 1) Build/load per-category baseline stats once
baseline_stats = load_or_build_baseline(
    df=full_data,
    feature_cols=FEATURES_TO_NORMALIZE,
    category_col='root',
    timestamp_col='timestamp',
    baseline_end_date=BASELINE_END,
    cache_path='./_baseline_stats.pkl',
    min_baseline_rows_per_category=10,
)

# 2) Produce z-scores for the whole panel (pre and post)
features_df_z = normalize_features_by_baseline(
    df=full_data,
    feature_cols=FEATURES_TO_NORMALIZE,
    baseline_stats=baseline_stats,
    category_col='root',
)

# 3) (Optional) keep only z-score views for voting
Z_FEATURES = [f'{c}_zscore' for c in FEATURES_TO_NORMALIZE]
features_for_voting = features_df_z[['timestamp', 'root', 'title', 'rev_id', 'ai_flag'] + Z_FEATURES].copy() \
    if all(c in features_df_z.columns for c in ['title', 'rev_id', 'ai_flag']) \
    else features_df_z[['timestamp', 'root'] + Z_FEATURES].copy()


In [16]:
votes_df

Unnamed: 0,fre_vote,nTTR_vote
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
15827,0,0
15828,0,0
15829,0,0
15830,0,0


In [18]:
# === Cell: Voting on z-scores ===

# Example simple rule votes; replace/extend with your actual rule set
Z_THRESH = 2.0  # classic "outlier-ish" cutoff; tune per-feature if needed

votes = {}

# Example: readability drift (Flesch Reading Ease)
votes['fre_vote'] = (features_df_z['fre_zscore'].abs() >= Z_THRESH).astype(int)
# Example: lexical diversity
votes['nTTR_vote'] = (features_df_z['nTTR_zscore'] >= Z_THRESH).astype(int)
votes['p_t_vote'] = (features_df_z['p_t_zscore'] >= Z_THRESH).astype(int)
votes['lexical_spike_delta_vote'] = (features_df_z['lexical_spike_delta_zscore'] >= Z_THRESH).astype(int)
votes['perplexity_vote'] = (features_df_z['perplexity_zscore'] >= Z_THRESH).astype(int)
votes['burstiness_vote'] = (features_df_z['burstiness_zscore'] >= Z_THRESH).astype(int)
votes['mean_dep_depth_vote'] = (features_df_z['mean_dep_depth_zscore'] >= Z_THRESH).astype(int)
votes['clause_ratio_vote'] = (features_df_z['clause_ratio_zscore'] >= Z_THRESH).astype(int)
votes['voice_ratio_vote'] = (features_df_z['voice_ratio_zscore'] >= Z_THRESH).astype(int)
votes['fog_vote'] = (features_df_z['fog_zscore'] >= Z_THRESH).astype(int)
votes['chars_per_sent_vote'] = (features_df_z['chars_per_sent_zscore'] >= Z_THRESH).astype(int)
votes['sents_per_para_vote'] = (features_df_z['sents_per_para_zscore'] >= Z_THRESH).astype(int)
votes['avg_line_len_vote'] = (features_df_z['avg_line_len_zscore'] >= Z_THRESH).astype(int)
votes['word_density_vote'] = (features_df_z['word_density_zscore'] >= Z_THRESH).astype(int)
votes['citation_delta_vote'] = (features_df_z['citation_delta_zscore'] >= Z_THRESH).astype(int)

votes_df = pd.DataFrame(votes, index=features_df_z.index)

# Weighted sum (example weights; keep your own if you already have them)
WEIGHTS = {col: 1.0 for col in votes_df.columns}  # all 1.0 by default
# tweak specific ones if you want:
# WEIGHTS['nTTR_vote'] = 2.0
# WEIGHTS['citation_delta_vote'] = 2.0

# weighted sum
features_df_z['vote_sum'] = sum(WEIGHTS[col] * votes_df[col] for col in votes_df.columns)
features_df_z['ai_vote']  = (features_df_z['vote_sum'] >= THRESHOLD).astype(int)

weighted_sum = sum(WEIGHTS[k] * votes_df[k] for k in votes_df.columns)

# Final decision (example)
THRESHOLD = 2  # tune to your ROC/PR target
features_df_z['vote_sum'] = weighted_sum
features_df_z['ai_vote'] = (features_df_z['vote_sum'] >= THRESHOLD).astype(int)

# If you have a ground-truth-ish proxy (ai_flag), you can quickly sanity-check:
if 'ai_flag' in features_df_z.columns:
    from sklearn.metrics import classification_report

    print(classification_report(features_df_z['ai_flag'], features_df_z['ai_vote'], digits=3))


In [22]:
# shape of features_df_z
features_df_z.shape
# print first few rows
features_df_z

Unnamed: 0.1,Unnamed: 0,snapshot_ts,rev_id,timestamp,user,is_bot,content,article_id,title,root,...,fre_zscore,fog_zscore,chars_per_sent_zscore,sents_per_para_zscore,avg_line_len_zscore,nTTR_zscore,word_density_zscore,citation_delta_zscore,vote_sum,ai_vote
0,0,2023-06-30 00:00:00+00:00,1160763005,2023-06-18 16:30:41+00:00,2601:483:C301:7360:BC05:287F:176:F15F,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,-4.327842,6.045270,0.543670,-0.965933,-0.837440,-0.782697,0.110041,-1.315955,4.0,1
1,1,2023-07-31 00:00:00+00:00,1166013433,2023-07-18 21:17:52+00:00,TompaDompa,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,-4.281436,5.973193,0.464213,-0.958110,-0.840029,-0.698040,0.122510,-1.313476,4.0,1
2,2,2023-08-31 00:00:00+00:00,1171485250,2023-08-21 11:13:21+00:00,79.41.96.200,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,-4.326513,6.034735,0.475988,-0.958110,-0.837633,-0.698040,0.116385,-1.315791,4.0,1
3,3,2023-09-30 00:00:00+00:00,1177319025,2023-09-27 04:45:15+00:00,187.254.98.237,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,-4.309285,6.021318,0.543078,-0.965933,-0.837556,-0.698040,0.116189,-1.246176,4.0,1
4,4,2023-10-31 00:00:00+00:00,1181899435,2023-10-25 22:06:39+00:00,Idulatria,False,{{Use dmy dates|date=July 2021}}\n{{Short desc...,20951,List of empires,History,...,-4.323714,6.039556,0.476368,-0.958110,-0.837556,-0.698040,0.116189,-1.246176,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15827,15827,2023-12-31 00:00:00+00:00,1190712502,2023-12-19 11:44:12+00:00,Mika1h,False,{{more citations needed|date=April 2023}}\n{{I...,8654,Pac-Man All-Stars,Video games,...,-0.338468,-1.103966,-1.121026,-0.807131,-0.983865,1.306311,3.336936,0.040132,3.0,1
15828,15828,2024-01-31 00:00:00+00:00,1197552616,2024-01-21 00:52:22+00:00,Venky64,False,{{more citations needed|date=April 2023}}\n{{I...,8654,Pac-Man All-Stars,Video games,...,-0.353607,-1.112477,-1.147850,-0.807131,-0.986427,0.998153,3.260247,0.033969,3.0,1
15829,15829,2023-08-31 00:00:00+00:00,1172064757,2023-08-24 18:34:01+00:00,CurlyWi,False,{{Infobox video game\n| collapsible = \n| ital...,10527,Idol Showdown,Video games,...,0.133137,-1.828332,-0.920569,-0.558323,-0.754252,-0.533352,0.632742,-0.545341,0.0,0
15830,15830,2023-10-31 00:00:00+00:00,1181427482,2023-10-23 00:04:28+00:00,Radja Dwm,False,{{Infobox video game\n| collapsible = \n| ital...,10527,Idol Showdown,Video games,...,-0.036570,-1.441546,-0.863354,-0.500906,-0.694054,-0.279023,0.431931,-0.763382,0.0,0


In [20]:
# save to a file called normalized_everything100percat_with_ai_votes.csv
features_df_z.to_csv("normalized_everything100percat_with_ai_votes.csv", index=False)