In [5]:

print(df.columns)

Index(['Unnamed: 0', 'snapshot_ts', 'rev_id', 'timestamp', 'user', 'is_bot',
       'article_id', 'title', 'root', 'stratum', 'content', 'p_t',
       'lexical_spike_delta', 'perplexity', 'burstiness', 'upos_props',
       'mean_dep_depth', 'clause_ratio', 'voice_ratio', 'fre', 'fog',
       'chars_per_sent', 'sents_per_para', 'nTTR', 'word_density',
       'avg_line_len', 'citation_delta', 'p_t_zscore',
       'lexical_spike_delta_zscore', 'perplexity_zscore', 'burstiness_zscore',
       'mean_dep_depth_zscore', 'clause_ratio_zscore', 'voice_ratio_zscore',
       'fre_zscore', 'fog_zscore', 'chars_per_sent_zscore',
       'sents_per_para_zscore', 'avg_line_len_zscore', 'nTTR_zscore',
       'word_density_zscore', 'citation_delta_zscore', 'ai_vote_count',
       'ai_flag'],
      dtype='object')


In [1]:
import pandas as pd
import numpy as np

# --- 1. Load the Data ---
print("Loading data...")

csv_path = "normalized_everything100percat_with_ai_votes.csv"

try:
    df = pd.read_csv(csv_path)
    print(f"Loaded Data from {csv_path}: {len(df)} rows")

    # --- FIX: Handle Column Renaming Safely ---
    if 'plain_text' in df.columns:
        print("Found 'plain_text' column.")
        # Check if 'content' already exists and drop it to prevent duplicates
        if 'content' in df.columns:
            print("Existing 'content' column detected. Dropping it to prefer 'plain_text'...")
            df.drop(columns=['content'], inplace=True)

        df.rename(columns={'plain_text': 'content'}, inplace=True)
        print("Renamed 'plain_text' to 'content'.")
    elif 'content' in df.columns:
        print("Found 'content' column.")
    else:
        print("WARNING: No text column found in CSV.")

except FileNotFoundError:
    print(f"ERROR: Could not find {csv_path}.")
    raise

# Ensure timestamp is datetime
if 'snapshot_ts' in df.columns:
    df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])

# --- CRITICAL FIX: Handle Index Duplicates ---
# Check if index has duplicates
if not df.index.is_unique:
    print("WARNING: Duplicate index found. Resetting index...")
df = df.reset_index(drop=True)
print(f"Index unique: {df.index.is_unique}")

# --- 2. THE HUNT: Find Your Examples ---

INTERVENTION_DATE = '2022-11-01'
is_post_gpt = df['snapshot_ts'] > INTERVENTION_DATE

# === SEARCH 1: The "Passive Voice" Anomaly ===
# Context: You found a sharp drop in voice_ratio for Music/Video Games in Section 6.3.
print("\n" + "="*60)
print("SEARCH 1: Passive Voice Anomalies (Music/Video Games)")
print("="*60)

target_topics = ['Music', 'Video games']
mask_topic = df['root'].isin(target_topics)

# Ensure we are using the correct column and it is 1D
if 'content' in df.columns:
    # Defensive check: if duplicates still exist, take the first one
    if isinstance(df['content'], pd.DataFrame):
        print("Warning: Duplicate 'content' columns found. Selecting the first one.")
        content_series = df['content'].iloc[:, 0]
    else:
        content_series = df['content']
    mask_not_nan = content_series.notna()
else:
    mask_not_nan = pd.Series([False] * len(df), index=df.index)

# Combine masks
subset_mask = is_post_gpt & mask_topic & mask_not_nan

if len(subset_mask) != len(df):
    print("Error: Mask length mismatch!")
else:
    candidates = df.loc[subset_mask].copy()
    # Looking for LOW voice_ratio (passive voice)
    candidates = candidates.sort_values(by='voice_ratio', ascending=True).head(5)

    for i, row in candidates.iterrows():
        print(f"\n--- Candidate {i+1} [Topic: {row.get('root', 'N/A')}] ---")
        print(f"Date: {row.get('snapshot_ts', 'N/A')} | Voice Ratio: {row.get('voice_ratio', 0):.4f}")
        content = str(row.get('content', 'NO CONTENT FOUND'))
        print(f"Snippet:\n{content[:600]}...\n")




Loading data...
Loaded Data from normalized_everything100percat_with_ai_votes.csv: 15832 rows
Found 'plain_text' column.
Existing 'content' column detected. Dropping it to prefer 'plain_text'...
Renamed 'plain_text' to 'content'.
Index unique: True

SEARCH 1: Passive Voice Anomalies (Music/Video Games)

--- Candidate 7640 [Topic: Music] ---
Date: 2023-09-30 00:00:00+00:00 | Voice Ratio: 0.9716
Snippet:
thumb|[[dominican rite|dominican missal, c. 1240, giving a portion of the accentus (historical museum of lausanne).]] accentus (or accentus ecclesiasticus ecclesiastical accent is a style of church music that emphasizes spoken word. it is often contrasted with concentus, an alternative style that emphasizes harmony. the terms accentus and concentus were probably introduced by andreas ornithoparchus in his musicae activae micrologus, published in leipzig in 1517. "concentus might be chief ruler over all things that are sung...and accentus over all things that are read," according to ornit

In [35]:
import pandas as pd

# Load your dataframe as you did before
# df = ...

print("\n" + "="*60)
print("REFINED SEARCH 2: Vocabulary Dilution (Prose only)")
print("="*60)

# 1. Filter out "Lists" to find actual repetitive AI prose
mask_comp = df['root'] == 'Computing'
# Filter out rows where content starts with "List of" or "This is a list"
mask_no_lists = ~df['content'].str.lower().str.startswith(('list of', 'this is a list'), na=False)

candidates_vocab = df[is_post_gpt & mask_comp & mask_not_nan & mask_no_lists].copy()
candidates_vocab = candidates_vocab.sort_values(by='nTTR', ascending=True).head(5)

for i, row in candidates_vocab.iterrows():
    print(f"\n--- Candidate {i+1} [Computing - PROSE] ---")
    print(f"Date: {row.get('snapshot_ts', 'N/A')} | nTTR: {row.get('nTTR', 0):.4f}")
    print(f"Snippet:\n{str(row.get('content', ''))[:500]}...\n")


print("\n" + "="*60)
print("REFINED SEARCH 3: The 'Edit War' (Dip & Revert)")
print("="*60)

# 2. Calculate the 'Dip' and the 'Recovery' to find the Revert
music_df = df[df['root'] == 'Music'].sort_values(['article_id', 'snapshot_ts'])

music_df['prev_voice'] = music_df.groupby('article_id')['voice_ratio'].shift(1)
music_df['next_voice'] = music_df.groupby('article_id')['voice_ratio'].shift(-1)

# 'Drop' = Current - Previous (Did it drop?)
# 'Recovery' = Next - Current (Did it jump back up?)
music_df['drop'] = music_df['voice_ratio'] - music_df['prev_voice']
music_df['recovery'] = music_df['next_voice'] - music_df['voice_ratio']

# Look for a significant Drop followed by a significant Recovery
# Lowered threshold to 0.05 based on your previous output
edit_wars = music_df[
    (music_df['drop'] < -0.05) &
    (music_df['recovery'] > 0.05)
].sort_values('drop', ascending=True)

if not edit_wars.empty:
    top_case = edit_wars.iloc[0]
    print(f"\nFOUND EDIT WAR in Article ID: {top_case['article_id']}")
    print(f"Date of Anomaly: {top_case['snapshot_ts']}")
    print(f"The Drop (Change from prev): {top_case['drop']:.4f}")
    print(f"The Revert (Change to next): {top_case['recovery']:.4f}")

    print(f"\n--- THE 'BOT' REVISION (The Dip) ---")
    print(f"Voice Ratio: {top_case['voice_ratio']:.4f}")
    print(f"Text:\n{str(top_case.get('content', ''))[:600]}...\n")
else:
    print("No strict edit wars found. The anomaly might be in the 'Video games' category instead?")


REFINED SEARCH 2: Vocabulary Dilution (Prose only)

--- Candidate 1228 [Computing - PROSE] ---
Date: 2023-01-31 00:00:00+00:00 | nTTR: 4.2905
Snippet:
thumb|[[ultraviolet photography, a visual technology that has applications in astronomy]] visual technology is the engineering discipline dealing with visual representation. ==types== visual technology includes photography, printing, augmented reality, virtual reality and video. ==see also== *audiovisual *audiovisual education *information and communications technology *medical imaging *multimedia *technology *visual arts *visual culture *visual perception *visual sociology ==references== ===wor...


--- Candidate 1118 [Computing - PROSE] ---
Date: 2023-01-31 00:00:00+00:00 | nTTR: 4.3466
Snippet:
an information system contingency plan (iscp is a pre-established plan for restoration of the services of a given information system after a disruption. the us national institute of standards and technology computer security resource center (c

In [34]:
import pandas as pd

# Load your dataframe (Assuming it's already loaded in your environment as 'df')
# If not, uncomment the lines below:
# df = pd.read_csv("normalized_everything100percat_with_ai_votes.csv")
# if 'content' in df.columns: df.drop(columns=['content'], inplace=True)
# df.rename(columns={'plain_text': 'content'}, inplace=True)
# df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
# df = df.reset_index(drop=True)

print("\n" + "="*60)
print("FINAL SEARCH: The 'Edit War' in Video Games")
print("="*60)

# 1. Focus on Video Games
vg_df = df[df['root'] == 'Video games'].copy()
vg_df = vg_df.sort_values(['article_id', 'snapshot_ts'])

# 2. Calculate Shifts (Previous and Next month's voice ratio)
vg_df['prev_voice'] = vg_df.groupby('article_id')['voice_ratio'].shift(1)
vg_df['next_voice'] = vg_df.groupby('article_id')['voice_ratio'].shift(-1)

# 3. Define the Anatomy of an Edit War
# The Dip: Current voice is much lower than Previous
vg_df['drop_size'] = vg_df['prev_voice'] - vg_df['voice_ratio']
# The Revert: Next voice is much higher than Current
vg_df['recovery_size'] = vg_df['next_voice'] - vg_df['voice_ratio']

# 4. Filter for the "V-Shape" Pattern
# We look for a massive drop (> 0.3) followed by a massive recovery (> 0.3)
# This corresponds to the visual "spike down" in your plots.
candidates = vg_df[
    (vg_df['drop_size'] > 0.3) &
    (vg_df['recovery_size'] > 0.3)
]

if not candidates.empty:
    top_case = candidates.sort_values('drop_size', ascending=False).iloc[0]

    print(f"FOUND SMOKING GUN in Article ID: {top_case['article_id']}")
    print(f"Title: {top_case.get('title', 'Unknown')}")
    print(f"Date of Bot Attack: {top_case['snapshot_ts']}")
    print(f"Voice Ratio (The Dip): {top_case['voice_ratio']:.4f}")
    print(f"Previous Ratio: {top_case['prev_voice']:.4f} -> Dropped by {top_case['drop_size']:.4f}")
    print(f"Next Ratio: {top_case['next_voice']:.4f} -> Recovered by {top_case['recovery_size']:.4f}")

    print(f"\n--- THE ANOMALY TEXT (Passive Voice Bot) ---")
    print(f"{str(top_case.get('content', ''))[:1000]}...")

else:
    print("No massive V-shaped edit wars found. Trying looser thresholds (0.1)...")
    candidates_loose = vg_df[(vg_df['drop_size'] > 0.1) & (vg_df['recovery_size'] > 0.1)]
    if not candidates_loose.empty:
        top_case = candidates_loose.sort_values('drop_size', ascending=False).iloc[0]
        print(f"Found smaller edit war in Article: {top_case['title']}")
        print(f"Date: {top_case['snapshot_ts']}")
        print(f"Content snippet: {str(top_case.get('content', ''))[:300]}...")
    else:
        print("Still nothing. Check if the 'Video games' category name is correct in your CSV.")


FINAL SEARCH: The 'Edit War' in Video Games
FOUND SMOKING GUN in Article ID: 19120
Title: Revolution 60
Date of Bot Attack: 2023-03-31 00:00:00+00:00
Voice Ratio (The Dip): 0.0000
Previous Ratio: 0.9901 -> Dropped by 0.9901
Next Ratio: 0.9879 -> Recovered by 0.9879

--- THE ANOMALY TEXT (Passive Voice Bot) ---
nan...


In [20]:
print(df)

       Unnamed: 0               snapshot_ts      rev_id  \
0               0 2023-06-30 00:00:00+00:00  1160763005   
1               1 2023-07-31 00:00:00+00:00  1166013433   
2               2 2023-08-31 00:00:00+00:00  1171485250   
3               3 2023-09-30 00:00:00+00:00  1177319025   
4               4 2023-10-31 00:00:00+00:00  1181899435   
...           ...                       ...         ...   
15827       15827 2023-12-31 00:00:00+00:00  1190712502   
15828       15828 2024-01-31 00:00:00+00:00  1197552616   
15829       15829 2023-08-31 00:00:00+00:00  1172064757   
15830       15830 2023-10-31 00:00:00+00:00  1181427482   
15831       15831 2023-11-30 00:00:00+00:00  1184252783   

                       timestamp                                   user  \
0      2023-06-18 16:30:41+00:00  2601:483:C301:7360:BC05:287F:176:F15F   
1      2023-07-18 21:17:52+00:00                             TompaDompa   
2      2023-08-21 11:13:21+00:00                           79.41.9

In [38]:
import pandas as pd
import numpy as np

# --- 1. Load Data ---
print("Loading data...")
csv_path = "normalized_everything100percat_with_ai_votes.csv"

try:
    df = pd.read_csv(csv_path)
    # Robust Column Cleaning
    if 'plain_text' in df.columns:
        if 'content' in df.columns: df.drop(columns=['content'], inplace=True)
        df.rename(columns={'plain_text': 'content'}, inplace=True)

    # Fix Types
    if 'snapshot_ts' in df.columns: df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
    df = df.reset_index(drop=True)

    # Filter for Post-ChatGPT
    is_post_gpt = df['snapshot_ts'] > '2022-11-01'
    # Ensure text exists
    has_text = df['content'].notna() & (df['content'].str.len() > 100)

    print("Data loaded and filtered.")

    # --- SEARCH 1: The "Unsourced Generator" (Citation Drops) ---
    print("\n" + "="*60)
    print("SEARCH 1: The 'Unsourced Generator' (Citation Drops)")
    print("="*60)
    # Logic: High AI Votes BUT Negative Citation Delta (Removing refs while writing)
    # We want valid text, Post-GPT, High Votes, Negative Citations

    mask_citation = (df['citation_delta_zscore'] < -1.0) # Significantly negative
    mask_high_risk = (df['ai_vote_count'] >= 2) # At least suspicious

    candidates_cit = df[is_post_gpt & has_text & mask_citation & mask_high_risk].copy()

    # Sort by how MANY citations were dropped (lowest zscore)
    candidates_cit = candidates_cit.sort_values('citation_delta_zscore', ascending=True).head(3)

    for i, row in candidates_cit.iterrows():
        print(f"\n--- Candidate {i+1} [Topic: {row.get('root')}] ---")
        print(f"Date: {row.get('snapshot_ts')} | AI Votes: {row.get('ai_vote_count')}")
        print(f"Citation Delta Z-Score: {row.get('citation_delta_zscore'):.4f} (Removing Refs!)")
        print(f"Snippet:\n{str(row.get('content'))[:400]}...\n")

    # --- SEARCH 2: The "Narrative Bot" (Political History) ---
    print("\n" + "="*60)
    print("SEARCH 2: The 'Narrative Bot' (Political History)")
    print("="*60)
    # Logic: Political History, High Confidence (Votes >= 3), Not a List

    mask_pol = df['root'] == 'Political history'
    mask_confident = df['ai_vote_count'] >= 3
    # Filter out "List of" titles to get pure narrative
    mask_narrative = ~df['title'].str.startswith('List of', na=False)

    candidates_pol = df[is_post_gpt & has_text & mask_pol & mask_confident & mask_narrative].copy()

    # Sort by date (latest first to see recent advanced bots) or Voice Ratio (passive)
    candidates_pol = candidates_pol.sort_values('snapshot_ts', ascending=False).head(3)

    for i, row in candidates_pol.iterrows():
        print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")
        print(f"Date: {row.get('snapshot_ts')} | AI Votes: {row.get('ai_vote_count')}")
        print(f"Voice Ratio: {row.get('voice_ratio'):.4f} | Perplexity: {row.get('perplexity'):.4f}")
        print(f"Snippet:\n{str(row.get('content'))[:400]}...\n")

    # --- SEARCH 3: The "Subtle Polisher" (Chemistry) ---
    print("\n" + "="*60)
    print("SEARCH 3: The 'Subtle Polisher' (Chemistry)")
    print("="*60)
    # Logic: Chemistry, Votes == 2 (High Sensitivity only), Post-GPT
    # This finds the "hidden" AI that your High-Reliability model missed

    mask_chem = df['root'] == 'Chemistry'
    mask_subtle = df['ai_vote_count'] == 2 # Exactly 2 votes (The "Maybe" zone)

    candidates_chem = df[is_post_gpt & has_text & mask_chem & mask_subtle].copy()
    candidates_chem = candidates_chem.head(3)

    for i, row in candidates_chem.iterrows():
        print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")
        print(f"Date: {row.get('snapshot_ts')} | AI Votes: {row.get('ai_vote_count')}")
        print(f"Snippet:\n{str(row.get('content'))[:400]}...\n")

except Exception as e:
    print(f"Error: {e}")

Loading data...
Data loaded and filtered.

SEARCH 1: The 'Unsourced Generator' (Citation Drops)

--- Candidate 15198 [Topic: Video games] ---
Date: 2023-10-31 00:00:00+00:00 | AI Votes: 5
Citation Delta Z-Score: -2.3390 (Removing Refs!)
Snippet:
{{multiple issues| }} {{infobox video game |image=csi crime city.jpg |caption= |developer=area/code |publisher=ubisoft |designer= |engine= |released 2010 |genre=adventure |modes=single player |platforms=facebook }} csi crime city was the eleventh video game adaptation of the csi crime scene investigation television series, developed for facebook by american studio area/code and published by ubisof...


--- Candidate 6826 [Topic: Medicine] ---
Date: 2023-10-31 00:00:00+00:00 | AI Votes: 2
Citation Delta Z-Score: -1.9141 (Removing Refs!)
Snippet:
{{multiple issues| }} pancrinol was a medicine made from liver, spleen, kidney, and adrenal extracts from slaughter animals. it was manufactured by the laboratories of dr. françois debat in paris. this d

In [40]:
import pandas as pd
import numpy as np
import re

# --- 1. Load Data ---
print("Loading data...")
csv_path = "normalized_everything100percat_with_ai_votes.csv"

try:
    df = pd.read_csv(csv_path)
    if 'plain_text' in df.columns:
        if 'content' in df.columns: df.drop(columns=['content'], inplace=True)
        df.rename(columns={'plain_text': 'content'}, inplace=True)

    if 'snapshot_ts' in df.columns: df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
    df = df.reset_index(drop=True)

    is_post_gpt = df['snapshot_ts'] > '2022-11-01'
    has_text = df['content'].notna() & (df['content'].str.len() > 100)

    print("Data loaded and filtered.")

    # --- HELPER: Smart Context Extractor ---
    def get_context_snippet(text, keywords, window=250):
        """Finds the keyword in the text and returns a window around it."""
        text_str = str(text)
        # Create a regex pattern to find any of the keywords
        pattern = '|'.join(map(re.escape, keywords)) if isinstance(keywords, list) else keywords

        match = re.search(pattern, text_str, re.IGNORECASE)
        if match:
            # Found it! Calculate start/end to center the match
            start = max(0, match.start() - window)
            end = min(len(text_str), match.end() + window)
            prefix = "..." if start > 0 else ""
            suffix = "..." if end < len(text_str) else ""
            return f"{prefix}{text_str[start:end]}{suffix}"
        else:
            # Fallback: Return the beginning if no specific keyword match (for style searches)
            return text_str[:500] + "..."

    # --- SEARCH 4: The "Model Leak" (Context-Aware) ---
    print("\n" + "="*60)
    print("SEARCH 4: The 'Model Leak' (Showing the ACTUAL Artifact)")
    print("="*60)

    leak_keywords = [
        "as an ai language model", "i cannot", "i don't have personal",
        "knowledge cutoff", "regenerate response", "september 2021"
    ]

    # Use regex for filtering
    pattern = '|'.join(leak_keywords)
    mask_leak = df['content'].str.contains(pattern, case=False, na=False)
    candidates_leak = df[is_post_gpt & has_text & mask_leak].copy()

    if not candidates_leak.empty:
        for i, row in candidates_leak.head(5).iterrows():
            print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")
            print(f"Date: {row.get('snapshot_ts')} | AI Votes: {row.get('ai_vote_count')}")

            # USE THE NEW FUNCTION HERE
            snippet = get_context_snippet(row.get('content'), leak_keywords)
            print(f"Snippet:\n{snippet}\n")
    else:
        print("No obvious leaks found.")


    # --- SEARCH 5: The "History Sanitizer" (Passive Voice) ---
    print("\n" + "="*60)
    print("SEARCH 5: The 'History Sanitizer' (Politics + Passive Voice)")
    print("="*60)

    mask_politics = df['root'].isin(['Political history', 'Politics', 'Military history'])
    candidates_sanitizer = df[is_post_gpt & has_text & mask_politics & (df['ai_vote_count'] >= 3)].copy()
    candidates_sanitizer = candidates_sanitizer.sort_values('voice_ratio', ascending=True).head(3)

    for i, row in candidates_sanitizer.iterrows():
        print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")
        print(f"Date: {row.get('snapshot_ts')} | Voice Ratio: {row.get('voice_ratio'):.4f}")
        # For style, we want the intro, but let's grab MORE text (1000 chars) to see the pattern
        print(f"Snippet:\n{str(row.get('content'))[:1000]}...\n")


    # --- SEARCH 6: The "Cultural Flattener" (Robotic Pop Culture) ---
    print("\n" + "="*60)
    print("SEARCH 6: The 'Cultural Flattener' (Pop Culture + Low Burstiness)")
    print("="*60)

    mask_pop = df['root'].isin(['Popular culture', 'Film', 'Video games'])
    candidates_flat = df[is_post_gpt & has_text & mask_pop & (df['ai_vote_count'] >= 3)].copy()
    candidates_flat = candidates_flat.sort_values('burstiness', ascending=True).head(3)

    for i, row in candidates_flat.iterrows():
        print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")
        print(f"Date: {row.get('snapshot_ts')} | Burstiness: {row.get('burstiness'):.4f}")
        # Grab more text to show the repetitive structure
        print(f"Snippet:\n{str(row.get('content'))[:1000]}...\n")

except Exception as e:
    print(f"Error: {e}")

Loading data...
Data loaded and filtered.

SEARCH 4: The 'Model Leak' (Showing the ACTUAL Artifact)

--- Candidate 116 [Title: Extrajudicial killing] ---
Date: 2022-11-30 00:00:00+00:00 | AI Votes: 0
Snippet:
...go. egypt ==== extrajudicial killings and death squads are common in egypt. egypt recorded and reported more than a dozen unlawful extrajudicial killings of apparent ?terrorists? in the country by the nsa officers and the interior ministry police in september 2021. a 101-page report detailed the ?armed militants? being killed in shootouts despite not posing any threat to the security forces or nations of the country while being killed, which in many cases were already in custody. statements by the family and ...


--- Candidate 117 [Title: Extrajudicial killing] ---
Date: 2022-12-31 00:00:00+00:00 | AI Votes: 0
Snippet:
...go. egypt ==== extrajudicial killings and death squads are common in egypt. egypt recorded and reported more than a dozen unlawful extrajudicial killings of 

In [41]:
import pandas as pd
import numpy as np
import re

# --- 1. Load Data ---
print("Loading data...")
csv_path = "normalized_everything100percat_with_ai_votes.csv"

try:
    df = pd.read_csv(csv_path)
    if 'plain_text' in df.columns:
        if 'content' in df.columns: df.drop(columns=['content'], inplace=True)
        df.rename(columns={'plain_text': 'content'}, inplace=True)

    if 'snapshot_ts' in df.columns: df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
    df = df.reset_index(drop=True)

    is_post_gpt = df['snapshot_ts'] > '2022-11-01'
    has_text = df['content'].notna() & (df['content'].str.len() > 100)

    # --- SEARCH 4 (REFINED): The "True" Model Leak ---
    print("\n" + "="*60)
    print("SEARCH 4: The 'True' Model Leak (Refusals Only)")
    print("="*60)

    # REMOVED "september 2021" and "knowledge cutoff" to avoid false positives
    leak_keywords = [
        "as an ai language model",
        "i cannot",
        "i don't have personal",
        "regenerate response",
        "i am an ai"
    ]

    pattern = '|'.join(leak_keywords)
    mask_leak = df['content'].str.contains(pattern, case=False, na=False)
    candidates_leak = df[is_post_gpt & has_text & mask_leak].copy()

    if not candidates_leak.empty:
        print(f"FOUND {len(candidates_leak)} TRUE LEAKS!")
        for i, row in candidates_leak.head(3).iterrows():
            print(f"\n--- Candidate {i+1} [Title: {row.get('title')}] ---")

            # Smart Snippet: Find the keyword and show context
            text_str = str(row.get('content'))
            for key in leak_keywords:
                match = re.search(re.escape(key), text_str, re.IGNORECASE)
                if match:
                    start = max(0, match.start() - 100)
                    end = min(len(text_str), match.end() + 100)
                    print(f"MATCHED KEYWORD: '{key}'")
                    print(f"Snippet: ...{text_str[start:end]}...\n")
                    break # Show only the first match per article
    else:
        print("No text-based leaks found (Cleaned by editors?).")

except Exception as e:
    print(f"Error: {e}")

Loading data...

SEARCH 4: The 'True' Model Leak (Refusals Only)
FOUND 66 TRUE LEAKS!

--- Candidate 2096 [Title: Socialist democracy] ---
MATCHED KEYWORD: 'i cannot'
Snippet: ...the abolition of private property and to the possibilities inherent in planned economy. but, they - i cannot say exactly - but i will say two or three times less than they could be under a regime of soviet de...


--- Candidate 2097 [Title: Socialist democracy] ---
MATCHED KEYWORD: 'i cannot'
Snippet: ...the abolition of private property and to the possibilities inherent in planned economy. but, they - i cannot say exactly - but i will say two or three times less than they could be under a regime of soviet de...


--- Candidate 2098 [Title: Socialist democracy] ---
MATCHED KEYWORD: 'i cannot'
Snippet: ...the abolition of private property and to the possibilities inherent in planned economy. but, they - i cannot say exactly - but i will say two or three times less than they could be under a regime of soviet de.

In [19]:
import pandas as pd
import numpy as np
import re

# --- Configuration ---
DATA_CSV_PATH = "normalized_everything100percat_with_ai_votes.csv"
# Update this path if needed (e.g., "python_code/combined_chatgpt_words.csv")
WORDS_CSV_PATH = "combined_chatgpt_words.csv"

print("Loading data...")

try:
    # 1. Load Data & Words
    df = pd.read_csv(DATA_CSV_PATH)
    try:
        words_df = pd.read_csv(WORDS_CSV_PATH)
        raw_vocab = set(words_df.iloc[:, 0].dropna().astype(str).str.lower())
    except Exception as e:
        # Fallback if file not found
        print(f"Warning: Could not load word list ({e}). Using fallback.")
        raw_vocab = {"delve", "tapestry", "landscape", "testament", "underscore", "intricate", "paramount", "leverage", "robust", "seamless"}

    # 2. Clean the Data
    if 'plain_text' in df.columns:
        if 'content' in df.columns: df.drop(columns=['content'], inplace=True)
        df.rename(columns={'plain_text': 'content'}, inplace=True)
    if 'snapshot_ts' in df.columns: df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
    df = df.reset_index(drop=True)

    # 3. Refine the Vocabulary (Remove common false positives)
    banned_words = {
        'data', 'source', 'information', 'system', 'list', 'image', 'file', 'link',
        'external', 'history', 'series', 'version', 'development', 'release',
        'game', 'film', 'music', 'album', 'song', 'voice', 'social', 'party',
        'ongoing', 'include', 'current', 'world', 'science', 'note', 'action',
        'university', 'school', 'research', 'area', 'field'
    }
    spicy_vocab = [w for w in raw_vocab if w not in banned_words and len(w) > 4]

    print(f"Refined Vocabulary: {len(spicy_vocab)} words")

    # 4. Target Specific Categories
    target_roots = ['Computing', 'Technology', 'Engineering', 'Business']

    subset = df[
        (df['snapshot_ts'] > '2022-11-01') &
        (df['root'].isin(target_roots)) &
        (df['content'].notna()) &
        (df['content'].str.len() > 500)
    ].copy()

    # --- THE SMART SCORER ---
    def get_prose_score(row):
        text = str(row['content']).lower()
        title_words = set(str(row['title']).lower().split())

        # Contextual Filtering: Ignore words if they are in the Title
        local_vocab = [w for w in spicy_vocab if w not in title_words]

        # Count hits (checking common boundaries)
        # Using regex here would be slower but more accurate; simple string match is fast
        hits = [w for w in local_vocab if f" {w} " in text or f" {w}." in text or f" {w}," in text]
        unique_hits = len(set(hits))

        # Penalize lists (high newline ratio)
        if text.count('\n') > text.count(' ') / 5:
            return 0, []

        return unique_hits, list(set(hits))

    # Apply scoring
    subset[['spicy_score', 'spicy_words']] = subset.apply(
        lambda row: pd.Series(get_prose_score(row)), axis=1
    )

    # --- FIX 1: DEDUPLICATION ---
    # Sort by score descending first so we keep the "spiciest" revision
    subset = subset.sort_values('spicy_score', ascending=False)
    # Drop duplicates by article_id, keeping the first (highest score)
    unique_candidates = subset.drop_duplicates(subset='article_id', keep='first')

    # Get top 5 unique articles
    top_prose = unique_candidates.head(5)

    # --- FIX 2: IMPROVED SNIPPET EXTRACTOR ---
    def extract_densest_snippet(text, target_words, window_size=600):
        """
        Finds the window of text containing the highest density of target words.
        """
        text_lower = text.lower()
        word_indices = []

        # Find all start indices of target words using Regex for proper boundaries
        for w in target_words:
            for match in re.finditer(r'\b' + re.escape(w) + r'\b', text_lower):
                word_indices.append(match.start())

        if not word_indices:
            return text[:window_size] + "..." # Fallback

        word_indices.sort()

        # Sliding window to find max density
        max_words_in_window = 0
        best_center = word_indices[0]

        # Check around each word occurrence
        for center_idx in word_indices:
            start_window = max(0, center_idx - window_size // 2)
            end_window = min(len(text), center_idx + window_size // 2)

            # Count how many keyword hits fall inside this window
            count = sum(1 for wi in word_indices if start_window <= wi < end_window)

            if count > max_words_in_window:
                max_words_in_window = count
                best_center = center_idx

        # Extract the best window
        start = max(0, best_center - window_size // 2)
        end = min(len(text), best_center + window_size // 2)

        # Clean up newlines for display
        snippet = text[start:end].replace('\n', ' ')
        return f"...{snippet}..."

    print("\n" + "="*60)
    print("SEARCH RESULT: The 'Purple Prose' Generators (Unique & Focused)")
    print("="*60)

    for i, row in top_prose.iterrows():
        print(f"\n--- Candidate {i} [Topic: {row['root']} | Title: {row['title']}] ---")
        print(f"Unique AI Words: {row['spicy_score']} | Votes: {row['ai_vote_count']}")
        print(f"Words Found: {row['spicy_words'][:10]}...")

        snippet = extract_densest_snippet(str(row['content']), row['spicy_words'])
        print(f"Snippet:\n{snippet}\n")

except Exception as e:
    print(f"Error: {e}")

Loading data...
Refined Vocabulary: 284 words

SEARCH RESULT: The 'Purple Prose' Generators (Unique & Focused)

--- Candidate 13367 [Topic: Technology | Title: Glossary of engineering: M?Z] ---
Unique AI Words: 86 | Votes: 2
Words Found: ['drive', 'stability', 'ultimately', 'integrate', 'address', 'interact', 'significant', 'efficient', 'critical', 'arguably']...
Snippet:
...ral tendency")</ref><ref name=dodge1>dodge, y. (2003 the oxford dictionary of statistical terms, oup for international statistical institute. (entry for "central tendency")</ref> the central tendency of a distribution is typically contrasted with its dispersion or variability dispersion and central tendency are the often characterized properties of distributions. analysis may judge whether data has a strong or a weak central tendency based on its dispersion.}} </ref an ideal mechanism transmits power without adding to or subtracting from it. this means the ideal mechanism does not include a po...


--- Candidate 11

In [12]:
# print row for “Acts of Union 1707” (Political History, Jan 2024).


# print(df[df['title'] == 'Acts of Union 1707'])

# print row for Khivan campaign of 1839–1840"
# print(df[df['title'] == 'Khivan campaign of 1839–1840'])

# print rows whose tittle contains Khivan campaign
print(df[df['title'].str.contains('Khivan campaign')])

      Unnamed: 0               snapshot_ts      rev_id  \
9005        9005 2022-01-31 00:00:00+00:00  1064889049   
9006        9006 2022-02-28 00:00:00+00:00  1072179533   
9007        9007 2022-09-30 00:00:00+00:00  1107789906   
9008        9008 2022-10-31 00:00:00+00:00  1117528644   
9009        9009 2023-02-28 00:00:00+00:00  1137662854   
9010        9010 2023-04-30 00:00:00+00:00  1149493915   
9011        9011 2023-08-31 00:00:00+00:00  1172967014   
9012        9012 2023-10-31 00:00:00+00:00  1179966538   
9013        9013 2023-12-31 00:00:00+00:00  1187624065   
9014        9014 2024-01-31 00:00:00+00:00  1197452333   

                      timestamp               user  is_bot  article_id  \
9005  2022-01-10 18:26:18+00:00      MrBismark1871   False       23097   
9006  2022-02-16 09:53:18+00:00   Benjamin Trovato   False       23097   
9007  2022-08-31 22:25:44+00:00        Jay D. Easy   False       23097   
9008  2022-10-22 06:14:49+00:00    124.246.112.183   False       

In [14]:
import pandas as pd

# --- Configuration ---
DATA_CSV_PATH = "normalized_everything100percat_with_ai_votes.csv"

print("Loading data...")
try:
    df = pd.read_csv(DATA_CSV_PATH)

    # 1. Clean Dates & Fix Timezone
    if 'snapshot_ts' in df.columns:
        df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
        # Standardize to UTC to avoid comparison errors
        if df['snapshot_ts'].dt.tz is None:
             df['snapshot_ts'] = df['snapshot_ts'].dt.tz_localize('UTC')
        else:
             df['snapshot_ts'] = df['snapshot_ts'].dt.tz_convert('UTC')

    # 2. Define ChatGPT Release Date (UTC)
    GPT_RELEASE_DATE = pd.Timestamp('2022-11-30').tz_localize('UTC')

    # 3. Create "Pre" and "Post" Groups
    pre_gpt = df[df['snapshot_ts'] < GPT_RELEASE_DATE]
    post_gpt = df[df['snapshot_ts'] >= GPT_RELEASE_DATE]

    # 4. Aggregate Scores
    pre_stats = pre_gpt.groupby(['article_id', 'title', 'root'])['ai_vote_count'].mean().reset_index(name='pre_score')
    post_stats = post_gpt.groupby(['article_id', 'title', 'root'])['ai_vote_count'].mean().reset_index(name='post_score')

    # 5. Merge and Calculate Delta
    comparison = pd.merge(pre_stats, post_stats, on=['article_id', 'title', 'root'], how='inner')
    comparison['delta'] = comparison['post_score'] - comparison['pre_score']

    # 6. Filter for TRUE Transformations
    # pre_score < 1.5 (Was Human)
    # post_score >= 3.0 (Became AI)
    # delta >= 2.0 (Significant Jump)
    transformations = comparison[
        (comparison['pre_score'] < 1.5) &
        (comparison['post_score'] >= 3.0) &
        (comparison['delta'] >= 2.0)
    ].sort_values('delta', ascending=False)

    print("\n" + "="*80)
    print(f"FOUND {len(transformations)} ARTICLES WITH A SIGNIFICANT SHIFT")
    print("="*80)

    if not transformations.empty:
        print(transformations[['title', 'root', 'pre_score', 'post_score', 'delta']].head(5))

        # Extract the best example for your thesis
        top_row = transformations.iloc[0]
        t_id = top_row['article_id']
        t_title = top_row['title']

        print(f"\n--- Best Candidate: {t_title} ---")

        # Get the text before and after the shift
        history = df[df['article_id'] == t_id].sort_values('snapshot_ts')

        # Get one human revision (pre-Nov 2022)
        human_rev = history[history['snapshot_ts'] < GPT_RELEASE_DATE].tail(1)
        # Get one AI revision (post-Nov 2022 with high vote)
        ai_rev = history[(history['snapshot_ts'] >= GPT_RELEASE_DATE) & (history['ai_vote_count'] >= 4)].head(1)

        if not human_rev.empty and not ai_rev.empty:
            print("\nBEFORE (Human):")
            print(human_rev['content'].values[0][:300] + "...")
            print("\nAFTER (AI):")
            print(ai_rev['content'].values[0][:300] + "...")

    else:
        print("No articles found with strict criteria. Try pre_score < 2.0.")

except Exception as e:
    print(f"Error: {e}")

Loading data...

FOUND 0 ARTICLES WITH A SIGNIFICANT SHIFT
No articles found with strict criteria. Try pre_score < 2.0.


In [16]:
import pandas as pd

# --- Configuration ---
DATA_CSV_PATH = "normalized_everything100percat_with_ai_votes.csv"

print("Loading data...")
try:
    df = pd.read_csv(DATA_CSV_PATH)

    # 1. Clean Dates & Fix Timezone
    if 'snapshot_ts' in df.columns:
        df['snapshot_ts'] = pd.to_datetime(df['snapshot_ts'])
        if df['snapshot_ts'].dt.tz is None:
             df['snapshot_ts'] = df['snapshot_ts'].dt.tz_localize('UTC')
        else:
             df['snapshot_ts'] = df['snapshot_ts'].dt.tz_convert('UTC')

    GPT_RELEASE_DATE = pd.Timestamp('2022-11-30').tz_localize('UTC')

    # 2. Split Data
    pre_gpt = df[df['snapshot_ts'] < GPT_RELEASE_DATE]
    post_gpt = df[df['snapshot_ts'] >= GPT_RELEASE_DATE]

    # 3. The "Spike" Strategy (Compare MAX scores, not Means)
    # Find the highest AI score an article EVER received in each period
    pre_max = pre_gpt.groupby(['article_id', 'title', 'root'])['ai_vote_count'].max().reset_index(name='pre_max')
    post_max = post_gpt.groupby(['article_id', 'title', 'root'])['ai_vote_count'].max().reset_index(name='post_max')

    # 4. Merge
    comparison = pd.merge(pre_max, post_max, on=['article_id', 'title', 'root'], how='inner')

    # 5. Filter: The "Clean Hit" Candidates
    # - Before GPT: It NEVER looked like AI (Max < 2)
    # - After GPT: It had at least one HIGH confidence AI flag (Max >= 4)
    # - This eliminates the "Structural False Positives" like Acts of Union (which had high scores before)
    candidates = comparison[
        (comparison['pre_max'] <= 2) &
        (comparison['post_max'] >= 4)
    ].sort_values('post_max', ascending=False)

    print("\n" + "="*80)
    print(f"FOUND {len(candidates)} ARTICLES WITH A SUDDEN AI SPIKE")
    print("="*80)

    if not candidates.empty:
        print(candidates.head(10))

        # --- Extract the "Smoking Gun" Revision ---
        top_row = candidates.iloc[0]
        t_id = top_row['article_id']
        t_title = top_row['title']

        print(f"\n--- DEEP DIVE: {t_title} ---")

        # Get the specific revision that triggered the spike
        spike_rev = post_gpt[
            (post_gpt['article_id'] == t_id) &
            (post_gpt['ai_vote_count'] == top_row['post_max'])
        ].iloc[0]

        print(f"Spike Date: {spike_rev['snapshot_ts']}")
        print(f"AI Vote Count: {spike_rev['ai_vote_count']}")
        print(f"Snippet:\n{str(spike_rev.get('content', ''))[:500]}...")

    else:
        print("Still no results. This confirms that High AI Scores in this dataset are predominantly Structural (Pre-existing).")

except Exception as e:
    print(f"Error: {e}")

Loading data...

FOUND 3 ARTICLES WITH A SUDDEN AI SPIKE
      article_id                                    title         root  \
934        19120                            Revolution 60  Video games   
643        13515  Centre for Free Elections and Democracy    Elections   
1064       21631                             1981 in film         Film   

      pre_max  post_max  
934         0         9  
643         2         4  
1064        2         4  

--- DEEP DIVE: Revolution 60 ---
Spike Date: 2023-03-31 00:00:00+00:00
AI Vote Count: 9
Snippet:
nan...


In [17]:
# Filter for the specific legitimate candidates we found
target_titles = ["Centre for Free Elections and Democracy", "1981 in film"]

print("Fetching snippets for valid candidates...\n")

for title in target_titles:
    # Get revisions for this article
    article_revs = df[df['title'] == title]

    # Find the "Spike" revision (Score >= 4)
    spike_rev = article_revs[article_revs['ai_vote_count'] >= 4].sort_values('snapshot_ts').iloc[0]

    print("="*60)
    print(f"TITLE: {title}")
    print(f"Spike Date: {spike_rev['snapshot_ts']}")
    print(f"Score: {spike_rev['ai_vote_count']}")
    print("-" * 20)
    # Print the first 500 characters of the content
    content = str(spike_rev.get('content', 'No content found'))
    print(f"Snippet:\n{content[:500]}...")
    print("="*60 + "\n")

Fetching snippets for valid candidates...

TITLE: Centre for Free Elections and Democracy
Spike Date: 2023-12-31 00:00:00+00:00
Score: 4
--------------------
Snippet:
{{short description|Non-governmental organization in Serbia}}
{{Third-party|date=December 2023}}
{{use dmy dates|date=January 2023}}
{{Infobox organization
| name                = CeSID
| full_name           = Centre for Free Elections and Democracy
| native_name         = ?????? ?? ???????? ?????? ? ???????????<br />Centar za slobodne izbore i demokratiju
| native_name_lang    = sr
| logo                = CeSID logo.png
| formation           = 1997
| founder             = [[Marko Blagojevi? (bo...

TITLE: 1981 in film
Spike Date: 2024-01-31 00:00:00+00:00
Score: 4
--------------------
Snippet:
{{short description|Overview of the events of 1981 in film}}
{{Year nav topic5|1981|film|radio|television|music}}
{{Yearsinfilm}}
The following is an overview of events in '''1981 in film''', including the highest-grossing films, a