In [16]:
import requests
import pandas as pd
import spacy
from textstat import textstat
import re
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch
import pickle





In [11]:
# Report the PyTorch build and GPU visibility before loading the model.
print(torch.__version__)
print(torch.version.cuda)  # Should not be None on a CUDA-enabled build
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# get_device_name(0) raises when no CUDA device is usable, which would abort this
# cell before the CPU fallback below is reached; only query it when a GPU exists.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

# Load the pretrained GPT-2 tokenizer and LM head model, then put the model in eval mode.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()
# Move the model to the GPU only when one is available; otherwise it stays on the CPU.
if torch.cuda.is_available():
    print("cuda available!")
    model.to("cuda")
else:
    print('cuda not available!')


2.7.1+cu118
11.8
True
1
NVIDIA GeForce RTX 2070 with Max-Q Design
cuda available!


In [2]:
# Read the Kaggle essay dataset from CSV.
# `path` is relative to the notebook's working directory and is reused by the
# lexical-spike cell, which reloads the same file.
path = "kaggle_AI_dataset/Training_Essay_Data.csv"
kaggle_df = pd.read_csv(path)

In [3]:
# Display the loaded frame to inspect its contents and size
# (columns seen in the output: text, generated).
kaggle_df

Unnamed: 0,text,generated
0,Car-free cities have become a subject of incre...,1
1,"Car Free Cities Car-free cities, a concept ga...",1
2,A Sustainable Urban Future Car-free cities ...,1
3,Pioneering Sustainable Urban Living In an e...,1
4,The Path to Sustainable Urban Living In an ...,1
...,...,...
29140,There has been a fuss about the Elector Colleg...,0
29141,Limiting car usage has many advantages. Such a...,0
29142,There's a new trend that has been developing f...,0
29143,As we all know cars are a big part of our soci...,0


In [6]:
from python_code.revised_pipeline import parallel
import revised_pipeline.spaCyParser

# Apply the spaCy parser to every essay in the "text" column and store each
# result in a new "parsed" column of the returned frame.
result_spacy = parallel.process_dataframe_parallel(
    kaggle_df, revised_pipeline.spaCyParser.parse_with_spacy,
    column="text", new_column="parsed",
    use_threads=True, cpu_intensive=False,
)
result_spacy.head()

Processing 29145 items with 4 workers


Processing text:   0%|          | 0/29145 [00:00<?, ?it/s]



Unnamed: 0,text,generated,parsed
0,Car-free cities have become a subject of incre...,1,"{'upos_props': {'NOUN': 0.28328611898017, 'PUN..."
1,"Car Free Cities Car-free cities, a concept ga...",1,"{'upos_props': {'NOUN': 0.29439252336448596, '..."
2,A Sustainable Urban Future Car-free cities ...,1,"{'upos_props': {'SPACE': 0.025039123630672927,..."
3,Pioneering Sustainable Urban Living In an e...,1,"{'upos_props': {'SPACE': 0.02358490566037736, ..."
4,The Path to Sustainable Urban Living In an ...,1,"{'upos_props': {'SPACE': 0.023734177215189875,..."


In [7]:
# Persist the parsed frame to CSV without the index column.
# NOTE(review): the "parsed" column holds dicts, which CSV stores as text; they
# would presumably need re-parsing if this file is read back — confirm before reuse.
result_spacy.to_csv("parsed_KAGGLE_revisions.csv", index=False)


In [12]:
# Promote individual entries of each parsed dict to their own columns on kaggle_df.
# A missing key falls back to an empty dict (upos_props) or to 0 (scalar features).
kaggle_df["upos_props"] = result_spacy["parsed"].apply(lambda parsed: parsed.get("upos_props", {}))
kaggle_df["mean_dep_depth"] = result_spacy["parsed"].apply(lambda parsed: parsed.get("mean_dep_depth", 0))
kaggle_df["clause_ratio"] = result_spacy["parsed"].apply(lambda parsed: parsed.get("clause_ratio", 0))
kaggle_df["voice_ratio"] = result_spacy["parsed"].apply(lambda parsed: parsed.get("voice_ratio", 0))


# Delta word frequency


In [4]:
# Absolute import: a notebook cell runs as top-level code with no parent package,
# so the relative form (`from ..revised_pipeline...`) raises ImportError. This also
# matches the other `revised_pipeline.*` imports used in this notebook.
from revised_pipeline.lexical_spike import load_trigger_set, add_lexical_spike_delta, compute_baseline_q_by_label

# 1. Load your data
# NOTE: this re-reads the raw CSV into kaggle_df, replacing whatever the name held before.
kaggle_df = pd.read_csv(path)

# 2. Load trigger words
trigger_set = load_trigger_set("combined_chatgpt_words.csv")

# 3. Compute baseline q using all rows where generated == 0 over the 'text' column
q = compute_baseline_q_by_label(
    kaggle_df,
    trigger_set=trigger_set,
    text_col="text",
    label_col="generated",
    label_value=0,
)
print(f"Baseline q (generated == 0): {q:.6f}")

# 4. Add p_t and delta to the Kaggle DataFrame
# (the output shows the new columns as `p_t` and `lexical_spike_delta`)
result_lexical_spike = add_lexical_spike_delta(
    kaggle_df,
    q=q,
    trigger_set=trigger_set,
    text_col="text",
)

# 5. Save or inspect
result_lexical_spike.to_csv("kaggle_lexical_spikes.csv", index=False)
result_lexical_spike.head()



Baseline q (generated == 0): 0.016594


Unnamed: 0,text,generated,p_t,lexical_spike_delta
0,Car-free cities have become a subject of incre...,1,0.076923,0.060329
1,"Car Free Cities Car-free cities, a concept ga...",1,0.067669,0.051075
2,A Sustainable Urban Future Car-free cities ...,1,0.080979,0.064386
3,Pioneering Sustainable Urban Living In an e...,1,0.078067,0.061473
4,The Path to Sustainable Urban Living In an ...,1,0.097928,0.081335


Note: for my Wikipedia-fetched dataset, the baseline q was 0.022159.

## check cuda:

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Pretrained GPT-2 tokenizer and LM head model; the model is put in eval mode.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Report GPU availability, moving the model to CUDA only when it is present.
if not torch.cuda.is_available():
    print('cuda not available!')
else:
    print("cuda available!")
    model.to("cuda")


# Perplexity and Burstiness

In [None]:
# Slowest step of the notebook -- preferably run it on the server.

import pandas as pd
from revised_pipeline.gpt2_perplexity_burstiness import add_perplexity_and_burstiness_to_df

# Per the original author's note, this adds .perplexity and .burstiness columns in place;
# the returned frame is bound back to kaggle_df.
kaggle_df = add_perplexity_and_burstiness_to_df(kaggle_df, text_col="text", batch_size=8)

# Quick look at the first rows.
kaggle_df.head()


# Readability metrics

In [16]:
# Needed for readability: compute_readability() uses this module-level `nlp`
# pipeline to split text into sentences, so run this cell before calling it.
nlp = spacy.load("en_core_web_sm")




In [17]:
from textstat import textstat


def compute_readability(text: str):
    """Return readability features for one text.

    The result is the 4-tuple ``(fre, fog, chars_per_sent, sents_per_para)``:
    Flesch reading ease and Gunning fog from ``textstat``, the mean sentence
    length in characters, and the number of sentences found by the
    module-level spaCy pipeline ``nlp``.

    Non-string or blank input, and any exception raised while computing the
    metrics, yields ``(0.0, 0.0, 0.0, 0.0)``; errors are printed, not raised.
    """
    fallback = (0.0, 0.0, 0.0, 0.0)

    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return fallback

    try:
        reading_ease = textstat.flesch_reading_ease(text)
        fog_index = textstat.gunning_fog(text)

        # Sentence segmentation comes from spaCy; guard the division for zero sentences.
        sents = list(nlp(text).sents)
        n_sents = len(sents)
        total_chars = sum(len(s.text) for s in sents)
        mean_chars = total_chars / (n_sents or 1)

        # The text is flattened, so the whole text counts as a single paragraph:
        # sentences-per-paragraph is simply the sentence count.
        return reading_ease, fog_index, mean_chars, n_sents
    except Exception as e:
        print(f"Error computing readability: {str(e)}")
        return fallback


In [18]:
# Compute readability metrics in parallel: compute_readability is applied to the
# "text" column and its 4-tuple result is spread over the four new columns.
kaggle_df = parallel.process_dataframe_parallel(
    kaggle_df,
    compute_readability,
    column="text",
    new_column=["fre", "fog", "chars_per_sent", "sents_per_para"],
    use_threads=True,  # NOTE(review): threads are requested although the work is flagged CPU-intensive below — confirm this is intended
    cpu_intensive=True
)


Processing 29145 items with 4 workers


Processing text:   0%|          | 0/29145 [00:00<?, ?it/s]

In [19]:
# Back up kaggle_df, now including the readability columns, to CSV (index omitted)
kaggle_df.to_csv("kaggle_readability.csv", index=False)

## Vocabulary diversity / nTTR and word-density

In [2]:
def compute_vocab_diversity(text: str, window_size: int = 100):
    """Compute vocabulary-diversity features for one text.

    Args:
        text: Raw text. Non-string or blank input yields all zeros.
        window_size: Number of leading whitespace-separated tokens used for the
            unique-token count.

    Returns:
        A 3-tuple ``(nTTR, word_density, avg_line_len)``:
        nTTR is ``unique / sqrt(2 * total)`` over the token window,
        word_density is ``100 * unique / (lines * avg_line_len)``, and
        avg_line_len is the mean line length in characters (lines split on "\\n").
    """
    if not isinstance(text, str) or not text.strip():
        # Same arity as the normal return, so callers that unpack three values
        # (or map the result onto three columns) do not break on blank rows.
        return 0.0, 0.0, 0.0

    tokens = text.split()[:window_size]
    unique_count = len(set(tokens))
    total_count = len(tokens) or 1

    # Normalized TTR = unique / sqrt(2 * total)
    nTTR = unique_count / ((2 * total_count) ** 0.5)

    # Word-density: one line per "\n"-separated segment of the full text.
    line_lengths = [len(line) for line in text.split("\n")]
    lines = len(line_lengths)
    avg_line_len = sum(line_lengths) / lines
    # `or 1` keeps the denominator non-zero if the average line length is 0.
    wd = 100 * unique_count / (lines * (avg_line_len or 1))

    return nTTR, wd, avg_line_len


In [17]:
# Reload the feature frame from kaggle_final.csv.
# NOTE(review): the same file is written by a later cell in this notebook, so it
# must already exist from a previous run for this cell to work — confirm.
kaggle_df= pd.read_csv("kaggle_final.csv")

In [18]:
# NOTE(review): presumably a local copy of the revised_pipeline `parallel` module
# (it exposes the same process_dataframe_parallel entry point) — confirm its origin.
import parallel_copy
# Compute vocabulary diversity in parallel: compute_vocab_diversity is applied to
# the "text" column and its result is spread over the three new columns.
kaggle_df = parallel_copy.process_dataframe_parallel(
    kaggle_df,
    compute_vocab_diversity,
    column="text",
    new_column=["nTTR", "word_density", "avg_line_len"],
    use_threads=True  # This is lightweight
)


Processing 29145 items with 4 workers


Processing text:   0%|          | 0/29145 [00:00<?, ?it/s]

In [20]:
# Save kaggle_df as kaggle_final.csv (index omitted); this overwrites any existing file
kaggle_df.to_csv("kaggle_final.csv", index=False)

# Citation delta not applicable to kaggle dataset, but here is the code

In [None]:
def compute_citation_delta(wikitext: str):
    """Compute the citation delta of a raw wikitext string.

    Counts opening ``<ref ...>`` tags (including self-closing ``<ref ... />``)
    and divides by the number of whitespace-separated tokens.

    Args:
        wikitext: Raw wikitext. Non-string or blank input yields 0.0.

    Returns:
        ``(added - removed) / tokens`` as a float. No diff is stored in this
        prototype, so ``removed`` is always 0.
    """
    if not isinstance(wikitext, str) or not wikitext.strip():
        return 0.0

    # Count <ref> tags in raw wikitext. The \b stops the pattern from also
    # matching longer tag names such as <references />, which are not citations.
    added = len(re.findall(r"<ref\b[^>]*>", wikitext))
    removed = 0  # For prototype, assume no diff stored; set removed = 0
    tokens_changed = len(wikitext.split()) or 1
    return (added - removed) / tokens_changed


# Compute citation delta for every row of the "text" column.
# The heading above this cell marks the metric as not applicable to the Kaggle data;
# the call is kept for reference.
kaggle_df["citation_delta"] = kaggle_df["text"].apply(compute_citation_delta)

In [24]:
# Display the final feature frame
kaggle_df

Unnamed: 0,text,generated,upos_props,mean_dep_depth,clause_ratio,voice_ratio,fre,fog,chars_per_sent,sents_per_para,nTTR,word_density
0,Car-free cities have become a subject of incre...,1,"{'NOUN': 0.28328611898017, 'PUNCT': 0.12606232...",3.021246,0.470588,0.995751,18.115424,19.136025,119.794118,34,5.515433,1.906624
1,"Car Free Cities Car-free cities, a concept ga...",1,"{'NOUN': 0.29439252336448596, 'PROPN': 0.02803...",3.127726,0.481481,0.996885,16.453883,19.276323,138.592593,27,5.727565,2.155976
2,A Sustainable Urban Future Car-free cities ...,1,"{'SPACE': 0.025039123630672927, 'DET': 0.05164...",3.164319,0.482759,0.995305,11.943452,19.694560,131.448276,29,4.596194,1.698015
3,Pioneering Sustainable Urban Living In an e...,1,"{'SPACE': 0.02358490566037736, 'VERB': 0.08805...",3.256289,0.400000,0.993711,13.916275,20.201424,149.040000,25,5.727565,2.166355
4,The Path to Sustainable Urban Living In an ...,1,"{'SPACE': 0.023734177215189875, 'DET': 0.06803...",3.245253,0.521739,0.996835,11.410395,21.032592,160.304348,23,5.444722,2.082207
...,...,...,...,...,...,...,...,...,...,...,...,...
29140,There has been a fuss about the Elector Colleg...,0,"{'PRON': 0.09406952965235174, 'AUX': 0.0695296...",2.451943,0.612903,0.989775,67.575668,10.754990,76.774194,31,5.161880,3.044204
29141,Limiting car usage has many advantages. Such a...,0,"{'VERB': 0.1773049645390071, 'NOUN': 0.2033096...",3.375887,1.647059,1.000000,70.402797,11.658557,121.176471,17,4.949747,3.386551
29142,There's a new trend that has been developing f...,0,"{'PRON': 0.03747072599531616, 'VERB': 0.120608...",2.738876,0.833333,0.990632,55.824170,13.342249,119.638889,36,5.586144,1.823217
29143,As we all know cars are a big part of our soci...,0,"{'SCONJ': 0.02702702702702703, 'PRON': 0.09797...",2.570946,0.818182,0.998311,73.354085,9.381548,84.333333,33,4.808326,2.426838
