In [19]:
import requests
import pandas as pd
import spacy
from textstat import textstat
import re
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch


In [15]:
nlp = spacy.load("en_core_web_sm")


In [3]:
print(torch.__version__)
print(torch.version.cuda)  # Should not be None
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

2.7.1+cu118
11.8
True
1
NVIDIA GeForce RTX 2070 with Max-Q Design


In [4]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()
if torch.cuda.is_available():
    print("cuda available!")
    model.to("cuda")
else:
    print('cuda not available!')


cuda available!


In [32]:
sample_pages = [
    "Python (programming language)",
    "Machine learning",
    "Artificial intelligence"
]


In [33]:
START_TIMESTAMP = "2023-01-01T00:00:00Z"
END_TIMESTAMP   = "2023-03-31T23:59:59Z"


In [34]:
columns = [
    "page_title", "rev_id", "timestamp", "user", "is_bot", "content"
]
tiny_revs = pd.DataFrame(columns=columns)


In [35]:
def is_bot_username(username: str) -> bool:
    return username.lower().endswith("bot")


In [36]:
def fetch_revisions_for_page(title, start_ts, end_ts):
    """
    Calls the MediaWiki `action=query&prop=revisions` endpoint to
    fetch all revisions for one page between start_ts and end_ts.
    Returns a list of dicts with keys: rev_id, timestamp, user, content.
    """
    S = requests.Session()
    URL = "https://en.wikipedia.org/w/api.php"
    revisions = []
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "ids|timestamp|user|comment|content",
        "rvstart": end_ts,
        "rvend": start_ts,
        "rvlimit": "max",
        "titles": title,
        "redirects": 1,
        "rvslots": "main",
    }
    while True:
        response = S.get(URL, params=params).json()
        pages = response["query"]["pages"]
        page_id = next(iter(pages))
        if "revisions" not in pages[page_id]:
            break
        for rev in pages[page_id]["revisions"]:
            content = rev.get("slots", {}).get("main", {}).get("*", "")
            revisions.append({
                "rev_id": rev["revid"],
                "timestamp": rev["timestamp"],
                "user": rev["user"],
                "is_bot": is_bot_username(rev["user"]),
                "content": content
            })
        if "continue" in response:
            params.update(response["continue"])
        else:
            break
    return revisions


In [37]:

rows = []
for pg in sample_pages:
    revs = fetch_revisions_for_page(pg, START_TIMESTAMP, END_TIMESTAMP)
    for r in revs:
        rows.append({
            "page_title": pg,
            "rev_id": r["rev_id"],
            "timestamp": r["timestamp"],
            "user": r["user"],
            "is_bot": r["is_bot"],
            "content": r["content"]
        })

tiny_revs = pd.DataFrame(rows)


In [38]:
tiny_revs.head()
# tiny_revs.shape


Unnamed: 0,page_title,rev_id,timestamp,user,is_bot,content
0,Python (programming language),1145862507,2023-03-21T11:50:34Z,Comp.arch,False,{{Lead too short|date=March 2023}}\n{{pp|small...
1,Python (programming language),1144434497,2023-03-13T19:11:54Z,Thumperward,False,{{Lead too short|date=March 2023}}\n{{pp|small...
2,Python (programming language),1144432784,2023-03-13T19:00:30Z,Thumperward,False,{{pp|small=yes}}\n{{Short description|General-...
3,Python (programming language),1140647220,2023-02-21T02:12:18Z,Tdmurlock,False,{{pp|small=yes}}\n{{Short description|General-...
4,Python (programming language),1138397948,2023-02-09T13:16:31Z,AirshipJungleman29,False,{{pp|small=yes}}\n{{Short description|General-...


In [39]:
tiny_revs.to_pickle("tiny_revisions.pkl")


---
# 4 Text Cleaning & Sentence/Token Parsing



In [31]:
def clean_text(wikitext: str) -> str:
    # Remove wiki markup—keep plain text for prototype
    text = re.sub(r"<ref>.*?</ref>", "", wikitext, flags=re.DOTALL)
    text = re.sub(r"\{\{.*?\}\}", "", text)             # simple template removal
    text = re.sub(r"\[\[([^|\]]*\|)?([^\]]+)\]\]", r"\2", text)  # keep link text
    text = re.sub(r"''+", "", text)                     # remove italic/bold
    # Remove non-alphabetic chars except basic punctuation
    text = re.sub(r"[^A-Za-z0-9 \.\,\!\?\-\'\"]+", " ", text)
    # Lowercase and collapse whitespace
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [32]:
tiny_revs["plain_text"] = tiny_revs["content"].apply(clean_text)


In [33]:
def parse_with_spacy(text: str):
    doc = nlp(text)
    total_tokens = len(doc)

    # POS proportions
    pos_counts = doc.count_by(spacy.attrs.POS)
    upos_props = {nlp.vocab[pos].text: cnt / total_tokens for pos, cnt in pos_counts.items()}

    # Dependency depth approximation
    def token_depth(token):
        depth = 0
        while token != token.head:
            token = token.head
            depth += 1
        return depth
    depths = [token_depth(token) for token in doc]
    mean_depth = sum(depths) / total_tokens if total_tokens else 0

    # Clause ratio
    clause_tags = sum(1 for token in doc if token.dep_ in ("advcl", "ccomp", "xcomp"))
    clause_ratio = clause_tags / (len(list(doc.sents)) or 1)

    # Passive voice ratio
    passive_count = sum(1 for token in doc if token.dep_ == "auxpass")
    voice_ratio = (total_tokens - passive_count) / (total_tokens or 1)

    return {
        "upos_props": upos_props,
        "mean_dep_depth": mean_depth,
        "clause_ratio": clause_ratio,
        "voice_ratio": voice_ratio,
        "sentences": [sent.text for sent in doc.sents],  # Convert Span to text
        "tokens": [token.text for token in doc]
    }

In [51]:
sample_parsed = tiny_revs["plain_text"].iloc[:10].apply(parse_with_spacy)
sample_parsed[0]


{'upos_props': {'NOUN': 0.2938709677419355,
  'PROPN': 0.08279569892473118,
  'X': 0.0023655913978494624,
  'PUNCT': 0.16043010752688172,
  'VERB': 0.08612903225806452,
  'ADJ': 0.0746236559139785,
  'SCONJ': 0.00913978494623656,
  'NUM': 0.023010752688172043,
  'CCONJ': 0.028924731182795697,
  'ADP': 0.08354838709677419,
  'DET': 0.056559139784946234,
  'AUX': 0.03666666666666667,
  'PART': 0.014838709677419355,
  'PRON': 0.02075268817204301,
  'ADV': 0.025698924731182796,
  'INTJ': 0.0006451612903225806},
 'mean_dep_depth': 4.51,
 'clause_ratio': 0.7009646302250804,
 'voice_ratio': 0.9850537634408603,
 'sentences': ['infobox programming language logo python-logo-notext.svg logo size 121px paradigm multi-paradigm object-oriented, procedural imperative , functional, structured, reflective released ref name "alt-sources-history" ref designer guido van rossum developer python software foundation latest release version latest release date latest preview version latest preview date typing 

In [34]:
# put it back into tiny_revs, add a progress bar 
from tqdm.auto import tqdm
tqdm.pandas(desc="Parsing with spaCy")
# Apply the parsing function to the 'plain_text' column with progress tracking
tiny_revs["parsed"] = tiny_revs["plain_text"].progress_apply(parse_with_spacy)

Parsing with spaCy:   0%|          | 0/178 [00:00<?, ?it/s]

In [54]:
sample_parsed.head()

0    {'upos_props': {'NOUN': 0.2938709677419355, 'P...
1    {'upos_props': {'NOUN': 0.29383937211052574, '...
2    {'upos_props': {'NOUN': 0.29383937211052574, '...
3    {'upos_props': {'NOUN': 0.29383987980253273, '...
4    {'upos_props': {'NOUN': 0.293902962644912, 'PR...
Name: plain_text, dtype: object

In [57]:
# check tiny_revs
tiny_revs.head(20)

Unnamed: 0,page_title,rev_id,timestamp,user,is_bot,content,plain_text,parsed
0,Python (programming language),1145862507,2023-03-21T11:50:34Z,Comp.arch,False,{{Lead too short|date=March 2023}}\n{{pp|small...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2938709677419355, 'P..."
1,Python (programming language),1144434497,2023-03-13T19:11:54Z,Thumperward,False,{{Lead too short|date=March 2023}}\n{{pp|small...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383937211052574, '..."
2,Python (programming language),1144432784,2023-03-13T19:00:30Z,Thumperward,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383937211052574, '..."
3,Python (programming language),1140647220,2023-02-21T02:12:18Z,Tdmurlock,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383987980253273, '..."
4,Python (programming language),1138397948,2023-02-09T13:16:31Z,AirshipJungleman29,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.293902962644912, 'PR..."
5,Python (programming language),1136880732,2023-02-01T17:08:07Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.293902962644912, 'PR..."
6,Python (programming language),1136493577,2023-01-30T16:06:17Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2937573869130762, 'P..."
7,Python (programming language),1136468201,2023-01-30T13:20:13Z,Jumbo T,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29378288414044884, '..."
8,Python (programming language),1136459907,2023-01-30T12:13:34Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2937573869130762, 'P..."
9,Python (programming language),1136326866,2023-01-29T20:43:04Z,Jumbo T,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29378288414044884, '..."


In [56]:
# write to pickle
tiny_revs.to_pickle("tiny_revisions_parsed.pkl")

In [5]:
# read from pickle
tiny_revs = pd.read_pickle("tiny_revisions_parsed.pkl")

---
# 5 Feature Extraction Functions


In [6]:
tiny_trigger = {"delves", "pivotal", "showcasing", "leverage", "optimize"}

def compute_delta(text: str, trigger_set: set, baseline_freq: float = 0.0001):
    tokens = text.split()
    if not tokens:
        return 0.0
    freq = sum(1 for t in tokens if t in trigger_set) / len(tokens)
    return freq - baseline_freq


In [7]:
# add delta column to tiny_revs
tiny_revs["delta"] = tiny_revs["plain_text"].apply(lambda txt: compute_delta(txt, tiny_trigger))

In [8]:

def compute_perplexity_and_burstiness(text: str, max_length: int = 512):
    """
    GPU-optimized perplexity calculation with input validation for GPT-2
    """
    if not text or len(text.strip()) == 0:
        return 0.0, 0.0

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if not torch.cuda.is_available():
        print("WARNING: CUDA not available!")
        return 0.0, 0.0

    if model.device.type != "cuda":
        model.to("cuda")

    try:
        encodings = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        )
        input_ids = encodings.input_ids

        # Clamp input_ids to valid range
        vocab_size = model.config.vocab_size
        if torch.any(input_ids >= vocab_size) or torch.any(input_ids < 0):
            input_ids = torch.clamp(input_ids, 0, vocab_size - 1)

        if input_ids.shape[1] < 5:
            return 0.0, 0.0

        input_ids = input_ids.to("cuda")

        chunk_size = 8
        total_loss = 0.0
        total_tokens = 0

        for i in range(0, input_ids.shape[1], chunk_size):
            end_idx = min(i + chunk_size, input_ids.shape[1])
            chunk = input_ids[:, i:end_idx]
            try:
                with torch.no_grad():
                    outputs = model(chunk, labels=chunk)
                chunk_loss = outputs.loss.item() * chunk.shape[1]
                total_loss += chunk_loss
                total_tokens += chunk.shape[1]
            except Exception as e:
                print(f"Skipping chunk {i}:{end_idx} due to error: {str(e)}")
                continue

        if total_tokens == 0:
            return 0.0, 0.0

        avg_loss = total_loss / total_tokens
        ppl = torch.exp(torch.tensor(avg_loss)).item()

        log_probs = []
        positions = [min(10, input_ids.shape[1]-1), min(20, input_ids.shape[1]-1)]
        for pos in positions:
            if pos < 5:
                continue
            try:
                segment = input_ids[:, :pos]
                with torch.no_grad():
                    out = model(segment, labels=segment)
                log_probs.append(-out.loss.item())
            except Exception:
                continue

        burstiness = float(pd.Series(log_probs).std()) if len(log_probs) > 1 else 0.0
        return ppl, burstiness

    except Exception as e:
        print(f"Error: {str(e)[:100]}...")
        return 0.0, 0.0

In [9]:
# Test with a small sample first
small_text = tiny_revs["plain_text"].iloc[2][:10000]  # Just the first 10000 chars
ppl, burst = compute_perplexity_and_burstiness(small_text)
print(f"Perplexity: {ppl}, Burstiness: {burst}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Perplexity: 699.940185546875, Burstiness: 0.05143129347566185


In [21]:
from tqdm.auto import tqdm

tqdm.pandas(desc="Computing perplexity and burstiness")
# Apply the perplexity and burstiness function to the 'plain_text' column with progress tracking
tiny_revs[["perplexity", "burstiness"]] = \
    pd.DataFrame(tiny_revs["plain_text"].progress_apply(compute_perplexity_and_burstiness).tolist(),
                 index=tiny_revs.index)


Computing perplexity and burstiness:   0%|          | 0/178 [00:00<?, ?it/s]

In [37]:
# Directly extract attributes from parsed dictionaries
tiny_revs["upos_props"] = tiny_revs["parsed"].apply(lambda x: x["upos_props"])
tiny_revs["mean_dep_depth"] = tiny_revs["parsed"].apply(lambda x: x["mean_dep_depth"])
tiny_revs["clause_ratio"] = tiny_revs["parsed"].apply(lambda x: x["clause_ratio"])
tiny_revs["voice_ratio"] = tiny_revs["parsed"].apply(lambda x: x["voice_ratio"])

In [22]:
tiny_revs.head(20)

Unnamed: 0,page_title,rev_id,timestamp,user,is_bot,content,plain_text,parsed,delta,perplexity,burstiness
0,Python (programming language),1145862507,2023-03-21T11:50:34Z,Comp.arch,False,{{Lead too short|date=March 2023}}\n{{pp|small...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2938709677419355, 'P...",-0.0001,699.940186,0.051431
1,Python (programming language),1144434497,2023-03-13T19:11:54Z,Thumperward,False,{{Lead too short|date=March 2023}}\n{{pp|small...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383937211052574, '...",-0.0001,699.940186,0.051431
2,Python (programming language),1144432784,2023-03-13T19:00:30Z,Thumperward,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383937211052574, '...",-0.0001,699.940186,0.051431
3,Python (programming language),1140647220,2023-02-21T02:12:18Z,Tdmurlock,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29383987980253273, '...",-0.0001,699.940186,0.051431
4,Python (programming language),1138397948,2023-02-09T13:16:31Z,AirshipJungleman29,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.293902962644912, 'PR...",-0.0001,681.641052,0.051431
5,Python (programming language),1136880732,2023-02-01T17:08:07Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.293902962644912, 'PR...",-0.0001,681.641052,0.051431
6,Python (programming language),1136493577,2023-01-30T16:06:17Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2937573869130762, 'P...",-0.0001,681.641052,0.051431
7,Python (programming language),1136468201,2023-01-30T13:20:13Z,Jumbo T,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29378288414044884, '...",-0.0001,681.641052,0.051431
8,Python (programming language),1136459907,2023-01-30T12:13:34Z,Comp.arch,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.2937573869130762, 'P...",-0.0001,681.641052,0.051431
9,Python (programming language),1136326866,2023-01-29T20:43:04Z,Jumbo T,False,{{pp|small=yes}}\n{{Short description|General-...,infobox programming language logo python-logo-...,"{'upos_props': {'NOUN': 0.29378288414044884, '...",-0.0001,681.641052,0.051431


In [23]:
def compute_readability(text: str):
    fre = textstat.flesch_reading_ease(text)
    fog = textstat.gunning_fog(text)
    # Characters per sentence
    sentences = list(nlp(text).sents)
    chars_per_sent = sum(len(sent.text) for sent in sentences) / (len(sentences) or 1)
    # Sentences per paragraph (since we have flattened text, treat the entire text as one paragraph)
    sents_per_para = len(sentences)  # toy assumption: 1 paragraph = all sentences
    return fre, fog, chars_per_sent, sents_per_para


In [24]:
# add progress bar, this one takes a while
from tqdm import tqdm
tqdm.pandas(desc="Computing readability metrics")
# Apply the readability function to the 'plain_text' column with progress tracking
tiny_revs[["fre", "fog", "chars_per_sent", "sents_per_para"]] = \
    pd.DataFrame(tiny_revs["plain_text"].progress_apply(compute_readability).tolist(),
                 index=tiny_revs.index)

Computing readability metrics: 100%|██████████| 178/178 [08:31<00:00,  2.88s/it]


In [26]:
def compute_vocab_diversity(text: str, window_size: int = 250):
    tokens = text.split()[:window_size]
    unique_count = len(set(tokens))
    total_count = len(tokens) or 1
    # Normalized TTR = unique / sqrt(2 * total)
    nTTR = unique_count / ( (2 * total_count) ** 0.5 )
    # Word-density: lines = count of '\n' + 1, avg_line_len:
    lines = text.count("\n") + 1
    avg_line_len = sum(len(line) for line in text.split("\n")) / lines
    wd = 100 * unique_count / (lines * (avg_line_len or 1))
    return nTTR, wd


In [27]:
# add progress bar
from tqdm.auto import tqdm

tqdm.pandas(desc="Computing vocabulary diversity metrics")
# Apply the vocabulary diversity function to the 'plain_text' column with progress tracking
tiny_revs[["nTTR", "word_density"]] = \
    pd.DataFrame(tiny_revs["plain_text"].progress_apply(compute_vocab_diversity).tolist(),
                 index=tiny_revs.index)

Computing vocabulary diversity metrics:   0%|          | 0/178 [00:00<?, ?it/s]

In [28]:
def compute_line_length(text: str):
    lines = text.split("\n")
    avg_len = sum(len(line) for line in lines) / (len(lines) or 1)
    return avg_len

tiny_revs["avg_line_len"] = tiny_revs["plain_text"].apply(compute_line_length)


In [29]:
def compute_citation_delta(wikitext: str):
    # Count <ref> tags in raw wikitext
    added = len(re.findall(r"<ref[^>]*>", wikitext))
    removed = 0  # For prototype, assume no diff stored; set removed = 0
    tokens_changed = len(wikitext.split()) or 1
    return (added - removed) / tokens_changed

# If you only have full revision text (no diff), set citation_delta = (#ref tags)/tokens
tiny_revs["citation_delta"] = tiny_revs["content"].apply(lambda txt: compute_citation_delta(txt))


In [38]:
features_df = tiny_revs[[
    "page_title", "rev_id", "timestamp", "user", "is_bot",
    "delta", "perplexity", "burstiness",
    "mean_dep_depth", "clause_ratio", "voice_ratio",
    "fre", "fog", "chars_per_sent", "sents_per_para",
    "nTTR", "word_density", "avg_line_len", "citation_delta"
]].copy()



In [39]:
features_df.head(20)

Unnamed: 0,page_title,rev_id,timestamp,user,is_bot,delta,perplexity,burstiness,mean_dep_depth,clause_ratio,voice_ratio,fre,fog,chars_per_sent,sents_per_para,nTTR,word_density,avg_line_len,citation_delta
0,Python (programming language),1145862507,2023-03-21T11:50:34Z,Comp.arch,False,-0.0001,699.940186,0.051431,4.51,0.700965,0.985054,40.60709,15.440363,158.180064,311,7.55791,0.341421,49499.0,0.025844
1,Python (programming language),1144434497,2023-03-13T19:11:54Z,Thumperward,False,-0.0001,699.940186,0.051431,4.511773,0.70418,0.985055,40.601759,15.440273,158.212219,311,7.55791,0.341352,49509.0,0.025841
2,Python (programming language),1144432784,2023-03-13T19:00:30Z,Thumperward,False,-0.0001,699.940186,0.051431,4.511773,0.70418,0.985055,40.601759,15.440273,158.212219,311,7.55791,0.341352,49509.0,0.02585
3,Python (programming language),1140647220,2023-02-21T02:12:18Z,Tdmurlock,False,-0.0001,699.940186,0.051431,4.509015,0.708333,0.984975,40.577083,15.44966,158.0,312,7.55791,0.340712,49602.0,0.025914
4,Python (programming language),1138397948,2023-02-09T13:16:31Z,AirshipJungleman29,False,-0.0001,681.641052,0.051431,4.509768,0.708333,0.984972,40.571269,15.449748,157.967949,312,7.513188,0.338764,49592.0,0.025914
5,Python (programming language),1136880732,2023-02-01T17:08:07Z,Comp.arch,False,-0.0001,681.641052,0.051431,4.509768,0.708333,0.984972,40.571269,15.449748,157.967949,312,7.513188,0.338764,49592.0,0.02591
6,Python (programming language),1136493577,2023-01-30T16:06:17Z,Comp.arch,False,-0.0001,681.641052,0.051431,4.512947,0.713826,0.984958,40.597908,15.43965,158.347267,311,7.513188,0.339045,49551.0,0.025933
7,Python (programming language),1136468201,2023-01-30T13:20:13Z,Jumbo T,False,-0.0001,681.641052,0.051431,4.51165,0.717042,0.984967,40.621163,15.4393,158.401929,311,7.513188,0.338928,49568.0,0.025924
8,Python (programming language),1136459907,2023-01-30T12:13:34Z,Comp.arch,False,-0.0001,681.641052,0.051431,4.512947,0.713826,0.984958,40.609055,15.43965,158.337621,311,7.513188,0.339065,49548.0,0.025933
9,Python (programming language),1136326866,2023-01-29T20:43:04Z,Jumbo T,False,-0.0001,681.641052,0.051431,4.51165,0.717042,0.984967,40.621163,15.4393,158.401929,311,7.513188,0.338928,49568.0,0.025924


In [40]:
# save to pickle
features_df.to_pickle("tiny_revisions_features.pkl")