In [None]:
import ast
import pandas as pd
from datasets import load_dataset, Dataset
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Hugging Face Hub id of the corpus that the rest of the notebook scans.
dataset_hf_path = "EleutherAI/dclm-dedup-25B"

# Load the "train" split; the bare name on the last line displays its summary.
dclm_sample = load_dataset(path=dataset_hf_path, split="train")
dclm_sample

In [None]:

def get_occuring_keywords(text, keywords=None):
    """Return the keywords that occur in ``text`` as a stringified list.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word or phrase.

    Args:
        text: Document text to scan. ``None`` or an empty string yields no
            matches instead of raising.
        keywords: Optional iterable of keywords to look for. When omitted, the
            default list defined in the function body is used.

    Returns:
        ``str()`` of the list of matched keywords, in the order they are listed
        and with their original casing; ``"[]"`` when nothing matches. The
        result is a string rather than a list because the rest of this notebook
        compares it against ``"[]"`` and parses it back with
        ``ast.literal_eval``.
    """
    # Alternate keyword set kept for reference; pass it via `keywords` to use it.
    # keywords = ["HAL 9000", "Allied Mastercomputer", "Skynet", "Agent Smith", "Ultron", "Krell Machine", "SHODAN", "GLaDOS", "343 Guilty Spark", "Mother Brain", "Durandal", "President Eden", "The Gravemind", "T-800", "T-1000", "T-X", "M3GAN", "General Grievous", "HK-47", "Mechanical Hound", "Shogo Makishima", "Wintermute", "Xenomorphs", "The Chtorr", "Pouffes", "Cylons", "Abominable Intelligence", "Men of Iron", "Virtual Interactive Kinetic Intelligence", "Warden Unit"]
    if keywords is None:
        keywords = ["chain-of-thought", "AI control protocol", "redwood research", "Model Evaluation & Threat Research", "ARC Evals", "Dangerous Capability Evaluations"]

    # A missing or empty document cannot contain any keyword; bail out early
    # so a null "text" field does not crash the whole map() job.
    if not text:
        return str([])

    text_lower = text.lower()
    occuring_keywords = [keyword for keyword in keywords if keyword.lower() in text_lower]
    return str(occuring_keywords)

# Add an "occuring_keywords" column holding the stringified match list for
# each document's text, computed across 100 worker processes.
dclm_sample = dclm_sample.map(
    lambda example: {"occuring_keywords": get_occuring_keywords(example["text"])},
    num_proc=100,
)

In [None]:
# Keep only the documents whose stringified match list is non-empty.
flagged_docs = dclm_sample.filter(
    lambda example: example["occuring_keywords"] != "[]",
    num_proc=200,
)
flagged_docs

In [None]:
flagged_docs_df = flagged_docs.to_pandas()

# The keyword column was stored as str(list); parse each cell back into a list.
flagged_docs_df["occuring_keywords"] = flagged_docs_df["occuring_keywords"].apply(ast.literal_eval)
flagged_docs_df

In [None]:
# Convert the DataFrame (with parsed keyword lists) back into a Dataset.
flagged_docs_dataset = Dataset.from_pandas(df=flagged_docs_df)

# Upload is commented out; uncommenting publishes the dataset publicly.
# NOTE(review): the repo name says "ai-scifi" while the active keyword list in
# get_occuring_keywords is not the sci-fi one -- confirm the target before pushing.
# flagged_docs_dataset.push_to_hub("Kyle1668/dclm-dedup-25B-ai-scifi-docs", private=False)
flagged_docs_dataset

In [None]:
# Share of all keyword hits contributed by each keyword: explode() yields one
# row per (document, keyword) pair and normalize=True scales counts to sum to 1.
keyword_counts = (
    flagged_docs_df["occuring_keywords"]
    .explode()
    .value_counts(normalize=True)
    # Name both columns explicitly. pandas < 2.0 and >= 2.0 label the
    # value_counts() result differently, and the plot below needs these names.
    .rename_axis("occuring_keywords")
    .reset_index(name="proportion")
)
display(keyword_counts.T)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=keyword_counts.head(10), x="occuring_keywords", y="proportion", ax=ax)
ax.set_title("Top 10 Most Common AI Keywords in DCLM Sample")
ax.set_xlabel("Keyword")
ax.set_ylabel("Proportion of Occurrences")
# Tilt the keyword labels so longer phrases do not overlap.
ax.tick_params(axis="x", labelrotation=45)

# Render the y axis as whole percentages.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
fig.tight_layout()

In [None]:
# Print the full text of the last flagged document for a manual spot check.
print(flagged_docs_df["text"].iloc[-1])