In [1]:
from datasets import load_dataset
import pandas as pd

# Load Pile dataset from Hugging Face
ds_pile = load_dataset("artem9k/ai-text-detection-pile", split="train")

# Convert to pandas DataFrame
df_pile = ds_pile.to_pandas()

# Show available columns
print("Pile columns:", df_pile.columns)
df_pile.head()


Pile columns: Index(['source', 'id', 'text'], dtype='object')


Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


In [2]:
# Only keep necessary columns
df_pile = df_pile[["text", "source"]]

# Rename 'source' → 'label'
df_pile.rename(columns={"source": "label"}, inplace=True)

# Normalize label values
df_pile["label"] = df_pile["label"].map({"human": 0, "machine": 1})

# Drop any missing data
df_pile.dropna(inplace=True)

# Preview result
print(df_pile["label"].value_counts())
df_pile.head()


label
0.0    1028146
Name: count, dtype: int64


Unnamed: 0,text,label
0,12 Years a Slave: An Analysis of the Film Essa...,0.0
1,20+ Social Media Post Ideas to Radically Simpl...,0.0
2,2022 Russian Invasion of Ukraine in Global Med...,0.0
3,533 U.S. 27 (2001) Kyllo v. United States: The...,0.0
4,A Charles Schwab Corporation Case Essay\n\nCha...,0.0


In [5]:
import gzip
import shutil

# Define paths
input_path = "human-eval/data/HumanEval.jsonl.gz"
output_path = "human-eval/data/HumanEval.jsonl"

# Extract the .gz file
with gzip.open(input_path, "rb") as f_in:
    with open(output_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

print("✅ Extracted HumanEval.jsonl successfully.")


✅ Extracted HumanEval.jsonl successfully.


In [6]:
import json
import pandas as pd

# Load human-eval data
human_lines = []
with open("human-eval/data/HumanEval.jsonl", "r") as f:
    for line in f:
        obj = json.loads(line)
        text = obj["prompt"].strip()
        if text:
            human_lines.append(text)

df_human_eval = pd.DataFrame({
    "text": human_lines,
    "label": 0  # Human
})

df_human_eval.head()


Unnamed: 0,text,label
0,from typing import List\n\n\ndef has_close_ele...,0
1,from typing import List\n\n\ndef separate_pare...,0
2,def truncate_number(number: float) -> float:\n...,0
3,from typing import List\n\n\ndef below_zero(op...,0
4,from typing import List\n\n\ndef mean_absolute...,0


In [8]:
from datasets import load_dataset
import pandas as pd

# Load the dataset from HuggingFace
dataset = load_dataset("artem9k/ai-text-detection-pile", split="train")

# Convert to pandas DataFrame
df_pile = dataset.to_pandas()

# Only keep needed columns
df_pile = df_pile[["text", "source"]]

# Rename 'source' to 'label' for consistency
df_pile.rename(columns={"source": "label"}, inplace=True)

# Map labels to binary: human=0, machine=1
df_pile["label"] = df_pile["label"].map({"human": 0, "machine": 1})

# Save to CSV
df_pile.to_csv("data_raw/pile_dataset.csv", index=False)

print("✅ pile_dataset.csv saved to data_raw/")


✅ pile_dataset.csv saved to data_raw/


In [2]:
import pandas as pd

# Load all datasets
df_kaggle = pd.read_csv("data_raw/Training_Essay_Data.csv")
df_kaggle = df_kaggle[["text", "generated"]].rename(columns={"generated": "label"})
df_kaggle["label"] = df_kaggle["label"].astype(str).str.lower().map({"human": 0, "ai": 1})
df_kaggle.dropna(inplace=True)

df_pile = pd.read_csv("data_raw/pile_dataset.csv")[["text", "label"]]
df_pile.dropna(inplace=True)

df_humaneval = pd.read_csv("data_raw/humaneval_dataset.csv")  
df_ahmad = pd.read_csv("data_raw/ai_only_ahmadreza.csv")

# Combine all datasets
df_combined = pd.concat([df_kaggle, df_pile, df_humaneval, df_ahmad], ignore_index=True)
df_combined.dropna(inplace=True)

# Final shuffle (important)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
df_combined.to_csv("data_raw/combined_dataset.csv", index=False)

print("✅ Final combined dataset saved as data_raw/combined_dataset.csv")
print(df_combined["label"].value_counts())


✅ Final combined dataset saved as data_raw/combined_dataset.csv
label
1.0    1521796
0.0    1028310
Name: count, dtype: int64


In [None]:
# ✅ Sample 300k each for a balanced 600k dataset

import pandas as pd

# Load full combined dataset
df = pd.read_csv("data_raw/combined_dataset.csv")

# Sample 300k each
df_human = df[df["label"] == 0].sample(n=300_000, random_state=42)
df_ai = df[df["label"] == 1].sample(n=300_000, random_state=42)

# Combine and shuffle
df_balanced = pd.concat([df_human, df_ai]).sample(frac=1, random_state=42).reset_index(drop=True)

# Save it
df_balanced.to_csv("data_raw/balanced_600k_dataset.csv", index=False)
print("✅ Saved 600k balanced dataset.")


✅ Saved 600k balanced dataset.
