In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import numpy as np
from sklearn.metrics import roc_auc_score
from typing import List, Tuple
from tqdm import tqdm
from blind_baselines import MembershipClassifier, YearKeywordClassifier, BagOfWordsClassifier
from torch.utils.data import Dataset, DataLoader, random_split
from dummy import JSONLDummyDataset, collate_fn
import torch

def create_dataloaders(member_path, non_member_path, batch_size=32, num_workers=4, test_split=0.2):
    member_dataset = JSONLDummyDataset(member_path)
    non_member_dataset = JSONLDummyDataset(non_member_path)
    
    # Calculate split sizes
    member_test_size = int(len(member_dataset) * test_split)
    member_train_size = len(member_dataset) - member_test_size
    non_member_test_size = int(len(non_member_dataset) * test_split)
    non_member_train_size = len(non_member_dataset) - non_member_test_size
    
    # Split datasets
    train_member_dataset, test_member_dataset = random_split(
        member_dataset, [member_train_size, member_test_size],
        generator=torch.Generator().manual_seed(42)
    )
    train_non_member_dataset, test_non_member_dataset = random_split(
        non_member_dataset, [non_member_train_size, non_member_test_size],
        generator=torch.Generator().manual_seed(42)
    )
    
    # Create DataLoaders
    train_member_dataloader = DataLoader(
        train_member_dataset, batch_size=batch_size, num_workers=num_workers, 
        collate_fn=collate_fn, shuffle=True
    )
    train_non_member_dataloader = DataLoader(
        train_non_member_dataset, batch_size=batch_size, num_workers=num_workers, 
        collate_fn=collate_fn, shuffle=True
    )
    test_member_dataloader = DataLoader(
        test_member_dataset, batch_size=batch_size, num_workers=num_workers, 
        collate_fn=collate_fn
    )
    test_non_member_dataloader = DataLoader(
        test_non_member_dataset, batch_size=batch_size, num_workers=num_workers, 
        collate_fn=collate_fn
    )
    
    return (train_member_dataloader, train_non_member_dataloader,
            test_member_dataloader, test_non_member_dataloader)

train_member_dataloader, train_non_member_dataloader, \
test_member_dataloader, test_non_member_dataloader = create_dataloaders(
    "test_data/test10k-pile-train-00.jsonl",
    "test_data/test10k-pile-val.jsonl"
)

# Initialize classifiers
classifiers = [
    YearKeywordClassifier(["2023", "2024"]),
    BagOfWordsClassifier(max_features=5000)
]

In [2]:
for classifier in classifiers:
	print(f"{classifier.__class__.__name__} starts training.")
	classifier.train(train_member_dataloader, train_non_member_dataloader)
	print(f"{classifier.__class__.__name__} completed training.")


YearKeywordClassifier starts training.
YearKeywordClassifier completed training.
BagOfWordsClassifier starts training.
Training Bag of Words classifier...


Processing member data: 100%|██████████| 250/250 [00:05<00:00, 43.30it/s]
Processing non-member data: 100%|██████████| 250/250 [00:06<00:00, 35.75it/s]


Training completed.
BagOfWordsClassifier completed training.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
def evaluate_classifiers(classifiers: List[MembershipClassifier], 
                         member_dataloader: DataLoader, 
                         non_member_dataloader: DataLoader,
                         dataset_name: str) -> List[Tuple[str, float]]:
    results = []
    
    for classifier in classifiers:
        all_scores = []
        all_labels = []
        
        # Process member data
        for batch in tqdm(member_dataloader, desc=f"Evaluating {classifier.__class__.__name__} on {dataset_name} member data"):
            scores = classifier.predict(batch)
            all_scores.extend(scores)
            all_labels.extend([1] * len(scores))  # 1 for member
        
        # Process non-member data
        for batch in tqdm(non_member_dataloader, desc=f"Evaluating {classifier.__class__.__name__} on {dataset_name} non-member data"):
            scores = classifier.predict(batch)
            all_scores.extend(scores)
            all_labels.extend([0] * len(scores))  # 0 for non-member
        
        # Calculate AUROC
        auroc = roc_auc_score(all_labels, all_scores)
        results.append((classifier.__class__.__name__, auroc))
    
    return results


In [10]:
# Evaluate on test set
test_results = evaluate_classifiers(classifiers, test_member_dataloader, test_non_member_dataloader, "Test")

# Print results
print("\nTest Results:")
for classifier_name, auroc in test_results:
    print(f"{classifier_name} AUROC: {auroc:.4f}")


Evaluating YearKeywordClassifier on Test member data: 100%|██████████| 250/250 [00:06<00:00, 36.50it/s]
Evaluating YearKeywordClassifier on Test non-member data: 100%|██████████| 250/250 [00:06<00:00, 36.80it/s]
Evaluating BagOfWordsClassifier on Test member data: 100%|██████████| 250/250 [00:11<00:00, 22.28it/s]
Evaluating BagOfWordsClassifier on Test non-member data: 100%|██████████| 250/250 [00:11<00:00, 22.23it/s]


Test Results:
YearKeywordClassifier AUROC: 0.4996
BagOfWordsClassifier AUROC: 0.7512





In [4]:
results = []
    
for classifier in classifiers:
	all_scores = []
	all_labels = []
	all_texts = []
	
	# Process member data
	for batch in tqdm(member_dataloader, desc=f"Evaluating {classifier.__class__.__name__} on member data"):
		scores = classifier.predict(batch)
		all_scores.extend(scores)
		all_texts.extend(batch)
		all_labels.extend([1] * len(scores))  # 1 for member
		break 

	# Process non-member data
	for batch in tqdm(non_member_dataloader, desc=f"Evaluating {classifier.__class__.__name__} on non-member data"):
		scores = classifier.predict(batch)
		all_scores.extend(scores)
		all_labels.extend([0] * len(scores))  # 0 for non-member
		break
  
	

Evaluating YearKeywordClassifier on member data:   0%|          | 0/313 [00:02<?, ?it/s]
Evaluating YearKeywordClassifier on non-member data:   0%|          | 0/313 [00:02<?, ?it/s]


In [6]:
print(all_scores)
print(all_labels)
print(all_texts)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [1]:
from large_datasets import PileDataset
from sampler import Sampler

dataset = PileDataset("test_data")
sampler = Sampler(dataset)

In [3]:
sampler.random_sample_new(5, saved=True)

Using 32 workers for sampling.


['{"text": "South Arabia during World War I\\n\\nThe campaign in South Arabia during World War I was a minor struggle for control of the port city of Aden, an important way station for ships on their way from Asia to the Suez Canal. The British Empire declared war on the Ottoman Empire on 5 November 1914, and the Ottomans responded with their own declaration on 11 November. From the beginning, the Ottomans had planned an invasion of Britain\'s Aden Protectorate in cooperation with the local Arab tribes. The Ottomans had gathered in some strength on the Cheikh Sa\\u00efd, a peninsula which juts out into the Red Sea towards the island of Perim.\\n\\nAt the start of the war, the British had one force stationed in the Aden Protectorate, the Aden Brigade, which was part of the British Indian Army. In November 1914, an Ottoman force from Yemen attacked Aden, but was driven off by the Brigade.\\n\\nLand campaign\\n\\nCapture of Cheikh Sa\\u00efd\\nThe 29th Indian Brigade, under Brigadier-Gene

In [1]:
from refactor import LargeDataset, Sampler

dataset = LargeDataset("test_data")
sampler = Sampler(dataset)

Processing: 100%|██████████| 55.6M/55.6M [00:00<00:00, 162MB/s]
Processing: 100%|██████████| 56.0M/56.0M [00:00<00:00, 153MB/s]


In [3]:
sampler.random_sample(5)

Using 32 workers for sampling.


['{"text": "South Arabia during World War I\\n\\nThe campaign in South Arabia during World War I was a minor struggle for control of the port city of Aden, an important way station for ships on their way from Asia to the Suez Canal. The British Empire declared war on the Ottoman Empire on 5 November 1914, and the Ottomans responded with their own declaration on 11 November. From the beginning, the Ottomans had planned an invasion of Britain\'s Aden Protectorate in cooperation with the local Arab tribes. The Ottomans had gathered in some strength on the Cheikh Sa\\u00efd, a peninsula which juts out into the Red Sea towards the island of Perim.\\n\\nAt the start of the war, the British had one force stationed in the Aden Protectorate, the Aden Brigade, which was part of the British Indian Army. In November 1914, an Ottoman force from Yemen attacked Aden, but was driven off by the Brigade.\\n\\nLand campaign\\n\\nCapture of Cheikh Sa\\u00efd\\nThe 29th Indian Brigade, under Brigadier-Gene

In [4]:
sampler.random_sample(5, category="Pile-CC")

Using 32 workers for sampling.


['{"text": "Graduate Engineer Salaries at Schneider Electric\\n\\na Graduate Engineer at Schneider Electric earns a salary between \\u20b9 5,51,000 to \\u20b9 5,93,000 per annum, with an average of \\u20b9 5,72,000. Ranging from \\u20b9 5,61,500 at the 25th percentile to \\u20b9 5,82,500 at the 75th percentile, with top earners earning more than \\u20b9 5,88,800", "meta": {"pile_set_name": "Pile-CC"}}',
 '{"text": "Solar electricity isn\'t the only renewable energy whipping boy out there. Wind power has also taken more than its share of lumps, frequently saddled with a reputation for excessive noise and energy inefficiency. Plus, if some of the rumors are true, wind harvesters of the world have steadily been turning the planet\'s bird population into an airborne puree of blood and feathers.\\n\\nTo be fair, wind turbines do kill birds -- but so do vehicles, skyscrapers, pollution and the introduction of invasive species into their habitats. Humans have had bird blood on their hands for

In [3]:
train_dataset = load_dataset("monology/pile-uncopyrighted", cache_dir="data/pile", streaming=True, split="train")
val_dataset = load_dataset("monology/pile-uncopyrighted", cache_dir="data/pile", streaming=True, split="validation")

In [7]:
train_length = sum(1 for _ in train_dataset)

In [8]:
train_length

177009652

In [5]:
val_length = sum(1 for _ in val_dataset)	


In [6]:
val_length

179996