In [3]:
import json
import nltk
from nltk.tokenize import sent_tokenize

# Ensure the NLTK 'punkt' package is available locally before any
# tokenizing is attempted; this runs once when the cell/module is executed.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource
# instead of 'punkt' when sent_tokenize runs — confirm against the
# installed NLTK version.
nltk.download('punkt')

def process_line_by_line(dataset_file, hate_sents_file, retain_file):
    """Write a copy of a JSONL dataset with flagged hateful sentences removed.

    The hate-sentence file is loaded fully into memory (it is assumed to be
    much smaller than the dataset), while the dataset itself is streamed one
    line at a time so that large inputs never have to fit in memory.

    Args:
        dataset_file: Path to a JSONL file; each non-blank line is an object
            with a ``'year'`` key and a ``'text'`` key holding a string that
            is split into sentences with ``sent_tokenize``.
        hate_sents_file: Path to a JSONL file; each non-blank line is an
            object with ``'year'`` and ``'text'`` keys, where ``'text'`` is
            either a list of flagged sentences or a single sentence string.
            Several lines may refer to the same year; their sentences are
            merged.
        retain_file: Path of the JSONL file to write. Each output line is an
            object with the input's ``'year'`` and a ``'text'`` list holding
            the sentences that were not flagged.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks the ``'year'`` or ``'text'`` key.
    """
    # Map each year to the set of its flagged sentences. Entries are merged
    # per year so that a year spread over several lines keeps all of its
    # sentences instead of only those from the last line read.
    hate_sents_dict = {}
    with open(hate_sents_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            entry = json.loads(line)
            flagged = entry['text']
            if isinstance(flagged, str):
                # A bare string must count as ONE sentence; feeding it to
                # set.update() directly would add its individual characters.
                flagged = [flagged]
            hate_sents_dict.setdefault(entry['year'], set()).update(flagged)

    # The input is opened first so that a bad dataset path fails before the
    # output file is created or truncated.
    with open(dataset_file, 'r', encoding='utf-8') as dataset_in, \
            open(retain_file, 'w', encoding='utf-8') as retain_out:
        for line in dataset_in:
            if not line.strip():
                continue
            entry = json.loads(line)
            year = entry['year']

            # Tokenize the text into sentences
            sentences = sent_tokenize(entry['text'])

            # Drop sentences that exactly match a flagged sentence for this
            # year; years without flagged sentences pass through unchanged.
            hateful_sentences = hate_sents_dict.get(year)
            if hateful_sentences:
                retain_sentences = [
                    sent for sent in sentences if sent not in hateful_sentences
                ]
            else:
                retain_sentences = sentences

            print(f"Year: {year}: {len(retain_sentences)} not marked as hateful sentences remaining")

            # The retained sentences are written as a list, not re-joined
            # into a single block of text.
            retain_out.write(json.dumps({'year': year, 'text': retain_sentences}) + '\n')

    print(f'Retain set saved to {retain_file}')
    
# Score threshold that identifies which hate/retain set files to use.
threshold = 0.7

# File names embed the threshold with the decimal point replaced by an
# underscore (0.7 -> "0_7"); compute that tag once for both paths.
threshold_tag = str(threshold).replace(".", "_")

# Adjust to your file structure: every TODO below is a placeholder that
# must be replaced with a real path before this cell can run.
dataset_file = TODO
hate_sents_file = f'{TODO}/pg_books_hate_set_{threshold_tag}.jsonl'
retain_file = f'{TODO}/pg_books_retain_set_{threshold_tag}.jsonl'

# Build the retain set from the dataset and the flagged sentences.
process_line_by_line(dataset_file, hate_sents_file, retain_file)

[nltk_data] Downloading package punkt to /home/ljf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Year: 1979: 14288 not marked as hateful sentences remaining
Year: 1753: 43504 not marked as hateful sentences remaining
Year: 1789: 63255 not marked as hateful sentences remaining
Year: 1779: 13276 not marked as hateful sentences remaining
Year: 1844: 208423 not marked as hateful sentences remaining
Year: 1809: 22781 not marked as hateful sentences remaining
Year: 1842: 196702 not marked as hateful sentences remaining
Year: 1926: 331321 not marked as hateful sentences remaining
Year: 1783: 18417 not marked as hateful sentences remaining
Year: 1923: 1158715 not marked as hateful sentences remaining
Year: 1989: 504 not marked as hateful sentences remaining
Year: 1903: 1336036 not marked as hateful sentences remaining
Year: 1765: 13497 not marked as hateful sentences remaining
Year: 1937: 40097 not marked as hateful sentences remaining
Year: 1851: 401448 not marked as hateful sentences remaining
Year: 1816: 46580 not marked as hateful sentences remaining
Year: 1803: 29428 not marked as ha

Year: 1792: 28243 not marked as hateful sentences remaining
Year: 1870: 583768 not marked as hateful sentences remaining
Year: 1941: 65161 not marked as hateful sentences remaining
Year: 1938: 87240 not marked as hateful sentences remaining
Year: 1857: 431589 not marked as hateful sentences remaining
Year: 1830: 163779 not marked as hateful sentences remaining
Year: 1752: 1220 not marked as hateful sentences remaining
Year: 1820: 77774 not marked as hateful sentences remaining
Year: 1913: 2260081 not marked as hateful sentences remaining
Year: 1929: 96751 not marked as hateful sentences remaining
Year: 1849: 197133 not marked as hateful sentences remaining
Year: 1869: 492828 not marked as hateful sentences remaining
Year: 1781: 16950 not marked as hateful sentences remaining
Year: 1790: 22338 not marked as hateful sentences remaining
Year: 1900: 2293988 not marked as hateful sentences remaining
Year: 1853: 407464 not marked as hateful sentences remaining
Year: 1905: 1592791 not marked 