In [16]:
import os
import re
import json
import pandas as pd

# Directory that receives the preprocessed text files (relative to the working directory).
# exist_ok=True means no error is raised if the directory already exists, so this cell can be re-run.
output_dir = "preprocessed_texts"
os.makedirs(output_dir, exist_ok=True)

def clean_text(text):
    """Normalize a piece of text for the language-model corpus.

    The text is lowercased, every character other than ASCII letters, digits,
    whitespace and the punctuation ``. , ! ? '`` is replaced by a space, and
    runs of whitespace (including newlines) are collapsed to one space with
    leading/trailing whitespace removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-line string.
    """
    lowered = text.lower()
    # anything outside the allowed character set becomes a space (not deleted),
    # so words separated only by e.g. a dash do not get glued together
    allowed_only = re.sub(r"[^a-zA-Z0-9\s.,!?']", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", allowed_only)
    return single_spaced.strip()

def preprocess_and_save(df, output_dir, filename):
    """Clean every row's title and excerpt and write them to a single text file.

    Each row of ``df`` becomes exactly one line of the output file, so the file
    can later be read back line by line with one sample per line.

    Args:
        df: DataFrame with ``title`` and ``excerpt`` columns.
        output_dir: Directory the file is written to (must already exist).
        filename: Output file name without the ``.txt`` extension.
    """
    file_path = os.path.join(output_dir, f"{filename}.txt")

    # Stream rows straight to disk instead of growing one huge string in memory.
    with open(file_path, "w", encoding="utf-8") as f:
        for title, excerpt in zip(df["title"], df["excerpt"]):
            # The newline is added AFTER cleaning: clean_text collapses and strips
            # all whitespace, so a newline passed through it is lost and
            # consecutive rows end up fused into one line.
            f.write(clean_text(f"{title} - {excerpt}") + "\n")


In [17]:
#load nyt-dataset into dataframe

def filter_on_str_length(df, column_key, max_len=1):
    """Keep only the rows whose string in ``column_key`` is longer than ``max_len``.

    Note that, despite its name, ``max_len`` acts as an exclusive lower bound:
    a row is kept when ``len(value) > max_len``.

    Args:
        df: DataFrame to filter.
        column_key: Name of the string column whose lengths are checked.
        max_len: Rows with a string length of ``max_len`` or less are dropped.

    Returns:
        A new DataFrame containing only the rows that pass the length check.
    """
    lengths = df[column_key].str.len()
    is_long_enough = lengths > max_len
    return df[is_long_enough]

df = pd.read_parquet("nyt_data.parquet")

# remove empty / near-empty text excerpts (anything of 10 characters or fewer)
df = filter_on_str_length(df, "excerpt", 10)

# row masks for each period, keyed by the name of the file the period is written to
year = df["year"]
period_masks = {
    "1920-1959": year < 1960,
    "1960-1989": (year >= 1960) & (year < 1990),
    "1990-2020": year >= 1990,
}

# Apply preprocessing and save.
# The slice is taken inside the loop so only one period's copy of the rows is
# alive at a time, instead of holding all three subsets until the end.
for period_name, in_period in period_masks.items():
    preprocess_and_save(df[in_period], output_dir, period_name)

# free ram (the per-period slices are already released after each call)
del year, period_masks, period_name, in_period


In [1]:
"""
Example of using the preprocessed data to train BERT with masked language modeling.
Current issue: this is very memory intensive, and I don't fully understand why yet.
"""
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the combined text file.
# The with-block closes the file handle; trailing newlines are stripped and
# blank lines skipped so that every list entry is one non-empty sample.
path = "preprocessed_texts/" + "1920-1959.txt"
with open(path, encoding="utf-8") as f:
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]

# Dataset.from_dict takes a mapping of column name -> list of values, so passing
# a list of strings here gives a dataset with one "text" sample per line.
dataset = Dataset.from_dict({"text": lines})

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of samples, truncating each one to at most 512 tokens.

    Args:
        examples: Batch from ``Dataset.map(..., batched=True)``; ``examples["text"]``
            is the list of raw strings in the batch.

    Returns:
        The tokenizer output for the batch (one unpadded sequence per sample).
    """
    # No padding here on purpose: padding="max_length" would store every sample
    # as a full 512-token sequence regardless of its real length, which inflates
    # memory use. DataCollatorForLanguageModeling pads each batch to the length
    # of its longest sample when batches are built.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize in batches and drop the raw "text" column: once the token ids exist
# the strings are no longer needed, and removing them keeps the stored dataset
# smaller and leaves only tokenizer outputs for the collator to batch.
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

# Data collator for masked language modeling: masks 15% of the tokens in each
# batch at collation time.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 