In [1]:
import pandas as pd
import re
import spacy

# Read the first training chunk into a DataFrame. `lines=True` tells pandas
# the file is JSON Lines (one JSON object per line).
df = pd.read_json(
    'data/train_chunk_1.jsonl',
    lines=True,
)

In [2]:
# Load the spaCy "en_core_web_sm" English pipeline once at module level.
# `clean_with_textacy` below calls this shared `nlp` object for every text,
# so this cell must have been run before any of the cleaning helpers are used.
nlp = spacy.load("en_core_web_sm")

def clean_text_regex(text):
    """Strip placeholder tokens, LaTeX-style commands, bracketed spans and digits.

    The substitutions run in this order:

    1. ``xmath<digits>`` math placeholders are removed.
    2. LaTeX-style commands (a backslash followed by ASCII letters) are removed.
    3. The ``xcite`` citation placeholder is removed, but only when it is not
       part of a longer alphabetic word.
    4. Anything between ``[`` and the nearest ``]`` on the same line is removed.
    5. Every run of digits is removed (including digits inside words).
    6. Whitespace runs are collapsed to a single space and the ends trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Characters such as ``@`` or punctuation that
        surrounded a removed placeholder are left in place.
    """
    text = re.sub(r'xmath\d+', '', text)  # math placeholders such as xmath123
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # LaTeX commands such as \alpha
    # Guard the placeholder with letter-only lookarounds: a bare substring
    # match would also eat the middle of real words ("excited" -> "ed",
    # "excitement" -> "ement"), which silently corrupts the text.
    text = re.sub(r'(?<![a-zA-Z])xcite(?![a-zA-Z])', '', text)
    # Non-greedy and without DOTALL, so a bracket pair split across lines
    # is left untouched.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\d+', '', text)  # all digit runs, not only standalone numbers
    text = re.sub(r'\s+', ' ', text)  # normalize whitespace
    return text.strip()


def basic_clean(text):
    """Lowercase ``text`` and reduce it to space-separated ASCII letters.

    After lowercasing, the substitutions below are applied in order; the
    order matters because URLs and e-mail addresses have to be matched
    before their punctuation is stripped.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string with single spaces between words and no
        leading or trailing whitespace.
    """
    substitutions = (
        (r'\n+', ' '),                    # line breaks become spaces
        (r'https?://\S+|www\.\S+', ''),   # URLs
        (r'\S+@\S+', ''),                 # e-mail-like tokens
        (r'\d{10,}', ''),                 # long digit runs (e.g. phone numbers)
        (r'[^a-zA-Z\s]', ''),             # anything that is not a letter or whitespace
        (r'\s+', ' '),                    # collapse whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def clean_with_textacy(text):
    """Lemmatize ``text`` and keep only its content words.

    Despite the name, this function only uses the module-level spaCy
    pipeline ``nlp``. A token is kept when it is not a stop word, consists
    of alphabetic characters, and is tagged NOUN, VERB, ADJ or ADV; each
    kept token contributes its lemma.

    Args:
        text: The string to process.

    Returns:
        The kept lemmas joined by single spaces.
    """
    content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        if tok.pos_ in content_pos:
            lemmas.append(tok.lemma_)
    return " ".join(lemmas)

def preprocess_text(text):
    """Run the full cleaning pipeline on a single raw string.

    The stages run in a fixed order, each receiving the previous stage's
    output: ``clean_text_regex``, then ``basic_clean``, then
    ``clean_with_textacy``.

    Args:
        text: The raw input string.

    Returns:
        The string returned by the final stage.
    """
    stages = (clean_text_regex, basic_clean, clean_with_textacy)
    result = text
    for stage in stages:
        result = stage(result)
    return result


In [3]:
# Example usage: run one sample sentence through the full `preprocess_text`
# pipeline and print the result so the effect of the cleaning steps is visible.
raw_text = "Additive models provide flexibility, better interpretability, and avoid the curse of dimensionality!"
cleaned_text = preprocess_text(raw_text)
# print() writes the plain string (no surrounding quotes) to the cell output.
print(cleaned_text)

additive model provide flexibility well interpretability avoid curse dimensionality
