In [1]:
import pandas as pd
import re
import spacy
import os
import json
from tqdm import tqdm
import contractions

In [2]:
# Load small English model
nlp = spacy.load("en_core_web_sm")

def preprocessing(text):
    # 1. Remove URLs and promotional lines
    text = re.sub(r"http\S+|www\S+|watch.*?»", "", text, flags=re.IGNORECASE)

    # 2. Remove boilerplate/copyright notices
    text = re.sub(r"Copyright.*?reserved\.", "", text, flags=re.IGNORECASE)
    text = re.sub(r"This material.*?redistributed", "", text, flags=re.IGNORECASE)

    # 3. Normalize quotes/apostrophes
    text = text.replace("’", "'").replace("‘", "'")
    text = text.replace("“", '"').replace("”", '"')

    # 4. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 5. Sentence segmentation using spaCy
    doc = nlp(text)
    sentences = [sent.text.strip().capitalize() for sent in doc.sents if sent.text.strip()]

    # Fix lowercase 'i' when it’s a pronoun
    text = re.sub(r"\bi\b", "I", text)

    # Remove trailing junk
    text = re.sub(r"E-mail to a friend\s*\.*", "", text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r"\.\.+", ".", text)
    text = re.sub(r"\b\w\b", "", text)  # Remove single-letter tokens (can hurt if text has acronyms)
    text = contractions.fix(text)
    
    # 6. Join cleaned sentences
    clean_text = " ".join(sentences)

    return clean_text


In [3]:
# --- Process a single file ---
def process_file(filepath, output_dir):
    try:
        filename = os.path.basename(filepath)
        print(f"[INFO] Processing: {filename}")

        df = pd.read_json(filepath, lines = True)

        if 'article' not in df.columns:
            print(f"[WARN] Skipping {filename}, no 'text' column.")
            return

        tqdm.pandas(desc=f"Cleaning {filename}")
        df['cleaned_text'] = df['article'].progress_apply(preprocessing)

        if 'highlights' not in df.columns:
            print(f"[WARN] Skipping {filename}, no 'text' column.")
            return

        tqdm.pandas(desc=f"Cleaning {filename}")
        df['cleaned_abstract'] = df['highlights'].progress_apply(preprocessing)
        
        # Save cleaned output
        output_path = os.path.join(output_dir, filename.replace('.jsonl', '_cleaned.jsonl'))
        df.to_json(output_path, orient='records', lines=True)
        print(f"[DONE] Saved: {output_path}")

    except Exception as e:
        print(f"[ERROR] Failed to process {filepath}: {e}")

In [4]:
process_file('data\General_train_text_chunk_6.jsonl', 'output')

[INFO] Processing: General_train_text_chunk_6.jsonl


Cleaning General_train_text_chunk_6.jsonl: 100%|██████████| 20000/20000 [1:03:12<00:00,  5.27it/s]
Cleaning General_train_text_chunk_6.jsonl: 100%|██████████| 20000/20000 [05:09<00:00, 64.67it/s]


[DONE] Saved: output\General_train_text_chunk_6_cleaned.jsonl


In [5]:
df = pd.read_json('output\General_train_text_chunk_6_cleaned.jsonl', lines = True)
df.columns

Index(['article', 'highlights', 'id', 'cleaned_text', 'cleaned_abstract'], dtype='object')

In [6]:
df.drop(columns = ['article', 'highlights', 'id'], inplace = True)
df.to_json('output\General_train_text_chunk_6_cleaned.jsonl', orient = 'records', lines = True)

In [7]:
df2 = pd.read_json('output\General_train_text_chunk_6_cleaned.jsonl', lines = True)

In [8]:
df2['cleaned_abstract'] = df2['cleaned_abstract'].apply(lambda x: 'sostok ' + x + ' eostok')
df2
#

Unnamed: 0,cleaned_text,cleaned_abstract
0,"By . Emily allen . Published: . 07:32 est, 9 a...",sostok Jason young and tyrell o'donnell stole ...
1,"By . James salmon . Published: . 19:03 est, 7 ...",sostok New york state department of financial ...
2,By . Daily mail reporter . Published: . 05:40 ...,"sostok Simon richardson, 44, suffered life-thr..."
3,By . Daily mail reporter . Published: . 22:44 ...,"sostok Abhay singh, 11, and sister amanat, 9, ..."
4,By . Daily mail reporter . Published: . 10:34 ...,sostok Lawyers for megaupload boss kim dotcom ...
...,...,...
19995,"By . Lauren paxman . Updated: . 08:30 est, 7 m...",sostok Link was discovered after man suffering...
19996,"By . Rob cooper . Published: . 10:34 est, 23 m...","sostok Mark bridger, 47, said he was suffering..."
19997,"By . Jaymi mccann . Published: . 05:34 est, 23...","sostok Dale pipe, 20, asked 'why so serious?' ..."
19998,"By . Martha de lacey . Published: . 10:31 est,...",sostok Shoes being auctioned online by pfc auc...


In [9]:
df2.to_json('output\General_train_text_chunk_6_cleaned.jsonl', orient='records', lines=True)

In [None]:
# # Load spaCy English model
# nlp = spacy.load("en_core_web_sm")

# def clean_text_regex(text):
#     text = re.sub(r'xmath\d+', '', text)  # Remove math symbols like xmath123
#     text = re.sub(r'\\[a-zA-Z]+', '', text)  # Remove LaTeX commands like \alpha, \beta, \sum, etc.
#     text = re.sub(r'xcite', '', text)  # Remove citation placeholders
#     text = re.sub(r'\[.*?\]', '', text)  # Remove brackets and references
    
#     # Remove math equations - LaTeX style (between $ $ or $$ $$)
#     text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)  # Display math
#     text = re.sub(r'\$.*?\$', '', text)  # Inline math
    
#     # Remove math equations - parentheses style
#     text = re.sub(r'\\\(.*?\\\)', '', text, flags=re.DOTALL)  # Inline math
#     text = re.sub(r'\\\[.*?\\\]', '', text, flags=re.DOTALL)  # Display math
    
#     # Remove equation environments
#     text = re.sub(r'\\begin\{equation\*?\}.*?\\end\{equation\*?\}', '', text, flags=re.DOTALL)
#     text = re.sub(r'\\begin\{align\*?\}.*?\\end\{align\*?\}', '', text, flags=re.DOTALL)
#     text = re.sub(r'\\begin\{eqnarray\*?\}.*?\\end\{eqnarray\*?\}', '', text, flags=re.DOTALL)
#     text = re.sub(r'\\begin\{gather\*?\}.*?\\end\{gather\*?\}', '', text, flags=re.DOTALL)
#     text = re.sub(r'\\begin\{multline\*?\}.*?\\end\{multline\*?\}', '', text, flags=re.DOTALL)
#     text = re.sub(r'\\begin\{split\}.*?\\end\{split\}', '', text, flags=re.DOTALL)
    
#     # Remove mathematical operators and symbols
#     text = re.sub(r'[+\-*/=<>≤≥≠≈∞∑∏∫∂∇∆√∈∉⊂⊃∪∩∧∨¬∀∃]', '', text)
    
#     # Remove fractions pattern like a/b where a and b are numbers or variables
#     text = re.sub(r'\b\w+/\w+\b', '', text)
    
#     # Remove superscripts and subscripts (basic patterns)
#     text = re.sub(r'\^[{\w}]+', '', text)  # Remove ^{something} or ^word
#     text = re.sub(r'_[{\w}]+', '', text)   # Remove _{something} or _word
    
#     # Remove curly braces and their contents (often used in math)
#     text = re.sub(r'\{[^}]*\}', '', text)
    
#     # Remove common math function names
#     math_functions = [
#         'sin', 'cos', 'tan', 'log', 'ln', 'exp', 'sqrt', 'abs', 'max', 'min',
#         'lim', 'sup', 'inf', 'det', 'tr', 'rank', 'dim', 'ker', 'im',
#         'gcd', 'lcm', 'mod', 'deg', 'arg'
#     ]
#     for func in math_functions:
#         text = re.sub(rf'\b{func}\b', '', text, flags=re.IGNORECASE)
    
#     # Remove Greek letters (common in math)
#     greek_letters = [
#         'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
#         'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
#         'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega'
#     ]
#     for letter in greek_letters:
#         text = re.sub(rf'\b{letter}\b', '', text, flags=re.IGNORECASE)
    
#     # Remove mathematical expressions with variables (like x + y = z)
#     text = re.sub(r'\b[a-zA-Z]\s*[+\-*/=]\s*[a-zA-Z0-9]+', '', text)
    
#     # Remove sequences of mathematical symbols
#     text = re.sub(r'[∀∃∈∉⊂⊃∪∩∧∨¬→↔≡⊕⊗∅ℕℤℚℝℂ]{2,}', '', text)
    
#     # Remove parentheses with only mathematical content
#     text = re.sub(r'\([^a-zA-Z]*\)', '', text)
    
#     # Remove standalone numbers (original rule)
#     text = re.sub(r'\b\d+\b', '', text)
    
#     # Remove floating point numbers
#     text = re.sub(r'\b\d+\.\d+\b', '', text)
    
#     # Remove scientific notation
#     text = re.sub(r'\b\d+\.?\d*[eE][+\-]?\d+\b', '', text)
    
#     # Clean up multiple spaces and normalize whitespace
#     text = re.sub(r'\s+', ' ', text)
    
#     # Remove extra punctuation that might be left over
#     text = re.sub(r'[,;:]{2,}', '', text)
#     text = re.sub(r'\.{2,}', '.', text)
    
#     return text.strip()


# def basic_clean(text):
#     text = text.lower()
#     text = re.sub(r'\n+', ' ', text)                   # remove line breaks
#     text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
#     text = re.sub(r'\S+@\S+', '', text)                # remove emails
#     text = re.sub(r'\d{10,}', '', text)                # remove long numbers (like phone numbers)
#     text = re.sub(r'[^a-zA-Z\s]', '', text)            # remove punctuation and digits
#     text = re.sub(r'\s+', ' ', text).strip()           # collapse multiple spaces
#     return text

# def clean_with_textacy(text):
#     doc = nlp(text)
#     tokens = [
#         token.lemma_
#         for token in doc
#         if not token.is_stop and token.is_alpha and token.pos_ in {"NOUN", "VERB", "ADJ", "ADV"}
#     ]
#     return " ".join(tokens)

# def preprocess_text(text):
#     step0 = clean_text_regex(text)
#     step1 = basic_clean(step0)
#     step2 = clean_with_textacy(step1)
#     return step2


In [None]:
# # Example usage
# raw_text = "Additive models provide flexibility, better interpretability, and avoid the curse of dimensionality!"
# cleaned_text = preprocess_text(raw_text)
# print("before cleaning: ", raw_text)
# print("after cleaning: ",cleaned_text)