In [None]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import re

from utils.storage import list_plaintextless_files, download_processed_mmd_file, upload_plaintext

In [None]:
# Ask the storage helper for the files to process. Presumably these are the
# files that do not yet have a plaintext version (inferred from the helper's
# name; its selection logic lives in utils.storage) -- TODO confirm.
files = list_plaintextless_files()
# Bare last expression: the notebook displays how many files were returned.
len(files)

In [None]:
# Download the MMD content of every listed file on a thread pool.
# executor.map yields results in the order of its input, so mds[i] is the
# download result for files[i]; tqdm only wraps the iterator to show progress.
with ThreadPoolExecutor() as executor:
	download_stream = executor.map(download_processed_mmd_file, files)
	mds = list(tqdm(download_stream, total=len(files)))

In [None]:
def mmd_to_plain_text(mmd_text: str) -> str:
	"""Reduce an MMD (Markdown + LaTeX) document to plain prose.

	The text is run through an ordered series of regex substitutions that
	drop non-prose content (the references section, code blocks, equations,
	tables, headings, captions, e-mail addresses, citations), unwrap inline
	markup (bold, italics, links), and finally normalise punctuation and
	whitespace. The order matters: block-level content is removed before
	inline markup so that markup inside removed blocks is never processed.

	Args:
		mmd_text: The raw MMD source text.

	Returns:
		The cleaned text, with every line stripped, at most one blank line
		between paragraphs, and no leading or trailing whitespace.
	"""
	# Remove the "References" heading line and everything after it.
	text = re.sub(r'^(#*\s*References\s*)$[\s\S]*', '', mmd_text, flags=re.MULTILINE | re.IGNORECASE)

	# Remove fenced code blocks (```...```)
	text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

	# Remove block equations ($$...$$)
	text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)

	# Remove tables
	text = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '', text, flags=re.DOTALL)

	# Remove block equations written as \[...\]. The lookbehind skips the
	# LaTeX line break "\\[2pt]", which is not an equation opener.
	text = re.sub(r'(?<!\\)\\\[.*?\\\]', '', text, flags=re.DOTALL)

	# Remove heading lines (any line starting with '#'), including their newline.
	text = re.sub(r'^#+.*$\n?', '', text, flags=re.MULTILINE)

	# Remove Figure, Table, and Footnote captions/lines
	text = re.sub(r'^(Figure \d+|Table \d+|Footnote \d+):.*$', '', text, flags=re.MULTILINE)

	# Remove email addresses
	text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)

	# Remove inline equations (\(...\))
	text = re.sub(r'\\\([\s\S]*?\\\)', '', text)

	# Convert bold: **text** -> text
	text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)

	# Convert bold: __text__ -> text (only when not embedded in a word)
	text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', text)

	# Convert italics: _text_ or *text* -> text.
	# The closing delimiter must equal the opening one (backreference), the
	# delimiters must hug the emphasised text, and they must not sit inside a
	# word. This leaves snake_case identifiers, "2 * 3 * 4", and "* bullet"
	# list markers untouched.
	text = re.sub(r'(?<!\w)([*_])(?=\S)(.+?)(?<=\S)\1(?!\w)', r'\2', text)

	# Convert links: [link text](URL) -> link text
	text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

	# Remove any remaining LaTeX-like backslashed commands
	text = re.sub(r'\\[a-zA-Z]+', '', text)

	# Remove numeric citations, including grouped ones: [3], [1, 2], [4-6], [7; 9]
	text = re.sub(r'\[\d+(?:\s*[,;\-\u2013]\s*\d+)*\]', '', text)

	# Fix punctuation: remove space before punctuation.
	# \s also matches newlines, so punctuation that starts a line is pulled up
	# onto the end of the preceding text.
	text = re.sub(r'\s+([.,?!;:])', r'\1', text)

	# Fix punctuation: consolidate multiple punctuation marks
	# (this also turns an ellipsis "..." into a single ".")
	text = re.sub(r'([.,?!;:])\1+', r'\1', text)

	# Remove double spaces
	text = re.sub(r' {2,}', ' ', text)

	# Strip every line first, so that whitespace-only lines become truly empty
	# and are counted by the blank-line consolidation below.
	text = '\n'.join(line.strip() for line in text.split('\n'))

	# Consolidate multiple blank lines into a single blank line
	text = re.sub(r'\n{3,}', '\n\n', text)

	# Remove leading/trailing whitespace from the whole text
	return text.strip()

In [None]:
def process_file(input_tuple):
    """Convert one MMD document to plain text and upload the result.

    Args:
        input_tuple: A 2-item ``(md, filename)`` pair: the MMD text to clean
            and the name passed to ``upload_plaintext`` as its first argument.

    Returns:
        None. The only effect is the ``upload_plaintext`` call.
    """
    # Takes a single tuple (rather than two parameters) so the function can
    # be handed directly to executor.map over zip(mds, files).
    mmd_source, target_name = input_tuple
    upload_plaintext(target_name, mmd_to_plain_text(mmd_source))

# Clean and upload every document on a thread pool. zip pairs mds and files
# positionally; list() drains the lazy map iterator so that every task's
# result is consumed and tqdm can report progress.
with ThreadPoolExecutor() as executor:
    upload_stream = executor.map(process_file, zip(mds, files))
    list(tqdm(upload_stream, total=len(files)))