In [8]:
# Flatten a Markdown file to plain text; each `$$` becomes a single `$` padded with spaces.
import re
def process_markdown_file(file_path):
    """Flatten one Markdown file into a single line of plain text.

    A leading YAML front-matter block (``---`` ... ``---``) is dropped first.
    Each remaining line is then transformed in this order:

    1. Markdown links are replaced by their link text (empty-text links vanish).
    2. Every ``$$`` becomes ``$`` and every ``$`` is padded with spaces.
    3. Hyphens that touch a non-whitespace character are padded with spaces.
    4. Runs of ``*`` and ``_`` are deleted.
    5. The line is skipped if it now contains "wikidata" (case-insensitive).
    6. Every whitespace run, including the trailing newline, becomes one space.

    Args:
        file_path: Path of the Markdown file. It is read as UTF-8.

    Returns:
        str: The surviving lines concatenated with no separator. The result
        contains no newline characters; an empty file yields ``""``.

    Note:
        If the first line is ``---`` but no closing ``---`` follows, the whole
        file is treated as header and ``""`` is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Remove the YAML header. The `lines and` guard keeps an empty file from
    # raising IndexError on lines[0].
    if lines and lines[0].strip() == "---":
        end_header_idx = 1
        while end_header_idx < len(lines) and lines[end_header_idx].strip() != "---":
            end_header_idx += 1
        lines = lines[end_header_idx + 1:]  # Skip past the closing ---

    processed_lines = []

    for line in lines:
        # Remove links but keep the link text; links with empty text disappear.
        line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
        line = re.sub(r'\[\]\([^\)]+\)', '', line)

        # Collapse $$ to a padded $, then pad every remaining lone $ as well.
        # The duplicate padding is harmless: whitespace is collapsed below.
        line = re.sub(r'\$\$', r' $ ', line)
        line = re.sub(r'(?<!\$)\$(?!\$)', r' $ ', line)

        # Pad hyphens with spaces. Lookarounds are used instead of capturing
        # the neighbours so that adjacent matches do not consume each other's
        # context: "a-b-c" becomes "a - b - c" rather than "a - b-c".
        line = re.sub(r'(?<=\S)-(?=\S)', ' - ', line)  # Non-whitespace on both sides
        line = re.sub(r'(?<=\S)-(?=\s)', ' - ', line)  # Non-whitespace, then whitespace
        line = re.sub(r'(?<=\s)-(?=\S)', ' - ', line)  # Whitespace, then non-whitespace

        # Remove Markdown emphasis markers.
        line = re.sub(r'[*_]+', '', line)

        # Skip lines containing "wikidata". The check runs on the transformed
        # line, i.e. after link targets have already been stripped.
        if "wikidata" in line.lower():
            continue

        # Collapse whitespace; this also turns the trailing newline into a space.
        processed_lines.append(re.sub(r'\s+', ' ', line))

    # Join processed lines into a single output
    return ''.join(processed_lines)


In [9]:
import os

def process_directory(directory_path, output_path='chicago_fulltext.txt'):
    """Write the flattened text of every ``.md`` file in a directory to one file.

    Each Markdown file directly inside ``directory_path`` (subdirectories are
    not descended into) is passed to ``process_markdown_file`` and its result
    is written to ``output_path`` followed by a newline, one file per line.

    Args:
        directory_path: Directory to scan for files whose name ends in ``.md``.
        output_path: File to create or overwrite with the combined text.
            Defaults to ``'chicago_fulltext.txt'`` in the working directory.
    """
    # os.listdir returns names in arbitrary order; sort them so repeated runs
    # produce the same output file. Listing before opening the output also
    # means a bad directory path does not truncate an existing output file.
    markdown_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith('.md')
    )

    with open(output_path, 'w', encoding='utf-8') as chicago:
        for filename in markdown_names:
            chicago.write(process_markdown_file(os.path.join(directory_path, filename)) + '\n')

# NOTE: machine-specific absolute path; point it at the local copy of the
# `chicago` directory before running this cell.
process_directory(directory_path="/Users/lucyhorowitz/Documents/GitHub/MathGloss/chicago")

In [10]:
# Copy chicago_sents.txt to chi_sents.txt, dropping lines that are empty or
# whitespace-only. NOTE(review): no cell shown here writes chicago_sents.txt;
# presumably it is produced by a separate step - confirm.
input_file = 'chicago_sents.txt'
output_file = 'chi_sents.txt'

with open(input_file, 'r') as src, open(output_file, 'w') as dst:
    for line in src:
        if not line.strip():
            continue  # blank line: leave it out of the output
        dst.write(line)