# Version 3.0

In [1]:
import re
import pandas as pd
import spacy
import stanza
import torch
from tqdm import tqdm
import json
from stanza.utils.conll import CoNLL

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# LaTeX chapter sources: one variable per file, grouped by chapter.

# Chapter 1
gr1 = "data/gr1.tex"
gr2 = "data/gr2.tex"
gr3 = "data/gr3.tex"
gr4 = "data/gr4.tex"
cas = "data/cas.tex"
leontief = "data/leontief.tex"
ppivot = "data/ppivot.tex"
network = "data/network.tex"

# Chapter 2
vs1 = "data/vs1.tex"
vs2 = "data/vs2.tex"
vs3 = "data/vs3.tex"
fields = "data/fields.tex"
crystal = "data/crystal.tex"
voting = "data/voting.tex"
dimen = "data/dimen.tex"

# Chapter 3
map1 = "data/map1.tex"
map2 = "data/map2.tex"
map3 = "data/map3.tex"
map4 = "data/map4.tex"
map5 = "data/map5.tex"
map6 = "data/map6.tex"
lstsqs = "data/lstsqs.tex"
homogeom = "data/homogeom.tex"
magicsqs = "data/magicsqs.tex"
markov = "data/markov.tex"
erlang = "data/erlang.tex"

# Chapter 4
det1 = "data/det1.tex"
det2 = "data/det2.tex"
det3 = "data/det3.tex"
cramer = "data/cramer.tex"
detspeed = "data/detspeed.tex"
chio = "data/chio.tex"
projplane = "data/projplane.tex"
compgraphics = "data/compgraphics.tex"

# Chapter 5
jc1 = "data/jc1.tex"
jc2 = "data/jc2.tex"
jc3 = "data/jc3.tex"
jc4 = "data/jc4.tex"
powers = "data/powers.tex"
pops = "data/pops.tex"
search = "data/search.tex"
recur = "data/recur.tex"
wilber = "data/wilber.tex"
innerproduct = "data/innerproduct.tex"
# extras?
eigengeom = "data/eigengeom.tex"
prinaxis = "data/prinaxis.tex"  # couldn't find it; deliberately left out of all_files

# Processing order of the corpus: chapter by chapter, in the order listed above.
all_files = [
    # Chapter 1
    gr1,
    gr2,
    gr3,
    gr4,
    cas,
    leontief,
    ppivot,
    network,
    # Chapter 2
    vs1,
    vs2,
    vs3,
    fields,
    crystal,
    voting,
    dimen,
    # Chapter 3
    map1,
    map2,
    map3,
    map4,
    map5,
    map6,
    lstsqs,
    homogeom,
    magicsqs,
    markov,
    erlang,
    # Chapter 4
    det1,
    det2,
    det3,
    cramer,
    detspeed,
    chio,
    projplane,
    compgraphics,
    # Chapter 5
    jc1,
    jc2,
    jc3,
    jc4,
    powers,
    pops,
    search,
    recur,
    wilber,
    innerproduct,
    eigengeom,
]


In [3]:
def concatenate_files(all_files, entire_content):
    """Concatenate several UTF-8 text files into a single output file.

    Args:
        all_files: Iterable of input file paths, read in the given order.
        entire_content: Path of the file to write.  The parameter name is
            kept for backward compatibility; it is the output path, not
            the text itself.

    Every input file's text is followed by two newline characters in the
    output, so consecutive files are separated by a blank line.
    """
    pieces = []
    for file_path in all_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            pieces.append(file.read())
    conc_txt = "".join(piece + "\n\n" for piece in pieces)
    # Write to the path that was passed in rather than to a notebook global.
    with open(entire_content, 'w', encoding='utf-8') as output:
        output.write(conc_txt)

# Merge all chapter files into one text file; `output_file` is read back by the next cell.
output_file = "conc_txt.txt"
concatenate_files(all_files, output_file)

In [4]:
#file_path = 'conc_txt.txt'

# Load the concatenated corpus as a list of lines (readlines() keeps the line endings).
with open(output_file, 'r', encoding='utf-8') as file:
    content = file.readlines()

# Shallow copy of `content`.  NOTE(review): not referenced by any cell visible here.
content_listed = list(content)

In [5]:
#print(content)

In [6]:
# strip LaTeX comments, rewrite a few number-set/size macros, and normalize whitespace
def clean_latex_to_text(latex_line):
    """Strip comments and rewrite a few set/size macros on one LaTeX line.

    Args:
        latex_line: One line of LaTeX source.

    Returns:
        The line with everything from the first ``%`` removed, the
        membership / number-set / matrix-size macros replaced by text, and
        runs of whitespace collapsed to single spaces (also stripped at
        both ends).
    """
    cleaned_line = re.sub(r'%.*$', '', latex_line)  # remove LaTeX comments
    cleaned_line = re.sub(r'\\in\\Re', ' in ℝ', cleaned_line)  # combined \in\Re first
    # The lookahead keeps \in from eating the start of longer commands such as
    # \infty, \int, \index or \includegraphics, which later stages handle.
    cleaned_line = re.sub(r'\\in(?![a-zA-Z])', ' in', cleaned_line)
    cleaned_line = re.sub(r'\\C', ' ℂ', cleaned_line)
    cleaned_line = re.sub(r'\\Re', ' ℝ', cleaned_line)
    cleaned_line = re.sub(r'\\RE', ' ℝ', cleaned_line)
    cleaned_line = re.sub(r'\\N', ' ℕ', cleaned_line)
    cleaned_line = re.sub(r'\\nbyn\{(\d+|n)\}', r'N by N', cleaned_line)
    cleaned_line = re.sub(r'\\nbym\{([a-zA-Z\d]+)\}\{([a-zA-Z\d]+)\}', r'N by M', cleaned_line)
    cleaned_line = re.sub(r'\\suchthat', ' such that ', cleaned_line)
    # \vdotswithin is replaced here because the symbol table applied later does not catch it
    cleaned_line = re.sub(r'\\vdotswithin\{[^}]*\}', '...', cleaned_line)
    cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()  # normalize whitespace
    return cleaned_line


In [7]:
def multis(latex_line):
    """Unwrap or drop the multicolumn and multiput commands found on one line.

    A three-argument multicolumn is replaced by its last argument; a
    leftover two-argument multicolumn and any multiput call are removed.
    """
    rewrite_rules = (
        (r'\\multicolumn\{[^{}]*\}\{[^{}]*\}\{([^{}]*)\}', r'\1'),
        (r'\\multicolumn\s*\{[^}]*\}\s*\{[^}]*\}', ''),
        (r'\\multiput\s*\([^)]*\)\s*\([^)]*\)\s*\{[^}]*\}', ''),
    )
    text = latex_line
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)
    return text
    

In [8]:
def convert_math_symbols(latex_line):
    """Replace math-symbol, Greek-letter, dot, arrow and spacing commands.

    Args:
        latex_line: One line of LaTeX source.

    Returns:
        The line with ``digitincirc`` numbers turned into circled digits and
        every command listed in the tables below replaced by its plain-text
        or Unicode counterpart.

    The tables are applied in insertion order (symbols, Greek letters, dots,
    arrows, miscellaneous), so a longer command must be listed before any
    command that is a prefix of it.
    """
    math_symbols = {
        r'\\neq': ' ≠', r'\\leq': '≤', r'\\geq': '≥', r'\\approx': '≈',
        r'\\infty': '∞', r'\\forall': '∀', r'\\exists': '∃',
        r'\\partial': '∂', r'\\nabla': '∇', r'\\times': '×',
        r'\\div': '÷', r'\\pm': '±', r'\\mp': '∓', r'\\sim': '∼', r'\\Dash': '-',
        r'\\mathbb{R}': 'ℝ', r'\\mathbb{N}': 'ℕ', r'\\mathbb{Z}': 'ℤ',
        r'\\mathbb{Q}': 'ℚ', r'\\mathbb{C}': 'ℂ', r'\\sqrt': '√', r'\\subseteq': '⊆',
        r'\\intersection': '∩', r'\\union': '∪', r'\\subsetneq': '⊊', r'\\subset': '⊆',
        r'\\cap': '∩', r'\\cup': '∪',
        r'\\sum': '∑', r'\\directsum': '⊕', r'\\mathbb\{B\}': '𝔹',
        r'\\F': '𝔽', r'\\isomorphicto': '≅', r'\\mapsto': '↦',
        r'\\composed': '∘', r'\\dim': 'dim', r'\\supseteq': '⊇',
        r'\\perpv': '⊥'
    }
    greek_letters = {
        r'\\alpha': 'α', r'\\beta': 'β', r'\\gamma': 'γ', r'\\delta': 'δ',
        r'\\epsilon': 'ε', r'\\zeta': 'ζ', r'\\eta': 'η', r'\\theta': 'θ',
        r'\\iota': 'ι', r'\\kappa': 'κ', r'\\lambda': 'λ', r'\\mu': 'μ',
        r'\\nu': 'ν', r'\\xi': 'ξ', r'\\omicron': 'ο', r'\\pi': 'π',
        r'\\rho': 'ρ', r'\\sigma': 'σ', r'\\tau': 'τ', r'\\upsilon': 'υ',
        r'\\phi': 'φ', r'\\chi': 'χ', r'\\psi': 'ψ', r'\\omega': 'ω',

        r'\\varepsilon': 'ε', r'\\mathcal\{E\}': '𝓔', r'\\mathcal\{O\}': '𝒪',
        r'\\mathscr\{C\}': '𝒞', r'\\mathscr\{N\}': '𝒩', r'\\mathscr\{R\}': 'ℛ',
        r'\\mathcal\{I\}': '𝓘'
    }
    dots = {
        # \cdots is listed before \cdot; otherwise \cdots would come out as '·s'.
        r'\\ldots': '...', r'\\dots': '...', r'\\cdots': '...', r'\\cdot': '·',
        r'\\vdotswithin{=}': '...', r'\\ddots': '...',
        r'\\vdots': '...', r'\\alignedvdots': '...'
    }
    arrows = {
        r'\\rightarrow': '->', r'\\leftarrow': '←', r'\\Rightarrow': '⇒',
        r'\\Leftarrow': '⇐', r'\\uparrow': '↑', r'\\downarrow': '↓',
        r'\\leftrightarrow': '↔', r'\\Leftrightarrow': '⇔', r'\\longrightarrow': '-->',
        r'\\swap': ' ↔ ', r'\\iff': '⇔', r'\\Longrightarrow': '-->'
    }

    # These entries are left unguarded on purpose: e.g. \em -> 'm' also turns
    # \emph into 'mph', which this file's removals() strips afterwards.
    stuff = {
        r'\\qquad': '   ', r'\\quad': '   ', r'\\ell': 'l', r'\\arr': 'r',
        r'\\zero': '0', r'\\text{and}': 'and', r'\\left': '', r'\\right': '',
        r'\\suchthat': 'such that', r'\\em': 'm', r'\\bigl': '[', r'\\bigr': ']',
        r'\\big': '', r'\\small': '', r'\\not': ' not ', r'\\implies': '⇒'
    }
    circled_numbers = {
        '1': '①', '2': '②', '3': '③', '4': '④', '5': '⑤',
        '6': '⑥', '7': '⑦', '8': '⑧', '9': '⑨', '10': '⑩',
        '11': '⑪', '12': '⑫', '13': '⑬', '14': '⑭', '15': '⑮',
        '16': '⑯', '17': '⑰', '18': '⑱', '19': '⑲', '20': '⑳'
    }

    def replace_digitincirc(match):
        # Numbers without a circled glyph (above 20) are kept as plain digits.
        number = match.group(1)
        return circled_numbers.get(number, number)

    # replace \digitincirc{X} with circled numbers
    cleaned_line = re.sub(r'\\digitincirc\{(\d+)\}', replace_digitincirc, latex_line)

    # A Greek-letter command only counts when no further letter follows, so
    # \nu no longer mangles \nullspace and \mu no longer mangles \multicolumn.
    ends_command = r'(?![a-zA-Z])'
    guarded_greek = {
        (latex + ends_command if latex[-1].isalpha() else latex): uni_code
        for latex, uni_code in greek_letters.items()
    }

    all_rules = {**math_symbols, **guarded_greek, **dots, **arrows, **stuff}
    for latex, uni_code in all_rules.items():
        cleaned_line = re.sub(latex, uni_code, cleaned_line)
    return cleaned_line

In [9]:
def convert_matrix(latex_line):
    """Turn matrix environment markup on one line into bracketed plain text.

    An opening matrix environment (with its optional arguments) becomes
    "([", a closing one becomes "])", every ampersand becomes two spaces
    and every double backslash becomes a tab.  The result is stripped.
    """
    opened = re.sub(
        r'\\begin{(mat|amat|vmat|pmat|pmatrix|smallmatrix|vmatrix)}(\[[^\]]*\])?(\{\d+\})?',
        '([',
        latex_line,
    )
    closed = re.sub(
        r'\\end{mat}|\\end{amat}|\\end{vmat}|\\end{pmat}|\\end{pmatrix}|\\end{smallmatrix}|\\end{vmatrix}',
        '])',
        opened,
    )
    # column separators become plain spacing
    columns_spaced = closed.replace('&', '  ')
    # row breaks become tabs
    rows_tabbed = re.sub(r'\\\\', '\t', columns_spaced)
    return rows_tabbed.strip()



In [10]:
def begs_and_ends(latex_line):
    """Replace or drop the begin/end markers of LaTeX environments on one line.

    Display-math style environments are turned into a dollar sign, the
    example environment opens with the words "For example", and the other
    listed environments are removed.  The rules run in the order given and
    the result is stripped of surrounding whitespace.
    """
    environment_rules = (
        # equations
        (r'\\begin\{equation\*\}', '$'),
        (r'\\end\{equation\*\}', '$'),
        # linsys
        (r'\\begin{linsys}(\[[^\]]*\])?(\{\d+\})?', '$'),
        (r'\\end\{linsys}', '$'),
        # align / aligned
        (r'\\begin\{align\*\}', '$'),
        (r'\\end\{align\*\}', '$'),
        (r'\\begin\{align}', '$'),
        (r'\\end\{aligned}', '$'),
        (r'\\begin\{aligned}', '$'),
        (r'\\end\{align}', '$'),
        # example
        (r'\\begin\{example}', 'For example'),
        (r'\\end\{example}', ''),
        # definition
        (r'\\begin\{definition}', ''),
        (r'\\end\{definition}', ''),
        # proof
        (r'\\begin\{proof}', ''),
        (r'\\end\{proof}', ''),
        # lemma
        (r'\\begin\{lemma}', ''),
        (r'\\end\{lemma}', ''),
        # multiline / multline
        (r'\\begin\{multiline\*\}', '$'),
        (r'\\end\{multiline\*\}', '$'),
        (r'\\begin\{multline\*\}', '$'),
        (r'\\end\{multline\*\}', '$'),
        # center
        (r'\\begin\{center}', ''),
        (r'\\end\{center}', ''),
        # exparts
        (r'\\begin\{exparts\*\}', ''),
        (r'\\end\{exparts\*\}', ''),
        (r'\\begin\{exparts}', ''),
        (r'\\end\{exparts}', ''),
        # answer
        (r'\\begin\{answer}', ''),
        (r'\\end\{answer}', ''),
        # exercises
        (r'\\begin\{exercises}', ''),
        (r'\\end\{exercises}', ''),
        # enumerate
        (r'\\begin\{enumerate}', ''),
        (r'\\end\{enumerate}', ''),
        # array
        (r'\\begin{array}(\{[a-zA-Z]\})?', '$'),
        (r'\\end\{array}', '$'),
        # theorem
        (r'\\begin\{theorem}', ''),
        (r'\\end\{theorem}', ''),
        # corollary
        (r'\\begin\{corollary}', ''),
        (r'\\end\{corollary}', ''),
        # input
        (r'\\begin\{input}\s*', ''),
        (r'\\end\{input}', ''),
        # remark
        (r'\\begin\{remark}', ''),
        (r'\\end\{remark}', ''),
        # tabular
        (r'\\begin\{tabular}.*?', ''),
        (r'\\end\{tabular}', ''),
        # minipage
        (r'\\begin\{minipage}.*?', ''),
        (r'\\end\{minipage}', ''),
        # lstlisting
        (r'\\begin\{lstlisting}.*?', ''),
        (r'\\end\{lstlisting}', ''),
        (r'\\begininput', ''),
        (r'\\endinput', ''),
        (r'\\begin\{computercode\}', ''),
        (r'\\end\{computercode\}', ''),
        (r'\\begin\{itemize}', ''),
        (r'\\end\{itemize}', ''),
        (r'\\begin\{tfae}', ''),
        (r'\\end\{tfae}', ''),
        (r'\\begin\{aligncolondecimal\}(\{[^{}]*\})?', 'aligned '),
        (r'\\end\{aligncolondecimal\}', ''),
        (r'\\begin\{CD\}', ''),
        (r'\\end\{CD\}', ''),
        (r'\\begin\{strings\}', ''),
        (r'\\end\{strings\}', ''),
        (r'\\begin\{picture\}\([^)]*\)', ''),
        (r'\\end\{picture\}', ''),
        (r'\\begin\{quotation\}', ''),
        (r'\\end\{quotation\}', ''),
        (r'\\begin\{split\}', ''),
        (r'\\end\{split\}', ''),
    )

    text = latex_line
    for marker, substitute in environment_rules:
        text = re.sub(marker, substitute, text)
    return text.strip()

In [11]:
def steps(latex_line):
    """Spell out the grstep and repeatedgrstep row-operation commands.

    The braced argument is kept between dollar signs, prefixed with
    "for step" or "repeat step"; an optional bracketed argument is dropped.
    """
    single_steps_done = re.sub(
        r'\\grstep(?:\[[^\]]*\])?\{([^{}]*)\}',
        r'for step $\1$',
        latex_line,
    )
    return re.sub(
        r'\\repeatedgrstep(?:\[[^\]]*\])?\{([^{}]*)\}',
        r'repeat step $\1$',
        single_steps_done,
    )
    

In [12]:
def col_vecs(latex_line):
    r"""Rewrite every ``\colvec[...]{...}`` on the line as ``[...]``.

    Args:
        latex_line: One line of (partly converted) LaTeX.

    Returns:
        The line with each column-vector command replaced by its argument
        in square brackets; inside the argument every backslash and every
        ampersand becomes a space.

    The closing brace is found by counting nested braces, so arguments such
    as ``x_{1}`` or ``\frac{1}{2}`` are kept whole.  A command whose braces
    do not balance on this line is left untouched.
    """
    opener = re.compile(r'\\colvec(?:\[[^\]]*\])?\{')
    pieces = []
    cursor = 0
    while True:
        found = opener.search(latex_line, cursor)
        if found is None:
            break
        # Walk forward to the brace that closes the vector argument.
        depth = 1
        index = found.end()
        while index < len(latex_line) and depth:
            char = latex_line[index]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            index += 1
        if depth:
            # Unbalanced on this line: keep the remainder as it is.
            break
        body = latex_line[found.end():index - 1]
        pieces.append(latex_line[cursor:found.start()])
        pieces.append("[" + body.replace("\\", " ").replace("&", " ") + "]")  # Replace separators
        cursor = index
    pieces.append(latex_line[cursor:])
    return "".join(pieces)

In [13]:
def row_vecs(latex_line):
    r"""Rewrite every ``\rowvec[...]{...}`` on the line as ``[...]``.

    Args:
        latex_line: One line of (partly converted) LaTeX.

    Returns:
        The line with each row-vector command replaced by its argument in
        square brackets; inside the argument every backslash and every
        ampersand becomes a space.

    The closing brace is found by counting nested braces, so arguments such
    as ``x_{1}`` or ``\frac{1}{2}`` are kept whole.  A command whose braces
    do not balance on this line is left untouched.
    """
    opener = re.compile(r'\\rowvec(?:\[[^\]]*\])?\{')
    pieces = []
    cursor = 0
    while True:
        found = opener.search(latex_line, cursor)
        if found is None:
            break
        # Walk forward to the brace that closes the vector argument.
        depth = 1
        index = found.end()
        while index < len(latex_line) and depth:
            char = latex_line[index]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            index += 1
        if depth:
            # Unbalanced on this line: keep the remainder as it is.
            break
        body = latex_line[found.end():index - 1]
        pieces.append(latex_line[cursor:found.start()])
        pieces.append("[" + body.replace("\\", " ").replace("&", " ") + "]")
        cursor = index
    pieces.append(latex_line[cursor:])
    return "".join(pieces)

In [14]:
def convert_square_bracket_vectors(latex_line):
    """Replace escaped brackets, signs, spacing and dot commands with plain text.

    Escaped square brackets become ordinary brackets, escaped plus/minus
    lose their backslash, a backslash followed by whitespace becomes one
    space, the dot commands become "...", and whitespace runs are collapsed.
    The result is stripped.
    """
    substitutions = (
        (r'\\\[(.*?)\\\]', r'[\1]'),   # \[ ... \] -> [ ... ]
        (r'\\\+', '+'),                # \+ -> +
        (r'\\\-', '-'),                # \- -> -
        (r'\\\s+', ' '),               # backslash + whitespace -> one space
        (r'\\\t+', ' '),               # backslash + tabs -> one space
        (r'\\dots|\\ldots|\\cdots|\\alignedvdots', '...'),  # the dots again
        (r'\s+', ' '),                 # remaining spacing inside the vector
    )
    text = latex_line
    for pattern, plain in substitutions:
        text = re.sub(pattern, plain, text)
    return text.strip()

In [15]:
def convert_absval(latex_line):
    """Write each absval command as its argument between vertical bars.

    The argument may contain one level of nested braces.
    """
    absval_call = r'\\absval\{([^{}]+(\{[^{}]*\}[^{}]*)*)\}'
    return re.sub(absval_call, r'|\1|', latex_line)


In [16]:
def replace_latex_command(text, command, left_symbol, right_symbol):
    r"""Replace ``\command{...}`` by ``left_symbol...right_symbol``.

    Args:
        text: The string to process.
        command: Command name without the leading backslash.
        left_symbol: Text that replaces the command name and its opening brace.
        right_symbol: Text that replaces the matching closing brace.

    Returns:
        The rewritten string.  Braces nested inside the argument are copied
        through unchanged, and nested occurrences of the same command are
        rewritten too.  A command that is never closed keeps its
        ``left_symbol`` and the rest of the text is copied as is.
    """
    opener = f"\\{command}{{"
    output = []
    # One entry per currently open command: how many plain braces are open inside it.
    open_braces = []
    i = 0

    while i < len(text):
        if text.startswith(opener, i):  # Found \command{
            open_braces.append(0)
            output.append(left_symbol)
            i += len(opener)  # Move past the command name and opening {
            continue

        char = text[i]
        if open_braces:
            if char == "{":  # Nested {
                open_braces[-1] += 1
            elif char == "}":
                if open_braces[-1] == 0:  # This brace closes the command itself
                    open_braces.pop()
                    output.append(right_symbol)
                    i += 1
                    continue
                open_braces[-1] -= 1

        output.append(char)
        i += 1

    return "".join(output)

In [17]:
def convert_sets(latex_line):
    """Rewrite each set command on the line as a plain pair of curly braces."""
    return replace_latex_command(latex_line, "set", "{", "}")

In [18]:
def removals(latex_line):
    """Strip or rewrite the remaining LaTeX markup on one line.

    A long, fixed sequence of regex substitutions is applied: inline-math
    delimiters become dollar signs, structural and layout commands are
    dropped, and a number of commands are replaced by a plain-text rendering
    of their arguments.  The order matters: several later rules only see
    what earlier rules left behind, and a few can no longer match at all
    (noted inline).  The result is stripped of surrounding whitespace.

    Args:
        latex_line: One line of (partly converted) LaTeX.

    Returns:
        The cleaned line as a string.
    """
    cleaned_line = re.sub(r'\\\(', '$', latex_line)       #clean the dollars
    cleaned_line = re.sub(r'\\\)', '$', cleaned_line)
    # NOTE(review): this pattern has three backslashes, so it looks for a
    # backslash followed by a carriage return and "ecommended"; the plain
    # \recommended command is handled by a later rule in this function.
    cleaned_line = re.sub(r'\\\recommended\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\item\s*', '', cleaned_line) # remove no content
    cleaned_line = re.sub(r'\\section\{[^{}]*\}', '', cleaned_line) # remove with content
    cleaned_line = re.sub(r'\\subsection\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\label\{[^{}]*\}', '', cleaned_line)
    # NOTE(review): every \item was already removed above, so this cannot match.
    cleaned_line = re.sub(r'\\item\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\partsitem\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\partsitem\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\index\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\noindent\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\index\s*\{.*?\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbyexample\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbyexercise\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbytheorem\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbycorollary\s*\{[^{}]*\}', '', cleaned_line) # drops cross-reference titles that were not interpretable
    cleaned_line = re.sub(r'\\nearbyremark\s*\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbydefinition\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\ref\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\appendrefs\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\includegraphics(?:\[[^\]]*\])?\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\recommended\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\nearbylemma\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\tag\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\cite\{[^{}]*\}', '', cleaned_line)
    #new
    cleaned_line = re.sub(r'\\spaceforemptycolumn\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\raisebox\{[^{}]*\}(\[[^\]]*\])?(\[[^\]]*\])?\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\small\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\puzzle\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\hfill\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\dotproduct\s*', 'dot product of ', cleaned_line)
    cleaned_line = re.sub(r'\\dotprod\s*', 'dot product of ', cleaned_line)
    cleaned_line = re.sub(r'\\rm\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\answerasgiven\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\medskip\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\shortstack\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\subsectionoptional\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\hline\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\tag\*\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\parbox\{[^{}]*\}\{[^{}]*\}', '', cleaned_line)




    # Document structure and boxes: removed together with their argument.
    cleaned_line = re.sub(r'\\chapter\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\hbox\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\topic\{[^{}]*\}', '', cleaned_line)
    # NOTE(review): \answerasgiven was already removed above, so this cannot match.
    cleaned_line = re.sub(r'\\answerasgiven\{[^{}]*\}', ' answer as given ', cleaned_line)
    cleaned_line = re.sub(r'\\par\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\hspace\*?\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\includegraphics(\[[^\]]*\])?\{[^{}]*\}', '', cleaned_line)

    # Inline listings delimited by exclamation marks are dropped entirely.
    cleaned_line = re.sub(r'\\lstinline(\[[^\]]*\])?![^!]*?!', '', cleaned_line)

    # Font/text wrappers: keep the argument, drop the command.
    cleaned_line = re.sub(r'\\textbf\{(.*?)\}', r'\1', cleaned_line, flags=re.DOTALL)
    cleaned_line = re.sub(r'\\textit\{(.*?)\}', r'\1', cleaned_line, flags=re.DOTALL)
    cleaned_line = re.sub(r'\\text\{(.*?)\}', r'\1', cleaned_line, flags=re.DOTALL)
    cleaned_line = re.sub(r'\\spanof\{(.*?)\}', r'span(\1)', cleaned_line, flags=re.DOTALL)
    cleaned_line = re.sub(r'\\mbox\{(.*?)\}', r'\1', cleaned_line, flags=re.DOTALL)


    cleaned_line = re.sub(r'\\renewcommand\{[^{}]*\}\{[^{}]*\}', '', cleaned_line)

    # Table column-spec leftovers, partial rules and horizontal rules.

    cleaned_line = re.sub(r'@\{[^{}]*\}', '', cleaned_line)

    cleaned_line = re.sub(r'\\cline\{[^{}]*\}', '', cleaned_line)

    cleaned_line = re.sub(r'\\rule\{[^{}]*\}\{[^{}]*\}', '', cleaned_line)



    # Commands rewritten into a plain-text rendering of their arguments.
    cleaned_line = re.sub(r'\\map\{([^\{\}]+)\}\{([^\{\}]+)\}\{([^\{\}]+)\}', r'\1: \2 → \3', cleaned_line)
    cleaned_line = re.sub(r'\\norm\{([^{}]*)\}', r'‖\1‖', cleaned_line)
    cleaned_line = re.sub(r'\\sequence(?:\[[^\]]*\])?\{([^{}]*)\}', r'\1', cleaned_line) # keep only the braced argument
    cleaned_line = re.sub(r'\\binom\{([^{}]*)\}\{([^{}]*)\}', r'(\1 choose \2)', cleaned_line)
    cleaned_line = re.sub(r'\\lincombo\{([^{}]*)\}', r'Linear combination of \1', cleaned_line)
    cleaned_line = re.sub(r'\\rep\{([^{}]*)\}\{([^{}]*)\}', r'Representation of (\1)', cleaned_line)
    cleaned_line = re.sub(r'\\smash\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\trans\{([^{}]*)\}', r'\1ᵀ', cleaned_line)
    cleaned_line = re.sub(r'\\rank\{([^{}]*)\}', r'rk(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\hypertarget\{[^{}]*\}\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\votinggraphic\{([^{}]*)\}', r'Voting graphic: \1', cleaned_line)
    cleaned_line = re.sub(r'\\votepreflist\{([^{}]*)\}\{([^{}]*)\}\{([^{}]*)\}', r'(\1, \2, \3)', cleaned_line)
    cleaned_line = re.sub(r'\\frac\{([^{}]*)\}\{([^{}]*)\}', r'(\1/\2)', cleaned_line)
    cleaned_line = re.sub(r'\\mapsunder\{([^{}]*)\}', r'\1 ↓', cleaned_line)
    cleaned_line = re.sub(r'\\rep\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\rangespace\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\generalmatrix\{([^{}]*)\}\{([^{}]*)\}\{([^{}]*)\}', r'general_matrix(\1, \2, \3)', cleaned_line)
    cleaned_line = re.sub(r'\\longmapsto', '↦', cleaned_line)
    cleaned_line = re.sub(r'\\overset\{([^{}]*)\}\{([^{}]*)\}', r'\1 over \2', cleaned_line)
    cleaned_line = re.sub(r'\\underset\{([^{}]*)\}\{([^{}]*)\}', r'\1 under \2', cleaned_line)
    cleaned_line = re.sub(r'\\highlight\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\raisebox\{[^{}]*\}\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\trace', 'tr', cleaned_line)
    cleaned_line = re.sub(r'\\hline', '', cleaned_line)
    cleaned_line = re.sub(r'\\identity', 'I', cleaned_line)
    cleaned_line = re.sub(r'\\displaystyle', '', cleaned_line)
    cleaned_line = re.sub(r'\\vcenteredhbox\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\proj', 'proj', cleaned_line)
    cleaned_line = re.sub(r'\\scriptstyle', '', cleaned_line)
    cleaned_line = re.sub(r'\\composed\{([^{}]+)\}\{([^{}]+)\}', r'\1 ∘ \2', cleaned_line)
    cleaned_line = re.sub(r'\\xrightarrow\{[^{}]*\}', r'→', cleaned_line)
    cleaned_line = re.sub(r'\\tiny', '', cleaned_line)
    cleaned_line = re.sub(r'\\magicsquares_([a-zA-Z0-9]+)', r'MagicSquare_{\1}', cleaned_line)
    cleaned_line = re.sub(r'\\semimagicsquares_([a-zA-Z0-9]+)', r'SemiMagicSquare_{\1}', cleaned_line)
    cleaned_line = re.sub(r'\\makebox', '', cleaned_line)
    cleaned_line = re.sub(r'\\colwidth\]\{\$[^{}]*\$\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\newcommand\{\\[^{}]*\}\{\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\texttt\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\section\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\deter\{([^{}]*)\}', r'determinant(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\substack\{([^}]*)\}', r'\1', cleaned_line) 
    cleaned_line = re.sub(r'\\sgn\{?([^{}]*)\}?', r'sgn(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\closedinterval\{([^{}]*)\}\{([^{}]*)\}', r'[\1, \2]', cleaned_line)
    cleaned_line = re.sub(r'\\parbox\{[^{}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\adj\{?([^{}]*)\}?', r'adj(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\hfil', '', cleaned_line)
    cleaned_line = re.sub(r'\\mathbin', '', cleaned_line)
    cleaned_line = re.sub(r'\\llap\{[^{}]*\}', '', cleaned_line)
    # NOTE(review): an earlier rule already unwrapped \rangespace{...}, so this one finds nothing.
    cleaned_line = re.sub(r'\\rangespace\{([^{}]*)\}', r'range(\1)', cleaned_line) # spaces
    cleaned_line = re.sub(r'\\gennullspace\{([^{}]*)\}', r'general nullspace(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\genrangespace\{([^{}]*)\}', r'general rangespace(\1)', cleaned_line)
    cleaned_line = re.sub(r'\\put\((-?\d+),(-?\d+)\)\{([^{}]*)\}', r'At (\1, \2): \3', cleaned_line)
    cleaned_line = re.sub(r'\\rotatebox\{-?\d+\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\cat\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\matrixvenlarge\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'\\scriptsize\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\announcecomputercode\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\endinput\s*', '', cleaned_line)
    cleaned_line = re.sub(r'\\def\\smashdp#\d+\{[^\}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\smashdp\{([^{}]*)\}', r'\1', cleaned_line)





    cleaned_line = re.sub(r'\\nbyn\{([^{}]*)\}', r'N by \1', cleaned_line)
    cleaned_line = re.sub(r'\\begin\{subarray\}\{[^\}]*\}([^{}]*)\\end\{subarray\}', r'\1', cleaned_line)

    # accents placed above the argument (combining overline / circumflex)
    cleaned_line = re.sub(r'\\compconj\{([^{}]*)\}', r'\1̅', cleaned_line)
    cleaned_line = re.sub(r'\\hat\{([^{}]*)\}', r'\1̂', cleaned_line)

    #spaces
    cleaned_line = re.sub(r'\\rowspace\{([^}]*)\}', r'rowspace \1', cleaned_line)
    # The rules below also match command names that have lost their backslash
    # (the backslash is optional or absent in the pattern).
    cleaned_line = re.sub(r'\\?index\s*\{[^\}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\?includegraphics\s*\{[^\}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\?definend\s*\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'defbetweenrowvspace\([^)]*\)\s*(\[[^\]]*\])?\s*(\[[^\]]*\])?\{[^}]*\}', '', cleaned_line)
    cleaned_line = re.sub(r'definend\s*\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'textit\s*\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'mph\s*\{([^{}]*)\}', r'\1', cleaned_line)
    cleaned_line = re.sub(r'subsection\s*\{([^{}]*)\}', '', cleaned_line)

                    



    def clean_defind(match):
        incontent = match.group(1)  # keep the inside for syntactic/semantic purposes
        clean_content = re.sub(r'[^a-zA-Z0-9 ]', '', incontent)
        return clean_content 

    # NOTE(review): \definend{...} was already unwrapped above, so this finds nothing.
    cleaned_line = re.sub(r'\\definend\{([^{}]*)\}', clean_defind, cleaned_line)
    # Spacing and punctuation escapes.
    cleaned_line = re.sub(r'\\/', '', cleaned_line)
    cleaned_line = re.sub(r'\\\.\.\.', '...', cleaned_line)
    cleaned_line = re.sub(r'\\\.', '', cleaned_line)
    cleaned_line = re.sub(r'\\>\s*', ' ', cleaned_line)
    cleaned_line = re.sub(r'\{@\{.*?\}@\}', '', cleaned_line)
    cleaned_line = re.sub(r'\{@\{[^{}]*?\}@\}', '', cleaned_line)
    cleaned_line = re.sub(r'\{[rlc|]+\}', '', cleaned_line)
    cleaned_line = re.sub(r'\\!+', '', cleaned_line)
    cleaned_line = re.sub(r'\\,', '', cleaned_line)
    cleaned_line = re.sub(r'\\;', '', cleaned_line)
    cleaned_line = re.sub(r'@>(.*?)>(.*?)>', r' → (\2) ', cleaned_line)  # for cds
    cleaned_line = re.sub(r'@V\{(.*?)\}VV', r' ↓ (\1) ', cleaned_line) 
    cleaned_line = re.sub(r'@A\{(.*?)\}AA', r' ↑ (\1) ', cleaned_line)
    # Escaped dollar signs become plain ones; dollar pairs separated only by whitespace collapse to one.
    cleaned_line = re.sub(r'\\\$', '$', cleaned_line)
    cleaned_line = re.sub(r'\$\s+\$', '$', cleaned_line)

    return cleaned_line.strip() 

In [19]:
def convert_math_funcs(latex_line):
    """Rewrite remaining backslash commands as ``name(argument)`` text.

    Every backslash followed by letters is matched, optionally with a
    numeric exponent and either a braced argument or a bare token.  Without
    an argument only the command name is kept (an exponent is dropped
    too); otherwise the result is ``name(arg)`` or ``name^k(arg)``.  The
    returned line is stripped.
    """
    # aggressive pattern: name, optional ^digits, then {argument} or a bare token
    command = re.compile(r'\\([a-zA-Z]+)(?:\^(\d+))?\s*(?:\{([^{}]*)\}|\s*([a-zA-Zα-ωΑ-Ω0-9_θπ/+-]+))?')

    def render(found):
        name, power, braced, bare = found.groups()
        operand = braced if braced else bare

        if not operand:
            return name
        if not power:
            return f"{name}({operand})"
        # placeholder is swapped for the actual exponent
        return f"{name}ⁿ({operand})".replace("ⁿ", f"^{power}")

    return command.sub(render, latex_line).strip()

In [20]:
def remove_graphics(latex_line):
    """Delete picture-drawing commands from one line and tidy the spacing.

    The setlength, put, vector, line, raisebox and vcenteredhbox calls
    matched by the patterns below (with or without a leading backslash)
    are removed; whitespace runs are then collapsed and the line stripped.
    """
    drawing_commands = (
        r'\\?setlength\s*\{[^{}]*\}\s*\{[^{}]*\}',
        r'\\?put\s*\([^)]*\)\s*\{[^{}]*\}',
        r'\\?put\s*\([^)]*\)\s*\{\s*\}',
        r'\\?vector\s*\([^)]*\)\s*\{[^{}]*\}',
        r'\\?line\s*\([^)]*\)\s*\{[^{}]*\}',
        r'\\?raisebox\s*\([^)]*\)\s*(\[[^\]]*\])?\s*(\[[^\]]*\])?\s*\{[^{}]*\}',
        r'\\?vcenteredhbox\s*\([^)]*\)',
    )
    text = latex_line
    for pattern in drawing_commands:
        text = re.sub(pattern, '', text)
    return re.sub(r'\s+', ' ', text).strip()

In [21]:
def convert_latex_to_text(latex_line):
    """Run one LaTeX line through every cleaning stage, keeping $ for equations.

    The stages are applied strictly in the order listed; each receives the
    previous stage's output.  The final text is stripped.
    """
    stages = (
        clean_latex_to_text,
        multis,
        convert_math_symbols,
        convert_matrix,
        begs_and_ends,
        steps,
        row_vecs,
        col_vecs,
        convert_square_bracket_vectors,
        convert_absval,
        convert_sets,
        removals,
        convert_math_funcs,
        remove_graphics,
    )
    text = latex_line
    for stage in stages:
        text = stage(text)
    return text.strip()

In [22]:
# Convert the corpus line by line; each entry of `content` is one line of the merged file.
processed_content = [convert_latex_to_text(line) for line in content]

In [23]:
# Join the converted lines into one string, separated by single spaces.
pros_cont_str = ' '.join(processed_content)
#print(pros_cont_str)


In [24]:
# Persist the converted corpus as UTF-8; the sentence-splitting cells read this file.
final_output_file = "lina_corpus.txt"
with open(final_output_file, "w", encoding="utf-8") as file:
    file.write(pros_cont_str)


In [25]:
# Split the converted corpus into sentences with spaCy and save one per line.
nlp = spacy.load("en_core_web_sm")

# Read with an explicit encoding: the corpus is written as UTF-8 and contains
# non-ASCII math symbols, so the platform default encoding may fail or garble it.
with open(final_output_file, "r", encoding="utf-8") as f:
    lina = f.read()

# spaCy rejects texts longer than max_length; keep the previous 2,000,000
# floor but grow it when the corpus is longer than that.
nlp.max_length = max(2000000, len(lina))
doc = nlp(lina)

sentences = [sent.text.strip() for sent in doc.sents]

sentence_output_file = "lina_spacy_sentences.txt"
with open(sentence_output_file, "w", encoding="utf-8") as file:
    file.write("\n".join(sentences))

In [26]:
# Sentence-split and tokenize the converted corpus with Stanza.
stanza.download("en")
nlp = stanza.Pipeline("en", processors="tokenize", use_gpu=True)
with open("lina_corpus.txt", "r", encoding="utf-8") as f:
    lina = f.read()

doc = nlp(lina)

# One sentence per line, tokens separated by single spaces.
with open("lina_stanza_sentences.txt", "w", encoding="utf-8") as f:
    for sentence in doc.sentences:
        tokenized = " ".join([word.text for word in sentence.words])
        f.write(tokenized + "\n")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 111MB/s]
2025-03-31 20:15:08 INFO: Downloaded file to /Users/soteresnkosdes/stanza_resources/resources.json
2025-03-31 20:15:08 INFO: Downloading default packages for language: en (English) ...
2025-03-31 20:15:09 INFO: File exists: /Users/soteresnkosdes/stanza_resources/en/default.zip
2025-03-31 20:15:11 INFO: Finished downloading models and saved to /Users/soteresnkosdes/stanza_resources
2025-03-31 20:15:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 34.7MB/s]
2025-03-31 20:15:11 INFO: Downloaded file to /Users/soteresnkosdes/stanza_resources/resources.json
2025-03-31 20:15:11 INFO: Loading these models for

In [28]:
# Parse every tokenized sentence with Stanza and export the result as CoNLL-U.

# Fetch the English models first.  The download call is kept out of the
# device test: its return value says nothing about GPU availability.
stanza.download('en')
device = "cuda" if torch.cuda.is_available() else "cpu"
nlp = stanza.Pipeline(
    "en",
    processors="tokenize,pos,lemma,depparse",
    use_gpu=(device == "cuda")
)

# Non-empty lines of the sentence file, one sentence each.
sentence_file = "lina_stanza_sentences.txt"
with open(sentence_file, "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]


# Each line is parsed on its own; the resulting sentence objects are collected.
all_sentences = []
for line in tqdm(sentences, desc="Processing Sentences", unit="sent"):
    doc = nlp(line)
    all_sentences.extend(doc.sentences)


if all_sentences:
    # Build an empty document and attach the collected sentences to it so
    # that they can be written out in one call.
    doc = nlp("")
    doc.sentences = all_sentences

    # Write to CoNLL-U
    ud_output_file = "lina_universal_dependencies.conllu"
    CoNLL.write_doc2conll(doc, ud_output_file)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 35.5MB/s]
2025-03-31 20:53:00 INFO: Downloaded file to /Users/soteresnkosdes/stanza_resources/resources.json
2025-03-31 20:53:00 INFO: Downloading default packages for language: en (English) ...
2025-03-31 20:53:01 INFO: File exists: /Users/soteresnkosdes/stanza_resources/en/default.zip
2025-03-31 20:53:04 INFO: Finished downloading models and saved to /Users/soteresnkosdes/stanza_resources
2025-03-31 20:53:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 424kB [00:00, 37.3MB/s]
2025-03-31 20:53:04 INFO: Downloaded file to /Users/soteresnkosdes/stanza_resources/resources.json
2025-03-31 20:53:04 INFO: Loading these models fo

In [None]:
#for outputting csv 

#conllu_file_path = "lina_universal_dependencies.conllu"
#csv_output_path = "lina_ud_csv.csv" 


#parsed_data = []
#with open(conllu_file_path, "r", encoding="utf-8") as file:
#    for line in file:
#        line = line.strip()
#        if line.startswith("#") or not line:
#            continue
#        fields = line.split("\t") 
#        if len(fields) == 10:  # a CoNLL-U token line has exactly 10 tab-separated fields
#            parsed_data.append(fields)  

## Dataframe
#columns = ["ID", "TEXT", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]
#df = pd.DataFrame(parsed_data, columns=columns)
#df.to_csv(csv_output_path, index=False, encoding="utf-8")


In [None]:
#for outputting json

#conllu_file_path = "lina_universal_dependencies.conllu"
#json_output_path = "lina_ud_json.json" 

#sentences = []
#current_sentence = []

## Open and read the CoNLL-U file
#with open(conllu_file_path, "r", encoding="utf-8") as file:
#    for line in file:
#        line = line.strip()  # remove whitespace
#        if line.startswith("#"):  #ignore comments
#            continue
#        if not line:  # new sentence (empty line)
#            if current_sentence:  # Append previous sentence
#                sentences.append(current_sentence)
#                current_sentence = []
#            continue
        
#        fields = line.split("\t") 
#        if len(fields) == 10: 
#            token_data = {
#                "id": fields[0],
#                "text": fields[1],
#                "lemma": fields[2],
#                "upos": fields[3],
#                "xpos": fields[4],
#                "feats": fields[5],
#                "head": fields[6],
#                "deprel": fields[7],
#                "deps": fields[8],
#                "misc": fields[9]
#            }
#            current_sentence.append(token_data)


#with open(json_output_path, "w", encoding="utf-8") as json_file:
#     json.dump(sentences, json_file, indent=4, ensure_ascii=False)

In [None]:
def enforce_conllu_format(filepath):
    """Rewrite a CoNLL-U file in place with uniform sentence separation.

    A run of consecutive non-blank lines is treated as one sentence block.
    Blank or whitespace-only lines act only as separators: leading ones are
    dropped, any number of them between two blocks collapses to a single empty
    line, and exactly one empty line follows the last block.  A file that
    contains no non-blank lines is written back empty instead of as a lone
    blank line.

    Args:
        filepath: Path of the file to normalise.  The file is read as UTF-8
            and then overwritten with the normalised content.
    """
    blocks = []
    current = []

    with open(filepath, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if line.strip():
                current.append(line)
            elif current:
                # First separator after a block closes it; further blank
                # lines are ignored because `current` is empty by then.
                blocks.append(current)
                current = []

    # The last block is not followed by a blank line when the file lacks one.
    if current:
        blocks.append(current)

    with open(filepath, "w", encoding="utf-8") as f:
        # Each block ends with its own newline plus one empty separator line.
        f.writelines("\n".join(block) + "\n\n" for block in blocks)

# Normalise blank-line separation of the CoNLL-U file in place.
enforce_conllu_format(filepath="lina_universal_dependencies.conllu")