# 

# Vocabulaire et grammaire du langage TeX (fr)

In [1]:
import pynini
import pandas as pd
import json

# Text encoding passed to every open() call that writes a file in this notebook.
ENCODING = "utf-8"

## 1. Noms de variables

### 1.1 Caractères

In [2]:
# Lowercase Latin letters 'a'..'z', in alphabetical order.
chars = list(map(chr, range(ord('a'), ord('z') + 1)))
# Uppercase Latin letters 'A'..'Z', in the same order.
Chars = list(map(chr, range(ord('A'), ord('Z') + 1)))

In [3]:
# Lowercase Latin letters: each one is spoken as the letter itself.
min_char = dict((letter, [letter]) for letter in chars)

# Uppercase Latin letters: spoken as "grand <lowercase letter>".
maj_char = dict((upper, ["grand " + upper.lower()]) for upper in Chars)

# Calligraphic capitals (\mathcal): spoken as "<lowercase letter> calligraphique".
cal_char = {
    " ".join(("\\mathcal", "{", upper, "}")): [upper.lower() + " calligraphique"]
    for upper in Chars
}

# Blackboard-bold capitals (\mathbb): four spoken variants per letter.
mbb_char = {
    " ".join(("\\mathbb", "{", upper, "}")): [
        lower + " double barre",
        lower + " avec double barre",
        "double barre " + lower,
        "black board " + lower,
    ]
    for upper, lower in ((capital, capital.lower()) for capital in Chars)
}



### 1.2 Lettres grecques

In [4]:
# Names of the lowercase Greek letters; the LaTeX command is the name
# prefixed with a backslash.
greek_min_var_names = [
    "alpha",
    "beta",
    "gamma",
    "delta",
    "epsilon",
    "zeta",
    "eta",
    "theta",
    "iota",
    "kappa",
    "lambda",
    "mu",
    "nu",
    "xi",
    "pi",
    "rho",
    "sigma",
    "tau",
    "upsilon",
    "phi",
    "chi",
    "psi",
    "omega"
]

# Names of the uppercase Greek letters that have their own LaTeX command
# (the others are typeset with plain Latin capitals).
# "Upsilon" and "Psi" were missing from the original list, so the spoken forms
# "grand upsilon" / "grand psi" had no LaTeX counterpart.
greek_maj_var_names = [
    "Gamma",
    "Delta",
    "Theta",
    "Lambda",
    "Xi",
    "Pi",
    "Sigma",
    "Upsilon",
    "Phi",
    "Psi",
    "Omega"
]

In [5]:
# Lowercase Greek letters: "\<name>" is spoken as "<name>".
greek_min_vars = dict(("\\" + name, [name]) for name in greek_min_var_names)
# Two letters also accept an alternative spoken spelling.
greek_min_vars.update({
    "\\xi": ["xi", "ksi"],
    "\\chi": ["chi", "khi"],
})

# Uppercase Greek letters: "\<Name>" is spoken as "grand <name>".
greek_maj_vars = dict(
    ("\\" + name, ["grand " + name.lower()]) for name in greek_maj_var_names
)
greek_maj_vars["\\Xi"] = ["grand xi", "grand ksi"]

# "var" variants: spoken either as one word or as "var <name>".
greek_var_vars = {
    "\\var" + base: ["var" + base, "var " + base]
    for base in ("epsilon", "pi", "rho", "sigma")
}

## 2. Valeurs 

### 2.1 Chiffres

In [6]:
# chiffres
# Digits 0-9: each accepts its French word and the digit character itself.
digit = {
    str(value): [word, str(value)]
    for value, word in enumerate([
        "zero", "un", "deux", "trois", "quatre",
        "cinq", "six", "sept", "huit", "neuf",
    ])
}

### 2.2 Infini

In [7]:
# Infinity symbol and its single spoken form.
infty = {"\\infty": ["infini"]}

### 2.3 Ensembles

In [8]:
# Usual number sets, keyed by their blackboard-bold LaTeX form.
sets = dict([
    ("\\mathbb { N }", [
        "entiers naturels",
        "ensemble des entiers naturels",
        "ensemble grand n",
    ]),
    ("\\mathbb { Z }", [
        "entiers relatifs",
        "ensemble des entiers relatifs",
        "ensemble grand z",
    ]),
    ("\\mathbb { Q }", [
        "rationnels",
        "ensemble des rationnels",
        "ensemble rationnel",
        "ensemble grand q",
    ]),
    ("\\mathbb { R }", [
        "reel",
        "ensemble des reels",
        "ensemble reel",
        "ensemble grand r",
    ]),
    ("\\mathbb { C }", [
        "complexe",
        "ensemble des complexes",
        "ensemble complexe",
        "ensemble grand c",
    ]),
])

## 3. Relations

In [9]:
# Relation symbols: LaTeX token sequence -> accepted spoken French forms.
# NOTE(review): "inclu" looks like a misspelling of "inclus" — confirm against
# the vocabulary the speech front-end actually produces before changing it.
rel = {
    # --- order ---
    ">": ["plus grand", "superieur", "plus grand que", "superieur à"],
    "\\geq": [
        "plus grand ou egal",
        "superieur ou egal",
        "plus grand ou egal à",
        "superieur ou egal à",
    ],
    "<": ["plus petit", "inferieur", "plus petit que", "inferieur à"],
    "\\leq": [
        "plus petit ou egal",
        "inferieur ou egal",
        "plus petit ou egal à",
        "inferieur ou egal à",
    ],

    # --- equality ---
    "=": ["egal", "egal à"],
    "\\neq": ["different", "different de"],
    "\\simeq": ["environ egal", "environ egal à", "isomorphe", "isomorphe à"],
    "\\propto": ["proportionnel", "proportionnel à"],
    "\\sim": ["suit"],

    # --- set membership and inclusion ---
    "\\in": ["appartient", "appartient à", "dans"],
    "\\notin": ["appartient pas", "appartient pas à"],
    "\\subset": ["inclu", "inclu dans"],
    "\\subseteq": ["inclu ou egal", "inclu ou egal à"],
    "\\not\\subset": ["pas inclu", "pas inclu dans", "non inclu", "non inclu dans"],
    "\\not\\subseteq": [
        "pas inclu ou egal",
        "non inclu ou egal",
        "pas inclu ou egal à",
        "non inclu ou egal à",
    ],

    # --- implication, equivalence and convergence arrows ---
    "\\Rightarrow": ["implique"],
    "\\Leftrightarrow": ["equivalent"],
    "\\rightarrow": ["fleche"],
    "\\longrightarrow": ["longue fleche", "tend vers"],
    "\\overset { L } { \\longrightarrow }": ["converge en loi"],
    "\\overset { \\mathbb { P } } { \\longrightarrow }": ["converge en probabilite"],

    # --- spacing between expressions ---
    "\\quad": ["espace"],
}

## 4. Opérateurs

### 4.1 Quantificateurs logiques

In [10]:
# Logical quantifiers and their spoken forms.
quantif = dict([
    ("\\forall", ["pour tout"]),
    ("\\exists", ["existe"]),
    ("\\exists !", ["existe un unique", "existe unique"]),
])

### 4.2 Opérateurs unaires

In [11]:
# Unary operators: sign, negation and differential operators.
un_op = {
    "+": ["plus"],
    "-": ["moins"],
    "\\neg": ["non", "negation", "negation de"],
    "\\partial": [
        "d rond",
        "d rond de",
        "derivee partielle",
        "derivee partielle de",
    ],
    "\\nabla": ["gradient", "gradient de"],
}

### 4.3 Opérateurs binaires

In [12]:
# Binary operators: arithmetic, composition, set and logical connectives.
bin_op = dict([
    ("+", ["plus"]),
    ("-", ["moins"]),
    (".", ["point", "scalaire"]),
    ("\\circ", ["rond"]),
    ("\\cup", ["union"]),
    ("\\cap", ["inter"]),
    ("\\times", ["croix"]),
    ("\\otimes", ["tenseur", "produit tensoriel", "croix rond"]),
    ("\\oplus", ["somme directe", "plus rond"]),
    ("\\setminus", ["prive", "prive de"]),
    ("\\vee", ["ou"]),
    ("\\wedge", ["et"]),
])

## 5. Indice et exposant

In [13]:
# indice
subscript = {
    "_":["indice", "underscore"]
}

# exposant 
supscript = {
    "^":["exposant", "puissance", "supscript"]
}

## 6. Fonctions et itérateurs

### 6.1 Fonctions usuelles

In [14]:
# Named functions and probability laws: LaTeX token sequence -> spoken French forms.
# Fixes relative to the previous version:
#   * the imaginary-part key used the undefined command "\mathrn"; it is "\mathrm";
#   * the spoken form of the real part was misspelled "partie relle".
func_names = {

    # usual function symbols
    '\\min':['minimum', 'min'],
    '\\max':['maximum', 'max'],
    '\\exp':['exponentielle', 'exp'],
    '\\log':['logarithme', 'log'],
    '\\cos':['cosinus', 'cos'],
    '\\sin':['sinus', 'sin'],
    '\\arccos':['arccosinus', 'arc cosinus', 'arccos', 'arc cos'],
    '\\arcsin':['arcsinus', 'arc sinus', 'arcsin', 'arc sin'],
    '\\arctan':['arctangente', 'arc tangente', 'arctan', 'arc tan'],
    '\\deg':['deg', 'degre'],
    '\\dim':['dim', 'dimension'],
    '\\det':['det', 'determinant'],
    '\\mathrm { Re }':['partie reelle'],
    '\\mathrm { Im }':['partie imaginaire'],
    '\\mathbb { P }':['probabilite'],
    '\\mathbb { E }':['esperance'],
    '\\mathbb { V }':['variance'],

    # probability laws
    '\\mathrm { Ber }':['loi de bernoulli', 'loi bernoulli', 'ber'],
    '\\mathrm { Bin }':['loi binomiale', 'bin'],
    '\\mathcal { G }':['loi geometrique'],
    '\\mathcal { H }':['loi hypergeometrique'],
    '\\mathcal { P }':['loi de poisson'],
    '\\mathcal { N }' : ['loi normale'],
    '\\mathcal { E }':['loi exponentielle'],
    '\\Gamma':['loi gamma'],
    '\\chi ^ 2':['chi deux', 'khi deux'],
    '\\mathcal { T }':['loi de student'],

}

### 6.2 Itérateurs

In [15]:
# Iterated / limit-style operators and their spoken forms.
# NOTE: the name `iter` shadows the Python builtin for the rest of the session;
# it is kept because a later cell refers to this table by that name.
iter = dict([
    ("\\sup", ["sup", "borne superieure"]),
    ("\\inf", ["inf", "borne inferieure"]),
    ("\\lim", ["lim", "limite"]),
    ("\\bigotimes", ["le produit tensoriel"]),
    ("\\bigoplus", ["la somme directe"]),
    ("\\bigcup", ["l' union"]),
    ("\\bigcap", ["l' intersection"]),
    ("\\sum", ["somme"]),
    ("\\prod", ["produit"]),
    ("\\int", ["integrale"]),
    ("\\mathrm { argmin }", ["argmin", "arg min", "minimiseur"]),
    ("\\mathrm { argmax }", ["argmax", "arg max", "maximiseur"]),
])

### 6.3 Fonctions avec contrôle de séquence

In [16]:
# Functions whose arguments are delimited by sequence-control keywords.
# NOTE(review): in LaTeX "\|" typesets a double vertical bar, whereas absolute
# value is normally written with "|" or "\lvert" — confirm that mapping
# "valeur absolue" to "\|" is intended before relying on it.
func_cseq = {
    "\\frac": ["fraction", "rapport"],
    "\\sqrt": ["racine"],
    "\\binom": ["binomial", "coefficient binomial"],
    "\\lVert": ["norme"],
    "\\rVert": ["fermer la norme", "fin de la norme"],
    "\\|": ["valeur absolue"],
}

## 7. Symboles spéciaux

### 7.1 Caractères spéciaux

In [17]:
# Punctuation and other special symbols with their spoken forms.
symb = dict([
    (":", ["deux points", "tel que"]),
    ("\\dots", ["trois points", "trois petits points"]),
    (",", ["virgule"]),
    ("'", ["apostrophe", "prime"]),
    ("^ *", ["etoile", "star"]),
    ("!", ["point d' exclamation", "factoriel"]),
    ("?", ["point d' interrogation"]),
    ("\\%", ["pourcent", "pour cent"]),
    ("\\vert", ["conditionnellement", "barre verticale"]),
])

### 7.2 Contrôle de séquence

In [18]:
# math cseq
math_cseq = {

    "(":["parenthese"],
    ")":["fermer la parenthese", "fermer parenthese", "parenthese fermante", "fin de parenthese", "fin parenthese"],

    "[":["crochet"],
    "]":["fermer le crochet", "fermer crochet", "crochet fermant", "fin de crochet", "fin crochet"],

    "\\{":["ensemble"],
    "\\}":["fermer l' ensemble", "fermer ensemble", "fin de l' ensemble", "fin ensemble"],

}


# tex cseq
tex_cseq = {

    "{":["accolade"],
    "}":["fermer l' accolade", "fermer accolade", "accolade fermante", "fin de l' accolade", "fin accolade"]
    
}

## 8. Raccourcis naturels

In [19]:
# Natural-language shortcuts that expand to a multi-token LaTeX expression.
natural_expr = {
    # --- sign conditions ---
    "\\geq 0": ["positif"],
    "> 0": ["strictement positif"],
    "\\leq 0": ["negatif"],
    "< 0": ["strictement negatif"],

    # --- definition ---
    ": =": ["egal par definition", "defini par"],

    # --- restricted number sets ---
    "\\mathbb { N } ^ *": ["entier strictement positif", "entier naturel non nul"],
    "\\mathbb { Z } ^ *": ["entier non nul", "relatif non nul"],
    "\\mathbb { R } ^ +": ["ensemble des reels positifs", "reel positif"],
    "\\mathbb { R } ^ -": ["ensemble des reels negatifs", "reel negatif"],
    "\\mathbb { R } ^ + _ *": [
        "ensemble des reels strictement positifs",
        "ensemble des reels positifs non nuls",
        "r plus etoile",
        "reel strictement positif",
        "reel positif non nul",
    ],
    "\\mathbb { R } ^ - _ *": [
        "ensemble des reels strictement negatifs",
        "ensemble des reels negatifs non nuls",
        "r moins etoile",
        "reel strictement negatif",
        "reel negatif non nul",
    ],

    # --- standard normal law N(0, 1) ---
    "\\mathcal { N } ( 0 , 1 )": [
        "loi normale centree reduite",
        "loi standard",
        "loi n zero un",
    ],
}

# Miscellaneous textual expressions.
others = {"\\text { resp. }": ["respectivement"]}

## 9. Mots clés de séquence

In [29]:
# Sequence keywords: symbolic key -> spoken forms. These are not LaTeX output;
# the rule-export cell leaves them out of math_rules.txt.
key_seq = dict([
    ("pour", ["pour"]),
    ("allant_de", ["allant de"]),
    ("à", ["à"]),
    ("de", ["de"]),
    ("sur", ["sur"]),
    ("tout_sur", ["le tout sur", "tout sur"]),
    ("facteur_de", ["facteur de"]),
    ("tout_facteur_de", ["le tout facteur de", "tout facteur de"]),
    ("chapeau", ["chapeau"]),
    ("tilde", ["tilde"]),
    ("barre", ["barre"]),
    ("vecteur", ["vecteur"]),
])

## 10. Enregistrement JSON

In [30]:
# Complete lexical grammar: category name -> {LaTeX token sequence: [spoken forms]}.
# Category order is preserved and determines the order of the exported rules.
tex_grammar = dict(
    # Variable names: the seven name tables merged in order; on a key clash the
    # later table's value wins.
    var_name={
        tex: spoken
        for table in (
            min_char,
            maj_char,
            cal_char,
            mbb_char,
            greek_min_vars,
            greek_maj_vars,
            greek_var_vars,
        )
        for tex, spoken in table.items()
    },
    # values
    digit=digit,
    infty=infty,
    sets=sets,
    # relations
    rel=rel,
    # operators
    quantif=quantif,
    un_op=un_op,
    bin_op=bin_op,
    # subscripts and superscripts
    subscript=subscript,
    supscript=supscript,
    # functions, iterators and symbols
    func_names=func_names,
    iter=iter,
    func_cseq=func_cseq,
    symb=symb,
    # delimiters
    math_cseq=math_cseq,
    tex_cseq=tex_cseq,
    # shortcuts and miscellaneous expressions
    natural_expr=natural_expr,
    other=others,
    # sequence keywords
    key_seq=key_seq,
)

In [31]:
# Save the whole grammar as 4-space-indented JSON.
with open('grammar/tex_grammar.json', mode='w', encoding=ENCODING) as file:
    file.write(json.dumps(tex_grammar, indent=4))

In [32]:
# Collect the distinct whitespace-separated tokens used on each side of the grammar:
#   tex_words — tokens appearing in the keys (LaTeX side),
#   seq_words — words appearing in the spoken forms.
tex_words = set()
seq_words = set()

for rules in tex_grammar.values():
    for command, spoken_forms in rules.items():
        tex_words.update(command.split())
        for spoken in spoken_forms:
            seq_words.update(spoken.split())


**Mots LaTeX pour les couches de transduction grammaticale :**

In [33]:
# Extra LaTeX commands for the grammatical transduction layers; none of them
# appears as a key of the lexical tables, so they are added to tex_words here.
spec_tex = ["\\limits", "\\tilde", "\\overline", "\\hat", "\\vec"]

tex_words.update(spec_tex)

In [34]:
# Save the LaTeX token inventory as JSON.
# tex_words is a set of strings, whose iteration order can differ from one
# kernel run to the next (string hashes are randomised per process). Sorting
# makes the saved file identical across runs.
dic_tex_words = {
    "tex_words": sorted(tex_words)
}

with open('tokens/tex_letters.json', 'w', encoding=ENCODING) as file:
    json.dump(dic_tex_words, file, indent=4)

In [35]:
# Flatten the grammar into two parallel lists: input[i] is a spoken form and
# output[i] the grammar key it maps to, in grammar order.
# NOTE: `input` shadows the Python builtin of the same name for the rest of the session.
input = []
output = []

for rules in tex_grammar.values():
    for tex_cmd, spoken_forms in rules.items():
        input.extend(spoken_forms)
        output.extend([tex_cmd] * len(spoken_forms))

header = "INPUT;OUTPUT;\n"

# Every rule, including the sequence keywords.
with open('rules/LexicalRules/math_rules_keyseq.txt', 'w', encoding=ENCODING) as writer:
    writer.write(header)
    writer.writelines(
        spoken + ";" + tex_cmd + ";\n"
        for spoken, tex_cmd in zip(input, output)
    )

# Same rules, minus those whose output is one of the key_seq keys.
with open('rules/LexicalRules/math_rules.txt', 'w', encoding=ENCODING) as writer:
    writer.write(header)
    writer.writelines(
        spoken + ";" + tex_cmd + ";\n"
        for spoken, tex_cmd in zip(input, output)
        if tex_cmd not in key_seq
    )

## 11. Vocabulaire

In [36]:
# Vocabulary on each side of the grammar:
#   input_set  — every word used in a spoken form,
#   output_set — every whitespace-separated token used in a grammar key.
input_set = set()
output_set = set()

for rules in tex_grammar.values():
    for tex_cmd, spoken_forms in rules.items():
        for spoken in spoken_forms:
            input_set.update(spoken.split())
        output_set.update(tex_cmd.split())

# Display the spoken-word vocabulary as the cell output.
input_set

{'0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'absolue',
 'accolade',
 'allant',
 'alpha',
 'apostrophe',
 'appartient',
 'arc',
 'arccos',
 'arccosinus',
 'arcsin',
 'arcsinus',
 'arctan',
 'arctangente',
 'arg',
 'argmax',
 'argmin',
 'avec',
 'b',
 'barre',
 'ber',
 'bernoulli',
 'beta',
 'bin',
 'binomial',
 'binomiale',
 'black',
 'board',
 'borne',
 'c',
 'calligraphique',
 'cent',
 'centree',
 'chapeau',
 'chi',
 'cinq',
 'coefficient',
 'complexe',
 'complexes',
 'conditionnellement',
 'converge',
 'cos',
 'cosinus',
 'crochet',
 'croix',
 'd',
 "d'",
 'dans',
 'de',
 'defini',
 'definition',
 'deg',
 'degre',
 'delta',
 'derivee',
 'des',
 'det',
 'determinant',
 'deux',
 'different',
 'dim',
 'dimension',
 'directe',
 'double',
 'e',
 'egal',
 'en',
 'ensemble',
 'entier',
 'entiers',
 'environ',
 'epsilon',
 'equivalent',
 'espace',
 'esperance',
 'et',
 'eta',
 'etoile',
 'exclamation',
 'existe',
 'exp',
 'exponentielle',
 'exposant',
 'f',
 'facteur',

In [37]:
# Save the spoken-word vocabulary as JSON.
# input_set is a set of strings, whose iteration order can differ from one
# kernel run to the next (string hashes are randomised per process). Sorting
# keeps the saved file, and any index derived from word positions, stable.
with open("vocab/vocab.json", 'w', encoding=ENCODING) as writer : 
    json.dump({"vocab": sorted(input_set)}, writer, indent=4)