# 

# Vocabulaire et grammaire du langage TeX (fr)

Ce fichier permet la définition et la génération des fichiers de règles utilisées par les transducteurs : 

 - Affectation des éléments LaTeX selon les types
 - Définition d'une ou plusieurs séquences de mots associées en langage naturel

In [33]:
import pynini
import pandas as pd
import json

# Text encoding passed to the open() calls that write the generated files.
ENCODING = 'utf-8'

## 1. Noms de variables

### 1.1 Caractères

In [34]:
# Latin alphabet as lists of single-character strings: 'a'..'z' and 'A'..'Z'.
chars = list(map(chr, range(ord("a"), ord("z") + 1)))
Chars = list(map(chr, range(ord("A"), ord("Z") + 1)))

In [35]:
# Lowercase letters: each letter is spoken as itself.
min_char = {letter: [letter] for letter in chars}

# Uppercase letters: spoken as "grand" followed by the lowercase letter.
maj_char = {upper: ["grand " + upper.lower()] for upper in Chars}

# Calligraphic capitals (\mathcal { X }).
cal_char = {
    "\\mathcal { %s }" % upper: ["%s calligraphique" % upper.lower()]
    for upper in Chars
}

# Blackboard-bold capitals (\mathbb { X }); four accepted spoken forms each.
mbb_char = {
    "\\mathbb { %s }" % upper: [
        pattern % upper.lower()
        for pattern in (
            "%s double barre",
            "%s avec double barre",
            "double barre %s",
            "black board %s",
        )
    ]
    for upper in Chars
}



### 1.2 Lettres grecques

In [36]:
# Names of the lowercase Greek letters, in alphabet order.
greek_min_var_names = (
    "alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu "
    "nu xi pi rho sigma tau upsilon phi chi psi omega"
).split()

# Names of the uppercase Greek letters handled by the grammar.
# NOTE(review): Upsilon and Psi are not listed here -- confirm whether that
# omission is intentional.
greek_maj_var_names = "Gamma Delta Theta Lambda Xi Pi Sigma Phi Omega".split()

In [37]:
# Lowercase Greek letters: "\name" is spoken as "name".
greek_min_vars = {"\\" + name: [name] for name in greek_min_var_names}
# Letters that also accept an alternative spelling of the spoken form.
greek_min_vars.update({
    "\\xi": ["xi", "ksi"],
    "\\chi": ["chi", "khi"],
})

# Uppercase Greek letters: "\Name" is spoken as "grand name".
greek_maj_vars = {
    "\\" + name: ["grand " + name.lower()] for name in greek_maj_var_names
}
greek_maj_vars.update({"\\Xi": ["grand xi", "grand ksi"]})

# "var" variants of Greek letters, spoken as one word or as two.
greek_var_vars = {
    "\\var" + base: ["var" + base, "var " + base]
    for base in ("epsilon", "pi", "rho", "sigma")
}

# Differential elements ("dq" .. "dz"), built from the letters of `chars`
# that fall in the q..z range.
dvar = {
    "d" + letter: ["d" + letter]
    for letter in chars
    if letter in tuple("qrstuvwxyz")
}

## 2. Valeurs 

### 2.1 Chiffres

In [38]:
# Digits: each one maps to its French name and to the digit character itself.
digit = {
    str(value): [word, str(value)]
    for value, word in enumerate(
        ("zero", "un", "deux", "trois", "quatre",
         "cinq", "six", "sept", "huit", "neuf")
    )
}

### 2.2 Infini

In [39]:
# Infinity symbol and its spoken form.
infty = {"\\infty": ["infini"]}

### 2.3 Ensembles

In [40]:
# Usual number sets and their spoken forms.
sets = {
    # base sets
    "\\mathbb { N }": [
        "entiers naturels",
        "ensemble des entiers naturels",
        "ensemble grand n",
    ],
    "\\mathbb { Z }": [
        "entiers relatifs",
        "ensemble des entiers relatifs",
        "ensemble grand z",
    ],
    "\\mathbb { Q }": [
        "rationnels",
        "ensemble des rationnels",
        "ensemble rationnel",
        "ensemble grand q",
    ],
    "\\mathbb { R }": [
        "reel",
        "ensemble des reels",
        "ensemble reel",
        "ensemble grand r",
    ],
    "\\mathbb { C }": [
        "complexe",
        "ensemble des complexes",
        "ensemble complexe",
        "ensemble grand c",
    ],
    # sets restricted by sign and/or with zero removed
    "\\mathbb { N } ^ { * }": [
        "entier strictement positif",
        "entier naturel non nul",
    ],
    "\\mathbb { Z } ^ { * }": [
        "entier non nul",
        "relatif non nul",
    ],
    "\\mathbb { R } ^ { + }": [
        "ensemble des reels positifs",
        "reel positif",
    ],
    "\\mathbb { R } ^ { - }": [
        "ensemble des reels negatifs",
        "reel negatif",
    ],
    "\\mathbb { R } ^ { + } _ { * }": [
        "ensemble des reels strictement positifs",
        "ensemble des reels positifs non nuls",
        "r plus etoile",
        "reel strictement positif",
        "reel positif non nul",
    ],
    "\\mathbb { R } ^ { - } _ { * }": [
        "ensemble des reels strictement negatifs",
        "ensemble des reels negatifs non nuls",
        "r moins etoile",
        "reel strictement negatif",
        "reel negatif non nul",
    ],
}

## 3. Relations

In [41]:
# Relation symbols and their spoken forms.
rel = {
    # ordering
    ">": [
        "plus grand",
        "superieur",
        "plus grand que",
        "superieur à",
    ],
    "\\geq": [
        "plus grand ou egal",
        "superieur ou egal",
        "plus grand ou egal à",
        "superieur ou egal à",
    ],
    "<": [
        "plus petit",
        "inferieur",
        "plus petit que",
        "inferieur à",
    ],
    "\\leq": [
        "plus petit ou egal",
        "inferieur ou egal",
        "plus petit ou egal à",
        "inferieur ou egal à",
    ],
    # equality
    "=": ["egal", "egal à"],
    "\\neq": ["different", "different de"],
    "\\simeq": ["environ egal", "environ egal à", "isomorphe", "isomorphe à"],
    "\\propto": ["proportionnel", "proportionnel à"],
    "\\sim": ["suit"],
    # set membership and inclusion
    "\\in": ["appartient", "appartient à", "dans"],
    "\\notin": ["appartient pas", "appartient pas à"],
    "\\subset": ["inclu", "inclu dans"],
    "\\subseteq": ["inclu ou egal", "inclu ou egal à"],
    "\\not\\subset": [
        "pas inclu",
        "pas inclu dans",
        "non inclu",
        "non inclu dans",
    ],
    "\\not\\subseteq": [
        "pas inclu ou egal",
        "non inclu ou egal",
        "pas inclu ou egal à",
        "non inclu ou egal à",
    ],
    # convergence and equivalences
    "\\Rightarrow": ["implique"],
    "\\Leftrightarrow": ["equivalent"],
    "\\rightarrow": ["fleche"],
    "\\longrightarrow": ["longue fleche", "tend vers"],
    "\\overset { L } { \\longrightarrow }": ["converge en loi"],
    "\\overset { \\mathbb { P } } { \\longrightarrow }": ["converge en probabilite"],
    # spacing between expressions
    "\\quad": ["espace"],
}

## 4. Opérateurs

### 4.1 Quantificateurs logiques

In [42]:
# Logical quantifiers and their spoken forms.
quantif = {
    "\\forall": ["pour tout"],
    "\\exists": ["existe"],
    "\\exists !": [
        "existe un unique",
        "existe unique",
    ],
}

### 4.2 Opérateurs unaires

In [43]:
# Unary sign operators and their spoken forms.
un_op = {"+": ["plus"], "-": ["moins"]}

### 4.3 Opérateurs binaires

In [44]:
# Binary operators and their spoken forms.
bin_op = {
    "+": ["plus"],
    "-": ["moins"],
    ".": ["point", "scalaire"],
    "\\circ": ["rond"],
    "\\cup": ["union"],
    "\\cap": ["inter"],
    "\\times": ["croix"],
    "\\otimes": [
        "tenseur",
        "produit tensoriel",
        "croix rond",
    ],
    "\\oplus": [
        "somme directe",
        "plus rond",
    ],
    "\\setminus": [
        "prive",
        "prive de",
    ],
    "\\vee": ["ou"],
    "\\wedge": ["et"],
}

## 5. Indice et exposant

In [45]:
# Subscript marker and its spoken forms.
subscript = {"_": ["indice", "underscore"]}

# Superscript marker and its spoken forms.
supscript = {"^": ["exposant", "puissance", "supscript"]}

## 6. Fonctions et iterateurs

### 6.1 Fonctions usuelles

In [46]:
# Function-like commands. Every key ends with the "<apply>" grammar keyword
# so the spoken "de" that follows can be attached to the function.

# Single-variable functions.
func = {
    "\\deg <apply>": ["deg", "degre"],
    "\\dim <apply>": ["dim", "dimension"],
    "\\det <apply>": ["det", "determinant"],
    "\\exp <apply>": ["exponentielle", "exp"],
    "\\log <apply>": ["logarithme", "log"],
    "\\cos <apply>": ["cosinus", "cos"],
    "\\sin <apply>": ["sinus", "sin"],
    "\\arccos <apply>": ["arccosinus", "arc cosinus", "arccos", "arc cos"],
    "\\arcsin <apply>": ["arcsinus", "arc sinus", "arcsin", "arc sin"],
    "\\arctan <apply>": ["arctangente", "arc tangente", "arctan", "arc tan"],
    "\\mathrm { Re } <apply>": ["partie reelle"],
    "\\mathrm { Im } <apply>": ["partie imaginaire"],
}

# Special functions (handled with a different sequence control).
spec_func = {
    "\\frac <apply>": ["fraction", "rapport"],
    "\\sqrt <apply>": ["racine"],
    "\\binom <apply>": ["binomial", "coefficient binomial"],
    "\\lVert <apply>": ["norme"],
    "\\vert <apply>": ["valeur absolue"],
    "\\mathbb { E } <apply>": ["esperance"],
    "\\mathbb { P } <apply>": ["probabilite"],
    "\\mathbb { V } <apply>": ["variance"],
    "\\neg <apply>": ["non", "negation"],
    "\\partial <apply>": ["d rond", "derivee partielle"],
    "\\nabla <apply>": ["gradient"],
}

# Set functions.
set_func = {
    "\\min <apply>": ["min", "minimum"],
    "\\max <apply>": ["max", "maximum"],
    "\\mathrm { argmin } <apply>": ["argmin", "arg min", "argument min", "argument minimum"],
    "\\mathrm { argmax } <apply>": ["argmax", "arg max", "argument max", "argument maximum"],
    "\\sup <apply>": ["sup", "borne superieure"],
    "\\inf <apply>": ["inf", "borne inferieure"],
}

# One-parameter statistical distributions.
law1 = {
    "\\mathrm { Ber } <apply>": ["loi de bernoulli", "bernoulli", "ber"],
    "\\mathcal { G } <apply>": ["loi geometrique"],
    "\\mathcal { P } <apply>": ["loi de poisson", "loi poisson"],
    "\\mathcal { E } <apply>": ["loi exponentielle"],
    # Lowercase \chi: standard LaTeX defines no \Chi command, and the
    # chi-squared distribution is written with the lowercase letter.
    "\\chi ^ 2 <apply>": ["chi deux", "khi deux"],
}

# Two-parameter statistical distributions.
law2 = {
    "\\mathcal { N } <apply>": ["loi normale"],
    "\\mathrm { Bin } <apply>": ["loi binomiale", "bin"],
    # \mathrm, spelled like the Ber / Bin keys, so the emitted command exists.
    "\\mathrm { Gamma } <apply>": ["loi gamma"],
}


### 6.2 Itérateurs

In [47]:
# Iterated operators (limit, big operators, sum, product, integral).
# The table is named `iter`, which hides the builtin of the same name from
# this point on.
iter = {
    "\\lim": ["lim", "limite"],
    "\\bigotimes": ["le produit tensoriel"],
    "\\bigoplus": ["la somme directe"],
    "\\bigcup": ["l' union"],
    "\\bigcap": ["l' intersection"],
    "\\sum": ["somme"],
    "\\prod": ["produit"],
    "\\int": ["integrale"],
}

## 7. Symboles spéciaux

### 7.1 caractères spéciaux

In [48]:
# Punctuation and other special characters.
symb = {
    ":": ["deux points", "tel que"],
    "\\dots": ["trois points", "trois petits points"],
    ",": ["virgule"],
    "?": ["point d' interrogation"],
    "\\%": ["pourcent", "pour cent"],
    "\\vert": ["conditionnellement", "barre verticale"],
}

### 7.2 controle de sequence

In [49]:
# Math sequence-control delimiters: parentheses, brackets and set braces.
math_cseq = {
    "(": ["parenthese"],
    ")": [
        "fermer la parenthese",
        "fermer parenthese",
        "parenthese fermante",
        "fin de parenthese",
        "fin parenthese",
    ],
    "[": ["crochet"],
    "]": [
        "fermer le crochet",
        "fermer crochet",
        "crochet fermant",
        "fin de crochet",
        "fin crochet",
    ],
    "\\{": ["ensemble"],
    "\\}": [
        "fermer l' ensemble",
        "fermer ensemble",
        "fin de l' ensemble",
        "fin ensemble",
    ],
}

# TeX sequence-control delimiters: grouping braces.
tex_cseq = {
    "{": ["accolade"],
    "}": [
        "fermer l' accolade",
        "fermer accolade",
        "accolade fermante",
        "fin de l' accolade",
        "fin accolade",
    ],
}

## 8. Raccourcis naturels

In [55]:
# Natural-language shortcuts that expand to a multi-token TeX expression.
natural_expr = {
    # comparisons with zero
    "\\geq 0": ["positif"],
    "> 0": ["strictement positif"],
    "\\leq 0": ["negatif"],
    "< 0": ["strictement negatif"],
    # definition
    ": =": ["egal par definition", "defini par"],
    # powers
    "^ 2": ["carre"],
    "^ 3": ["cube"],
    # normal distribution with parameters 0, 1
    "\\mathcal { N } ( 0 , 1 )": [
        "loi normale centree reduite",
        "loi standard",
        "loi n zero un",
    ],
    "\\quad \\text { resp. } \\quad": ["respectivement"],
}

## 9. Mots clés pour l'analyse grammaticale

In [56]:
# Placeholder keywords consumed by the grammatical analysis.
grammar_keywords = {
    # variable decorations
    "<hat>": ["chapeau"],
    "<tilde>": ["tilde"],
    "<^*>": ["etoile"],
    "<vec>": ["vecteur"],
    "<'>": ["apostrophe", "prime"],
    # grammatical structure keywords
    "<underset>": ["pour"],
    "<from>": ["allant de"],
    "<to>": ["à"],
    "<apply>": ["de"],
    "<over>": ["sur"],
    "<fact>": ["facteur", "facteur de"],
    "<overline>": ["barre"],
    "<!>": ["point d' exclamation", "factoriel"],
    "<^T>": ["transpose"],
    # type-opening symbol (no spoken form)
    "{-{": [],
    # type-closing symbol (no spoken form)
    "}-}": [],
}

In [57]:
# Grammatical TeX symbols: none has a spoken form, so each one maps to its
# own empty list.
grammar_keys = {
    command: []
    for command in (
        "\\limits",
        "\\tilde",
        "\\overline",
        "\\hat",
        "\\vec",
        "\\left",
        "\\right",
        "\\rVert",
        "\\,",
    )
}

## 10. Enregistrement json

In [58]:
# Complete grammar: one {TeX string: [spoken forms]} table per category.
tex_grammar = {
    # variable names: every alphabet table merged in order; on a duplicate
    # key the later table's value replaces the earlier one
    "var": {
        tex: spoken
        for table in (
            min_char,
            maj_char,
            cal_char,
            mbb_char,
            greek_min_vars,
            greek_maj_vars,
            greek_var_vars,
        )
        for tex, spoken in table.items()
    },
    # differentials
    "dvar": dvar,
    # values
    "num": digit,
    "infty": infty,
    "set": sets,
    # relations
    "rel": rel,
    # operators
    "qtf": quantif,
    "uop": un_op,
    "bop": bin_op,
    # subscripts and superscripts
    "sub": subscript,
    "sup": supscript,
    # functions and applications
    "fun": func,
    "specfun": spec_func,
    "setfun": set_func,
    "law1": law1,
    "law2": law2,
    "iter": iter,
    # special characters
    "symb": symb,
    # sequence-control symbols
    "mseq": math_cseq,
    "tseq": tex_cseq,
    # natural expressions
    "naturalexpr": natural_expr,
    # elements not used by the lexical part
    "gkey": grammar_keys,
    # grammar keywords (to be removed from the output)
    "grammar_keywords": grammar_keywords,
}

In [60]:
# Export step: writes the grammar JSON, the TeX token list, the two lexical
# rule files and the spoken-word vocabulary.
import os

# Create the output directories up front so that open(..., 'w') does not fail
# with FileNotFoundError when a directory is missing.
for directory in ('grammar', 'tokens', 'rules/LexicalRules', 'vocab'):
    os.makedirs(directory, exist_ok=True)

# --- grammar ---------------------------------------------------------------
with open('grammar/tex_grammar.json', 'w', encoding=ENCODING) as file:
    json.dump(tex_grammar, file, indent=4)

# --- token inventories -----------------------------------------------------
# tex_words: whitespace-separated tokens of every TeX key.
# seq_words: whitespace-separated words of every spoken form.
tex_words = set()
seq_words = set()
for rules in tex_grammar.values():
    for rule, seqs in rules.items():
        tex_words.update(rule.split())
        for seq in seqs:
            seq_words.update(seq.split())

dic_tex_words = {
    # Sorted: iteration order of a set of str is not stable between
    # interpreter runs, which made the file content differ from run to run.
    "tex_words": sorted(tex_words)
}

with open('tokens/tex_letters.json', 'w', encoding=ENCODING) as file:
    json.dump(dic_tex_words, file, indent=4)

# --- lexical rules ---------------------------------------------------------
# Parallel lists: input[i] is a spoken form, output[i] the TeX string it maps
# to. The names hide the `input` builtin; they are kept because code outside
# this cell may read them.
input = []
output = []

for dic in tex_grammar.values():
    for tex, seqs in dic.items():
        for seq in seqs:
            input.append(seq)
            output.append(tex)

# All rules, grammar keywords included.
with open('rules/LexicalRules/math_rules_grammarkeys.txt', 'w', encoding=ENCODING) as writer:
    writer.write("INPUT;OUTPUT;\n")
    for seq, tex in zip(input, output):
        writer.write(seq + ";" + tex + ";\n")

# Same rules, minus those whose output is a grammar key or a grammar keyword.
with open('rules/LexicalRules/math_rules.txt', 'w', encoding=ENCODING) as writer:
    writer.write("INPUT;OUTPUT;\n")
    for seq, tex in zip(input, output):
        if tex not in grammar_keys and tex not in grammar_keywords:
            writer.write(seq + ";" + tex + ";\n")

# --- vocabulary ------------------------------------------------------------
# input_set: words of the spoken forms; output_set: tokens of the TeX keys.
input_set = set()
output_set = set()

for dic in tex_grammar.values():
    for tex, seqs in dic.items():
        for seq in seqs:
            input_set.update(seq.split())
        output_set.update(tex.split())

# Word -> integer id. Ids follow the sorted word order so that the same word
# gets the same id on every run.
with open("vocab/vocab.json", 'w', encoding=ENCODING) as writer:
    json.dump({word: i for i, word in enumerate(sorted(input_set))}, writer, indent=4)