# Math expressions - audio dataset

In [1]:
from datasets import Dataset
import torchaudio
import os
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## 1. Dataset loading

In [2]:
df = pd.read_csv("data_analysis.csv")
n = len(df)
df

Unnamed: 0,id,user_id,path,seq,tex,dna,WER_FST,WER_LEX_FST,y_seq_whisper_medium,time_whisper_medium,y_seq_whisper_base,time_whisper_base
0,line846_p2_dna,2,Data\audio\line846_p2_dna.wav,alpha indice zéro inférieur ou égal à trois pe...,\alpha _ 0 \leq \dots \leq \alpha _ N,True,0.333333,0.333333,alpha indice 0 inférieur ou égal à trois peti...,12.085763,Alpha indices row A ferie euro égal a 3 petit...,1.511340
1,line228_p3_part7,3,Data\audio\line228_p3_part7.wav,p indice grand s entre parenthèses u appartien...,p _ \mathbb { S } ( u ) \in \gamma _ 1 ^ R \cu...,False,0.380952,0.294118,P' U appartient à gamma'1'r union gamma'1'l,9.334133,P110 grand S entre parenthèses + U A partir u...,1.760738
2,line625_p2,2,Data\audio\line625_p2.wav,n supérieur ou égal à deux espace and espace d...,N \geq 2 \quad \text { and } \quad d \geq 3 \d...,False,0.375000,0.277778,n supérieur ou égal à 2 espace and espace d s...,17.151142,A une supérieure regale à deux espace ande es...,2.156214
3,line115_p3,3,Data\audio\line115_p3.wav,probabilité que b indice epsilon de b p barre ...,\mathbb { P } ( B _ \varepsilon ( \overline { ...,False,0.707692,0.564103,que b' de BP bar et Rho bar inclus dans P' BP...,12.054230,probabilité que B110 Epsilon de BP bar et au ...,2.532042
4,line820_p1_dna,1,Data\audio\line820_p1_dna.wav,supérieur ou égal à parenthèse delta indice gr...,\geq ( \delta _ P - 1 ) \alpha _ 0 + \alpha _ ...,True,0.000000,0.000000,super ou égal à parenthèse delta indice grand...,14.744848,Super rigole à la parenthèse d'alternes 10 gr...,2.286803
...,...,...,...,...,...,...,...,...,...,...,...,...
240,line776_p1_dna,1,Data\audio\line776_p1_dna.wav,degré indice alpha parenthèse grand f indice g...,\deg _ { \alpha } ( F _ H \circ g ) \geq ( d -...,True,0.138462,0.061224,degrés indice alpha parenthèse grand F indice...,34.464865,2 gré indices alpha parenthèses grnt f1 / grn...,5.300727
241,line228_p3_part3,3,Data\audio\line228_p3_part3.wav,u appartient à a chapeau indice l epsilon,u \in \hat { A } _ { L \varepsilon },False,0.727273,0.571429,lui appartient à chapeau en L' et Psi,8.849350,et lui appartient à Chapeau en Céline et de P...,1.263462
242,line363_p3,3,Data\audio\line363_p3.wav,grand d puissance rond entre parenthèses petit...,"D ^ \circ ( v , T _ { b + a } ) = - ( b + a ) ...",False,0.232558,0.064516,Grand D puissance rond entre parenthèses peti...,45.947263,Grand des puissances fournours par entesse pe...,31.158561
243,line223_p3,3,Data\audio\line223_p3.wav,grand t indice s puissance i égal l'inf entre ...,"T ^ i _ s = \inf \{ t \in [ a _ i , b _ i ] , ...",False,0.484848,0.407407,Grand T indice S puissance I = 1 Petit T appa...,19.887432,iquarement tranquillement,9.775116


In [3]:
import librosa

duration = []

for file_name in df['path']:
    duration.append(librosa.get_duration(filename=file_name))

	This alias will be removed in version 1.0.
  duration.append(librosa.get_duration(filename=file_name))


In [4]:
df['duration'] = duration

## 2. Prediction

In [5]:
from src.Score.Metrics import Metrics

### 2.1 Seq to LaTeX

In [6]:
from src.FST.transducers import * 

In [7]:
def remove_cseq(input_str):
    return input_str.replace('}', '').replace('{', '').replace('(', '').replace(')', '').replace('[', '').replace(']', '').replace('\\{', '').replace('\\}', '')

tex_lex = [remove_cseq(i) for i in df['tex']]

**Loading lexical transducer :**

In [8]:
math_fst = LexMathTransducer()

In [9]:
F_pred = [math_fst.predict(seq) for seq in df['seq']]
F_lex_pred = [remove_cseq(i) for i in F_pred]

In [10]:
F_pred

['\\alpha _ 0 \\leq \\dots \\leq \\alpha _ F',
 'p _ S ( u \\in \\gamma _ 1 ^ r \\cup \\gamma _ 1 ^ l',
 'n \\geq 2 \\quad \\quad d \\geq 3 \\delta \\quad ( d > 3 \\delta )',
 '\\mathbb { P } b _ \\epsilon b p \\wedge o _ \\infty \\subset p _ b p a \\infty _ l \\epsilon \\cup a \\infty _ l \\epsilon > 1 - \\delta',
 '\\geq ( \\delta _ P - 1 ) \\alpha _ 0 + \\alpha _ 1 + ( d - \\delta _ P ) \\alpha _ N',
 'd \\geq \\delta \\min ( N + 1 , s + 3 ) \\quad ( \\text { resp. } d > \\delta \\min ( N + 1 , s + 3 ) )',
 '\\geq \\sum p = 1 \\delta _ P - 1 \\alpha _ { j _ p } + \\alpha _ { j _ { \\delta _ P } } + ( d - \\delta _ P ) \\alpha _ N',
 '\\deg _ \\alpha ( F _ H \\circ g ) \\geq d \\alpha _ N > 0',
 '\\sum u = 1 N - s - 2 \\deg \\alpha f h \\circ g \\geq \\sum 1 \\leq u \\leq N - s - 2 \\wedge v = N - s - 1 - u \\alpha u + d - 1 \\alpha v',
 '\\deg _ \\alpha ( F _ H \\circ g ) \\geq d \\alpha _ N > 0',
 '= d - 2 \\delta p \\alpha n - \\delta _ P \\sum i = 1 N - 1 \\alpha i',
 '\\alpha _ 

**Word error rate :**

In [11]:
WER_F = [Metrics.WER(df['tex'][i], F_pred[i]) for i in range(len(df['tex']))]
WER_F_lex = [Metrics.WER(tex_lex[i], F_lex_pred[i]) for i in range(len(df['tex']))]

**Grammar Transducer** :

In [18]:
gram = LexGraOneLayerFST()

^
_
de
pour
=
à
de
pour
=
à
de
=
à
de
de
=
à
pour
allant_de
à
de
pour
allant_de
à
de
allant_de
à
de
de
allant_de
à
\int
de
à
de
\int
allant_de
à
de
pour
de
pour
\in
sur
\lVert
de
sur


In [20]:
Fgram_pred = [gram.predict(seq) for seq in df['seq']]
Fgram_lex_pred = [remove_cseq(i) for i in F_pred]

In [23]:


best_Fgram = []
for i, hyp_list in enumerate(Fgram_pred) :
    min = 100 
    for hyp in hyp_list :
        wer = Metrics.WER(df['tex'][i], hyp)
        if wer < min : 
            min = wer
    best_Fgram.append(min)



In [24]:
import numpy as np
np.mean(best_Fgram)

0.3664492583597023

In [25]:
df['Fgram_best'] = best_Fgram

In [27]:
np.mean(df.loc[df['dna'] == False, 'Fgram_best'])

0.4350543238011172

**Grammar4 :**

In [28]:
gram4 = LexGraMultiLayerFST()

chapeau
tilde
vecteur
^
_
barre
de
:
pour
=
à
de
pour
=
à
de
=
à
de
de
=
à
pour
allant_de
à
de
pour
allant_de
à
de
allant_de
à
de
de
allant_de
à
\int
de
à
de
\int
allant_de
à
de
pour
de
pour
\in
sur


In [29]:
Fgram4_pred = [gram4.predict(seq, nbest=5) for seq in df['seq']]
Fgram4_lex_pred = [remove_cseq(i) for i in F_pred]

best_Fgram4 = []
for i, hyp_list in enumerate(Fgram4_pred) :
    min = 100 
    for hyp in hyp_list :
        wer = Metrics.WER(df['tex'][i], hyp)
        if wer < min : 
            min = wer
    best_Fgram4.append(min)

In [32]:

np.mean(df.loc[df['dna'] == True, 'best_Fgram4'])

0.39793727876680185

### 2.2 Wav to Seq 

In [15]:
import numpy as np

from src.ASR.opaiWhisperModels import * 
from src.FST.transducers import *

In [13]:
# normaliseur sur les séquences
normalizer = Normalizer()

### Whisper medium

**Word error rate**

In [16]:
WER_whisper_medium = [Metrics.WER(normalizer.predict(df['seq'][i]), normalizer.predict(WhisperModel.normalize(df['y_seq_whisper_medium'][i]))) for i in range(len(df['id']))]

In [17]:
df["WER_whisper_medium"] = WER_whisper_medium
np.mean(WER_whisper_medium)

0.36572263015508727

In [142]:
y_seqs = [math_fst.predict(normalizer.predict(WhisperModel.normalize(df['y_seq_whisper_medium'][i]))) for i in range(len(df['id']))]

WER_whisper_medium_tex = [Metrics.WER(df['tex'][i], y_seqs[i]) for i in range(len(y_seqs))]

In [144]:
np.mean(WER_whisper_medium_tex)

0.5764142900915539

In [145]:
df['WER_whisper_tex'] = WER_whisper_medium_tex

In [148]:
np.mean(df.loc[df['dna'] == False, 'WER_whisper_tex'])

0.7045899475380032

In [149]:
y_seqs_pred_lex = [remove_cseq(y_seqs[i]) for i in range(len(y_seqs))]
y_tex_lex = [remove_cseq(df['tex'][i]) for i in range(len(y_seqs))]

WER_whisper_medium_tex_lex =  [Metrics.WER(y_tex_lex[i], y_seqs_pred_lex[i]) for i in range(len(y_seqs))] 

In [151]:
df['WER_whisper_tex_lex'] = WER_whisper_medium_tex_lex

In [153]:
np.mean(df.loc[df['dna'] == False, 'WER_whisper_tex_lex'])

0.6322554938391397

np.mean()

**Real time factor**

In [51]:
RTF_whisper_medium = np.array(df['time_whisper_medium'])/np.array(df['duration'])


1.2125899530576854

In [53]:
df["RTF_whisper_medium"] = RTF_whisper_medium
RTF_whisper_medium.mean()

1.2125899530576854

### Whisper base

**Word error rate :**

In [42]:
WER_whisper_base = [Metrics.WER(normalizer.predict(df['seq'][i]), normalizer.predict(WhisperModel.normalize(df['y_seq_whisper_base'][i]))) for i in range(len(df['id']))]
    

In [58]:
np.mean(WER_whisper_base)
df['WER_whisper_base'] = WER_whisper_base

**Real time factor :**

In [54]:
RTF_whisper_base = np.array(df['time_whisper_base'])/np.array(df['duration'])

In [56]:
df['RTF_whisper_base'] = RTF_whisper_base
np.mean(RTF_whisper_base)

0.34001544201872325

In [113]:
import requests

API_URL = "https://api-inference.huggingface.co/models/bigcode/starcoder"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": "Le code LaTeX associé à la phrase 'la somme pour i égal un à n de u indice i' est \\begin{equation}",
})