In [1]:
import pandas as pd
from IPython.display import HTML
import whisper
import warnings
warnings.filterwarnings("ignore")
from jiwer import wer
import re
import time

In [2]:
# Read the sentence/speaker table and keep only the columns explored below.
df1 = pd.read_csv("../../Data/raw/other.tsv", sep="\t")[
    ["path", "sentence", "age", "gender", "accents"]
]

# Read the clip durations and rename the key column so it matches df1's "path".
df2 = pd.read_csv("../../Data/raw/clip_durations.tsv", sep="\t").rename(
    columns={'clip': 'path'}
)

# Join both tables on the clip file name (pandas' default inner join), then
# add a column holding the same file name with a .wav extension.
metadata = df1.merge(df2, on='path').assign(
    wav_path=lambda frame: frame['path'].str.replace('.mp3', '.wav', regex=False)
)

metadata

Unnamed: 0,path,sentence,age,gender,accents,duration[ms],wav_path
0,common_voice_en_40865211.mp3,"With this transition to the big time, the band...",,,United States English,5904,common_voice_en_40865211.wav
1,common_voice_en_40865212.mp3,Local brothels recruited extra staff to cope w...,,,United States English,5544,common_voice_en_40865212.wav
2,common_voice_en_40865213.mp3,"With Fox on lead vocals, the threesome did two...",,,United States English,5760,common_voice_en_40865213.wav
3,common_voice_en_40865214.mp3,Miramax requested cuts be made and Christopher...,,,United States English,5652,common_voice_en_40865214.wav
4,common_voice_en_40865215.mp3,The Key allows customers to buy Plusbus for th...,,,United States English,6120,common_voice_en_40865215.wav
...,...,...,...,...,...,...,...
18321,common_voice_en_41227190.mp3,Bolton's wife is named Liliana; they have two ...,fourties,female_feminine,Scottish English,9756,common_voice_en_41227190.wav
18322,common_voice_en_41227191.mp3,One report indicates that they formerly spoke ...,fourties,female_feminine,Scottish English,10296,common_voice_en_41227191.wav
18323,common_voice_en_41227192.mp3,He was fascinated by topics including photogra...,fourties,female_feminine,Scottish English,9576,common_voice_en_41227192.wav
18324,common_voice_en_41227193.mp3,Just some Galley Push-Ups.,fourties,female_feminine,Scottish English,5148,common_voice_en_41227193.wav


In [3]:
# Report how many values are missing in each metadata column of interest.
for label, column in (
    ("Sentence", "sentence"),
    ("Age", "age"),
    ("Gender", "gender"),
    ("Accents", "accents"),
    ("Duration", "duration[ms]"),
):
    print(f'{label} : {metadata[column].isna().sum()}')

Sentence : 0
Age : 6181
Gender : 7759
Accents : 4160
Duration : 0


In [4]:
# Distinct accent labels (NaN is listed as a value): preview the first three,
# then print how many there are in total.
accent_values = metadata['accents'].unique()
print(accent_values[:3])
print(len(accent_values))

['United States English' nan 'Australian English']
70


In [5]:
# Distinct age labels (NaN is listed as a value) and how many there are.
age_values = metadata['age'].unique()
print(age_values)
print(len(age_values))

[nan 'fourties' 'thirties' 'twenties' 'teens' 'sixties' 'fifties']
7


In [6]:
# Distinct gender labels (NaN is listed as a value) and how many there are.
gender_values = metadata['gender'].unique()
print(gender_values)
print(len(gender_values))

[nan 'male_masculine' 'female_feminine' 'non-binary' 'do_not_wish_to_say'
 'transgender']
6


In [7]:
# Export of the merged metadata to CSV; currently disabled.
# NOTE(review): this path uses lowercase "data/" while the inputs are read from
# "Data/raw" -- confirm the intended directory before re-enabling, since the
# two differ on case-sensitive filesystems.
# metadata.to_csv('../../data/processed/Common Voice.csv', index=False)

In [8]:
# Sample clip used by the audio player and the single-file transcription cells.
AUDIO_PATH = "../../clips.wav/common_voice_en_40865212.wav"

In [9]:
file_path = AUDIO_PATH

# Embed an HTML5 audio player for the sample clip. The clip is a .wav file, so
# the <source> element declares the matching MIME type (audio/wav) rather than
# audio/mpeg, which describes MP3 data.
HTML(f"""
<audio controls>
  <source src="{file_path}" type="audio/wav">
  Votre navigateur ne supporte pas la balise audio.
</audio>
""")


In [16]:
# Sanity check: transcribe the sample clip with the "base" checkpoint and
# print the recognised text.
model = whisper.load_model("base")
result = model.transcribe(AUDIO_PATH)
print(result["text"])

 Local brothels recruited extra staff to cope with the increase in business.


In [11]:
# Narrow the frame to the three columns used by the evaluation loop.
columns_to_keep = ["path", "sentence", "wav_path"]
metadata = metadata.loc[:, columns_to_keep]

metadata.head(5)

Unnamed: 0,path,sentence,wav_path
0,common_voice_en_40865211.mp3,"With this transition to the big time, the band...",common_voice_en_40865211.wav
1,common_voice_en_40865212.mp3,Local brothels recruited extra staff to cope w...,common_voice_en_40865212.wav
2,common_voice_en_40865213.mp3,"With Fox on lead vocals, the threesome did two...",common_voice_en_40865213.wav
3,common_voice_en_40865214.mp3,Miramax requested cuts be made and Christopher...,common_voice_en_40865214.wav
4,common_voice_en_40865215.mp3,The Key allows customers to buy Plusbus for th...,common_voice_en_40865215.wav


In [12]:
# Toy example: score a deliberately imperfect hypothesis against a reference
# sentence with jiwer's `wer` to see what the metric returns.
reference = "Evaluons sa capacité à transcrire correctement une phrase."
hypothesis = "Evaluons sa capacité à transcrir correctement une phrases complexe"

# `wer` takes the reference first, then the hypothesis; shown as a percentage.
score = wer(reference, hypothesis)
print(f"WER: {score:.2%}")


WER: 37.50%


In [13]:
# Load one instance of every Whisper checkpoint that is compared below.
model_tiny = whisper.load_model("tiny")
model_base = whisper.load_model("base")
model_small = whisper.load_model("small")
model_medium = whisper.load_model("medium")
model_turbo = whisper.load_model("turbo")
# The "large" checkpoint gets its own name; it used to be assigned to
# `model_turbo`, which silently replaced the turbo model loaded just above.
model_large = whisper.load_model("large")

# Checkpoint names, in the order in which they are benchmarked.
list_models = ["tiny", "base", "small", "medium", "turbo","large"]


In [17]:
# Number of clips, taken from the top of `metadata`, that each model is scored on.
N_EVAL_CLIPS = 30


def _normalize_text(text):
    """Lower-case `text` and remove every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', text.lower())


# For each checkpoint: decode the first N_EVAL_CLIPS clips, then print the mean
# per-clip WER and the mean wall-clock time spent on one clip.
for model_name in list_models:
    print(f"Model: {model_name}")
    list_score = []
    list_duration = []

    # Load the checkpoint once per model; it does not change between clips, so
    # reloading it for every clip only wasted time.
    model = whisper.load_model(model_name)

    for i in range(N_EVAL_CLIPS):
        # perf_counter is a monotonic clock intended for measuring intervals.
        start = time.perf_counter()

        # The timed section covers audio loading, padding/trimming, feature
        # extraction and decoding.
        audio = whisper.load_audio(f'../../clips.wav/{metadata["wav_path"].iloc[i]}')
        audio = whisper.pad_or_trim(audio)

        # The number of mel bins is read from the model's own dimensions.
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)

        end = time.perf_counter()
        duration = end - start

        # Apply the same normalisation to both sides so that case and
        # punctuation differences are not counted as word errors.
        reference = _normalize_text(metadata["sentence"].iloc[i])
        hypothesis = _normalize_text(result.text)

        score = wer(reference, hypothesis)

        list_score.append(score)
        list_duration.append(duration)

    # Unweighted means over the evaluated clips.
    moyenne_duration = sum(list_duration) / len(list_duration)
    moyenne_score = sum(list_score) / len(list_score)
    print(f"WER moyen: {moyenne_score:.2%}")
    print(f"Durée moyenne de transcription : {moyenne_duration:.2f} secondes")
    



Model: tiny
WER moyen: 14.85%
Durée moyenne de transcription : 2.80 secondes
Model: base
WER moyen: 11.53%
Durée moyenne de transcription : 5.06 secondes
Model: small
WER moyen: 5.88%
Durée moyenne de transcription : 13.82 secondes
Model: medium
WER moyen: 5.97%
Durée moyenne de transcription : 37.38 secondes
Model: turbo
WER moyen: 2.75%
Durée moyenne de transcription : 62.51 secondes
Model: large
WER moyen: 4.55%
Durée moyenne de transcription : 77.54 secondes
