In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("processing"), "..")))
import numpy as np
import spacy
from inflecteur import inflecteur
import pickle
import torch
import string
from transformers import WhisperProcessor, Wav2Vec2Processor
from datasets import Dataset, Audio, load_dataset, concatenate_datasets
from utils import det_words, irregular_nouns
from utils import MODEL_PATH, DATA_KEY, TEXT_KEY
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide configuration: seed, task/split selection, data locations and
# the per-model lookup tables used by the cells below.
SEED = 42
np.random.seed(SEED)

TASK = "common_voice"
SPLIT = "test"
GENERATED_IDS_PATH = f"./directory/predictions/{TASK}/{SPLIT}/"
# Derived from TASK/SPLIT like the other two paths (previously a hard-coded
# "common_voice/test" inside an f-string that had no placeholders).
ALIGNMENT_PATH = f"./directory/mfa/{TASK}/{SPLIT}/outputs/"
ANNOTATED_DATA_PATH = f"./directory/datasets/{TASK}/{SPLIT}/"

# MODEL_TYPE = "whisper-base"

# NOTE: MODEL_PATH, DATA_KEY and TEXT_KEY below rebind the names of the same
# spelling imported from `utils` in the first cell; from here on the notebook
# uses these local definitions.
MODEL_PATH = {
    # 'whisper-tiny': 'openai/whisper-tiny',
    'whisper-base': 'openai/whisper-base',
    'whisper-small': 'openai/whisper-small',
    # 'whisper-medium': 'openai/whisper-medium',
    'wav2vec2-large-xlsr-53-french': 'jonatasgrosman/wav2vec2-large-xlsr-53-french',
    # 'asr-wav2vec2-french': 'bhuang/asr-wav2vec2-french',
}

NUM_LAYERS = {
    # 'whisper-tiny': 4,
    'whisper-base': 6,
    'whisper-small': 12,
    # 'whisper-medium': 24,
    'wav2vec2-large-xlsr-53-french': 24,
    # 'asr-wav2vec2-french': 24,
}

DATA_KEY = {
    "common_voice": "mozilla-foundation/common_voice_11_0",
}
TEXT_KEY = {
    'common_voice': 'sentence',
}


def _load_processor(model_name):
    """Instantiate the processor matching `model_name` from MODEL_PATH.

    Names starting with "whisper" get a WhisperProcessor configured for French
    transcription; every other name gets a Wav2Vec2Processor.
    """
    if model_name.startswith('whisper'):
        return WhisperProcessor.from_pretrained(MODEL_PATH[model_name], task='transcribe', language='french')
    return Wav2Vec2Processor.from_pretrained(MODEL_PATH[model_name])


# One processor per enabled model, so this table can never drift out of sync
# with MODEL_PATH when a model is (un)commented above.
PROCESSOR = {model_name: _load_processor(model_name) for model_name in MODEL_PATH}



In [3]:
# Create the output directory; exist_ok makes this safe to re-run and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(ANNOTATED_DATA_PATH, exist_ok=True)

# French spaCy pipeline.
nlp = spacy.load("fr_core_news_md")

# NOTE: this rebinds the imported name `inflecteur` to the object returned by
# calling it, so the cell cannot simply be run twice in the same kernel without
# re-running the import cell first. The name is kept for backward compatibility.
inflecteur = inflecteur()
inflecteur.load_dict()

Loading	 dela-fr-public...
Done.


## Download the original dataset and load the incorrectly predicted sentences

In [4]:
# Fetch the 'fr' configuration of the dataset selected by TASK, restricted to SPLIT.
print('DOWNLOADING DATA')
org_data = load_dataset(
    DATA_KEY[TASK],
    'fr',
    split=SPLIT,
    verification_mode="all_checks",
)

DOWNLOADING DATA


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [5]:
# Number of examples to process; the alignment cell iterates over range(LENGTH).
# Alternative kept for reference (presumably for a quicker partial run — confirm):
# LENGTH = 5000
LENGTH = len(org_data)

In [9]:
print('CASTING FILES TO AUDIO')
# Re-declare the "audio" column as 16 kHz audio.
audio_feature = Audio(sampling_rate=16_000)
org_data = org_data.cast_column("audio", audio_feature)

# Read the pickled generated ids of every model listed in MODEL_PATH.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on prediction files produced by this project.
generated_ids = {}
for model_name in MODEL_PATH:
    ids_file = f'{GENERATED_IDS_PATH}{model_name}/generated_ids.pkl'
    with open(ids_file, 'rb') as fp:
        generated_ids[model_name] = pickle.load(fp)

CASTING FILES TO AUDIO


In [10]:
print("ALIGNING")


def _read_textgrid(path):
    """Parse one TextGrid file into a dict with total_start, total_end and intervals.

    The parser is positional: it reads the overall xmin/xmax from line indices
    3 and 4, the interval count from line index 13, and then one interval every
    four lines starting at line index 15 (xmin, xmax, text). Intervals whose
    text is empty are dropped.
    NOTE(review): this assumes the word tier is the first tier in the file —
    confirm against the aligner's output layout.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path, "r") as handle:
        lines = handle.readlines()

    total_min = float(lines[3].strip().split()[2])
    total_max = float(lines[4].strip().split()[2])
    num_intervals = int(lines[13].strip().split('=')[-1])

    intervals = []
    for it in range(num_intervals):
        base = 15 + it * 4
        xmin = float(lines[base].split("=")[-1].strip())
        xmax = float(lines[base + 1].split("=")[-1].strip())
        # Drop the surrounding quote characters of the text field.
        text = lines[base + 2].split("=")[-1].strip()[1:-1]
        if text != "":
            intervals.append({'start': xmin, 'end': xmax, 'word': text})
    return {'total_start': total_min, 'total_end': total_max, 'intervals': intervals}


file_ids = [int(f.split('.')[0]) for f in os.listdir(ALIGNMENT_PATH) if f.endswith('.TextGrid')]
# Set copy for O(1) membership tests inside the loop (file_ids itself stays a list).
aligned_ids = set(file_ids)

alignments = []
for ex in range(LENGTH):
    if ex in aligned_ids:
        alignments.append(_read_textgrid(f"{ALIGNMENT_PATH}{ex}.TextGrid"))
    else:
        # Placeholder row with the same keys as a parsed row, so the resulting
        # schema does not depend on whether example 0 happened to be aligned.
        alignments.append({'total_start': None, 'total_end': None, 'intervals': None})
alignments = Dataset.from_list(alignments)
alignments

ALIGNING


Dataset({
    features: ['total_start', 'total_end', 'intervals'],
    num_rows: 16089
})

In [12]:
import pandas as pd

# Load incorrect sentences.
# Paths are built from GENERATED_IDS_PATH so they follow TASK/SPLIT instead of
# repeating a hard-coded "common_voice/test" prefix.
df = pd.read_csv(f"{GENERATED_IDS_PATH}whisper-base/annotated_cue_target.csv")

# One annotated determiner–noun table per model listed in MODEL_PATH.
dfs = {
    model_name: pd.read_csv(f"{GENERATED_IDS_PATH}{model_name}/annotated_det_noun.csv")
    for model_name in MODEL_PATH
}

whisper-base
whisper-small
wav2vec2-large-xlsr-53-french


In [23]:
# Report how many annotated rows were loaded for each model in MODEL_PATH
# (replaces one copy-pasted print per hard-coded model name).
for model_name in MODEL_PATH:
    print(f'Number samples {model_name}: {len(dfs[model_name])}')

Number samples whisper-base: 8
Number samples whisper-small: 5
Number samples wav2vec2-large-xlsr-53-french: 40


In [68]:
# Whisper control tokens that always form a "word" of their own.
_WHISPER_CONTROL_TOKENS = ('<|fr|>', '<|transcribe|>', '<|notimestamps|>', '<|endoftext|>')


def _group_tokens_into_words(tokenizer, tokens):
    """Merge sub-word tokens into lower-cased words.

    Returns ``(words, token_to_word)`` as NumPy arrays: ``words[k]`` is the k-th
    word and ``token_to_word[i]`` is the index of the word token ``i`` belongs to.
    """
    words = []
    token_to_word = []
    for token in tokens:
        starts_word = (
            # The very first token always opens a word; without this guard a
            # leading token that matches none of the tests below would index
            # into an empty list.
            not words
            or token.startswith("Ġ")
            or token in _WHISPER_CONTROL_TOKENS
            # NOTE: substring tests against the punctuation string (unchanged
            # from the original logic).
            or token in string.punctuation
            or words[-1] in string.punctuation
        )
        if starts_word:
            if token.startswith("Ġ"):
                words.append(tokenizer.convert_tokens_to_string(token).strip().lower())
            else:
                words.append(token.lower())
        else:
            # Continuation piece: glue it onto the word in progress.
            words[-1] += tokenizer.convert_tokens_to_string(token).lower()
        token_to_word.append(len(words) - 1)
    return np.array(words, dtype=str), np.array(token_to_word, dtype=int)


def _pick_cue_and_target(cue_positions, target_positions, side):
    """Reduce candidate cue/target word positions to the pair that belongs together.

    Raises ValueError with an explicit message when no usable cue or target
    position exists.
    """
    if target_positions.size == 0:
        raise ValueError(f"target word not found among the {side} words")
    if cue_positions.size == 0:
        raise ValueError(f"cue word not found among the {side} words")

    # Several identical cues: keep the one closest to the first target
    # occurrence (argmin returns the first minimum, so the earliest cue wins ties).
    if cue_positions.size > 1:
        nearest = np.argmin(np.abs(cue_positions - target_positions[0]))
        cue_positions = cue_positions[nearest:nearest + 1]

    # The cue never comes after the target word.
    cue_positions = cue_positions[cue_positions < target_positions.max()]
    if cue_positions.size == 0:
        raise ValueError(f"no {side} cue occurrence precedes the target word")

    # Several targets: keep those after the cue, then drop the farthest ones
    # until the remaining positions span at most one step.
    target_positions = target_positions[target_positions > cue_positions.min()]
    while target_positions.max() - target_positions.min() > 1:
        target_positions = target_positions[target_positions != target_positions.max()]
    return cue_positions, target_positions


def find_indices(processor, ex, cue_word, target_true, target_pred, model_name):
    """Locate the cue and target words of example `ex` on the decoder and encoder side.

    Decoder side (only when `model_name` starts with "whisper-"): the generated
    token ids stored in the global `generated_ids[model_name][ex]` are grouped
    into words, and the token positions of `cue_word` and `target_pred` are
    looked up. For any other model name the decoder entries are None.

    Encoder side: `cue_word` and `target_true` are looked up among the words of
    the global `alignments[ex]['intervals']`.

    All word comparisons are case-insensitive.

    Returns:
        (cue_token_dec_indices, target_token_dec_indices,
         cue_word_enc_indices, target_word_enc_indices)
        The first two are dicts keyed by `model_name` (list of token positions,
        or None); the last two are lists of positions into the aligned intervals.

    Raises:
        ValueError: if a cue or target word cannot be found, or no cue
            occurrence precedes the target.
    """
    target_token_dec_indices = {}
    cue_token_dec_indices = {}

    if model_name.split('-')[0] == "whisper":
        tokens = processor.tokenizer.convert_ids_to_tokens(generated_ids[model_name][ex].tolist())
        generated_words, word_indices = _group_tokens_into_words(processor.tokenizer, tokens)

        cue_word_indices, target_word_indices = _pick_cue_and_target(
            np.flatnonzero(generated_words == cue_word.lower()),
            np.flatnonzero(generated_words == target_pred.lower()),
            "decoded",
        )

        if len(target_word_indices) > 1:
            print("multiple target words error")

        # np.isin instead of `==`: the selected word indices may hold more than
        # one entry, which cannot be compared element-wise with word_indices.
        target_token_dec_indices[model_name] = np.flatnonzero(np.isin(word_indices, target_word_indices)).tolist()
        cue_token_dec_indices[model_name] = np.flatnonzero(np.isin(word_indices, cue_word_indices)).tolist()
    else:
        # wav2vec based models do not have a decoder part
        target_token_dec_indices[model_name] = None
        cue_token_dec_indices[model_name] = None

    # Encoder side: fetch the example's intervals once, then match by word.
    aligned_enc_words = np.array(
        [interval['word'].lower() for interval in alignments[ex]['intervals']],
        dtype=str,
    )
    cue_word_enc_indices, target_word_enc_indices = _pick_cue_and_target(
        np.flatnonzero(aligned_enc_words == cue_word.lower()),
        np.flatnonzero(aligned_enc_words == target_true.lower()),
        "aligned",
    )

    return (
        cue_token_dec_indices,
        target_token_dec_indices,
        cue_word_enc_indices.tolist(),
        target_word_enc_indices.tolist(),
    )

In [69]:
# Show the characters in string.punctuation, the string find_indices tests tokens against.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [81]:
# Build, per model, the list of determiner–noun examples that will be saved.
det_noun_data = {model_name: [] for model_name in MODEL_PATH}

# Loop-invariant label values (each label currently just repeats its own name).
labels = {'number': 'number',
    'person': 'person',
    'tense': 'tense'}

# Read the text column once instead of materialising it again for every row.
sentences = org_data[TEXT_KEY[TASK]]

for model_name in MODEL_PATH:
    print(model_name)
    for _, row in dfs[model_name].iterrows():
        # `org_id` rather than `id`, which would shadow the builtin.
        org_id = int(row["id"])
        target_true = row['target_true']
        target_pred = row['target_pred']
        cue = row['cue']

        # Fetch the alignment row once; skip examples without an alignment.
        alignment = alignments[org_id]
        if alignment['intervals'] is None:
            continue

        sentence = sentences[org_id]
        print(sentence)

        # Strip one pair of surrounding double quotes.
        if sentence.startswith('"') and sentence.endswith('"'):
            sentence = sentence[1:-1]

        # Skip rows whose cue or target word was not found by the aligner.
        aligned_words = {interval['word'].lower() for interval in alignment['intervals']}
        if cue.lower() not in aligned_words or target_true.lower() not in aligned_words:
            continue

        cue_token_dec_indices, target_token_dec_indices, cue_word_enc_indices, target_word_enc_indices = find_indices(
            PROCESSOR[model_name], org_id, cue, target_true, target_pred, model_name
        )

        # Single row access, so the example is only fetched once for both fields.
        example = org_data[org_id]

        det_noun_data[model_name].append({
            'template': 'det_noun',
            'org_id': org_id,
            'text': sentence,
            'cue_word': cue,
            'target_word': target_pred,
            'target_word_2': None,
            'path': example['path'],
            'audio': example['audio'],
            'alignment': alignment,
            'target_indices': {'enc': target_word_enc_indices, 'dec': target_token_dec_indices},
            'cue_indices': {'enc': cue_word_enc_indices, 'dec': cue_token_dec_indices},
            'target_indices_2': None,
            'label_number': labels['number'],
            'label_person': labels['person'],
            'label_tense': labels['tense'],
        })



# org_data["sentence"]

whisper-base
Les chips tortillas sont l’apéritif le plus typique des cuisines tex-mex et mexicaine.
Mais la natures est toujours là, omniprésente.
Cependant, l'échange est annulé après que Reynolds ait échoué les tests médicaux des Colts.
Il laisse également de nombreuses lettres, des vies de saints, et des poèmes.
Le fonds d'origine inclut des manuscrits de Marc-Antoine Charpentier et de Jean-Jacques Rousseau.
Mais la fille veut des princesses et le garçon des monstres.
Les brise-lames sont les chevaux de frise des fortifications contre les tempêtes.
La vigne, avec ses nouveaux plans couvre de plus en plus de surface.
whisper-small
Les tarses sont rose carmin.
Mais la natures est toujours là, omniprésente.
Les habitants se nomment les Trouille-Bourreaux.
À repère égal, les crues qu'il signale sont donc moins graves.
En son sein, nous avons tous les mêmes droits et les mêmes devoirs.
wav2vec2-large-xlsr-53-french
Et quelques mariages.
Maximiliano Richeze et Roberto Ferrari sont les équ

## Creating homophony dataset

In [82]:
# det_noun_data = det_noun(org_data[TEXT_KEY[TASK]])




In [85]:
# Turn each model's collected examples into a Dataset, keep it in
# det_noun_data under the same key, and write it below ANNOTATED_DATA_PATH.
for model_name in MODEL_PATH:
    model_dataset = Dataset.from_list(det_noun_data[model_name])
    det_noun_data[model_name] = model_dataset
    model_dataset.save_to_disk(f"{ANNOTATED_DATA_PATH}{model_name}")

Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 1170.82 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 858.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 37/37 [00:00<00:00, 2105.97 examples/s]
