**Set up**

In [None]:
!pip install speechbrain transformers

In [None]:
from speechbrain.inference.ASR import EncoderASR
import os
import random
import re
import csv
import shutil
import pandas as pd

**Run inference with the SpeechBrain ASR model trained on the Moroccan dialect (Darija), using the test set of the DVoice dataset**

The dataset can be obtained from this link: https://zenodo.org/records/6342622
It contains one folder with the audio files of all sets, and another folder with three CSV files: test.csv, train.csv and dev.csv, each listing the filenames that belong to that set.
The first step is to extract the test-set files from the folder containing all the audio files and copy them to another folder.

In [None]:
def extract_filename(full_string):
    """Return the first whitespace-separated token of ``full_string`` that ends with '.wav'.

    Args:
        full_string: Text that may contain a .wav filename among other fields.

    Returns:
        The first matching token, or None when no token ends with '.wav'.
    """
    wav_tokens = (token for token in full_string.split() if token.endswith('.wav'))
    return next(wav_tokens, None)

def copy_files(csv_file, source_folder, target_folder):
    """Copy every .wav file listed in a CSV file from one folder to another.

    The first column of each non-empty row is scanned for a token ending in
    '.wav' (via ``extract_filename``); that file is copied from
    ``source_folder`` to ``target_folder``. Progress and problems are reported
    with ``print`` and never abort the loop.

    Args:
        csv_file: Path to the CSV file listing the audio files (read as UTF-8).
        source_folder: Folder that contains the audio files.
        target_folder: Destination folder; created if it does not exist.
    """
    # Without this, shutil.copy raises FileNotFoundError for a missing target
    # folder and the handler below would wrongly blame the source file.
    os.makedirs(target_folder, exist_ok=True)

    # newline='' is the documented way to hand a file to csv.reader.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        for row in csv.reader(file):
            if not row:
                continue

            full_string = row[0]  # file names are in the first column
            filename = extract_filename(full_string)
            if not filename:
                print(f"No .wav file found in '{full_string}'")
                continue

            source_path = os.path.join(source_folder, filename)
            target_path = os.path.join(target_folder, filename)
            try:
                shutil.copy(source_path, target_path)
                print(f"Successfully copied {filename} to {target_folder}")
            except FileNotFoundError:
                print(f"File {filename} not found in {source_folder}")
            except Exception as e:
                print(f"Error copying {filename}: {str(e)}")


# Placeholder paths: replace them with real locations before running this cell.
csv_file = 'path_to_test_csv' 
source_folder = 'path_to_the_folder_containing_audio_files'
target_folder = 'target where to copy the test audio files'   

# Copy the audio files listed in csv_file from source_folder into target_folder.
copy_files(csv_file, source_folder, target_folder)

Load ASR model

In [None]:
# Build the EncoderASR model from the "speechbrain/asr-wav2vec2-dvoice-darija" hparams source.
# savedir is the local directory passed for the model files
# (presumably fetched from the Hugging Face Hub on first use; confirm).
asr_model = EncoderASR.from_hparams(source="speechbrain/asr-wav2vec2-dvoice-darija", savedir="pretrained_models/asr-wav2vec2-dvoice-darija")

**Run inference on test set**

In [None]:
def run_inference_with_asr(csv_file, source_folder, target_folder):
    """Transcribe every .wav file listed in a CSV file with the global ``asr_model``.

    The first column of each non-empty row is scanned for a token ending in
    '.wav' (via ``extract_filename``); rows without one are skipped.

    Args:
        csv_file: Path to the CSV file listing the audio files (read as UTF-8).
        source_folder: Folder that contains the audio files to transcribe.
        target_folder: Unused; kept so existing callers keep working.

    Returns:
        A pandas DataFrame with columns 'file_names' and 'transcriptions',
        one row per transcribed file.
    """
    file_names = []
    transcriptions = []
    # newline='' is the documented way to hand a file to csv.reader.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        for row in csv.reader(file):
            if not row:
                continue
            filename = extract_filename(row[0])  # file names are in the first column
            if not filename:
                continue
            full_path = os.path.join(source_folder, filename)
            transcriptions.append(asr_model.transcribe_file(full_path))
            # Record the audio file name, not the open CSV file handle.
            file_names.append(filename)
    return pd.DataFrame({'file_names': file_names, 'transcriptions': transcriptions})

After analysing the ASR output, we selected the following error patterns (common to most ASR models):
- missing words
- missing characters
- repeated words
- false word recognition

In order to simulate these errors, we created the functions below, which introduce these error patterns into an input sentence. For false word recognition, a random word is selected and part of it (the first half of its characters) is replaced by a random combination of Arabic characters.

In [None]:
# Pool of Arabic characters that replace_with_random_chars samples from
# when it overwrites part of a word.
arabic_chars_list = ['ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي']


def remove_random_space(sentence):
    """Delete one randomly chosen space character from ``sentence``.

    Merging two neighbouring words this way imitates a missed word boundary.
    The sentence is returned unchanged when it contains no space.
    """
    space_positions = [pos for pos, ch in enumerate(sentence) if ch == ' ']
    if not space_positions:
        return sentence
    cut = random.choice(space_positions)
    return sentence[:cut] + sentence[cut + 1:]

def remove_random_words(sentence):
    """Drop one or two randomly chosen words from ``sentence``.

    Sentences with two words or fewer are returned unchanged. Otherwise the
    remaining words are re-joined with single spaces.

    Args:
        sentence: Input text; words are whitespace-separated.

    Returns:
        The sentence with exactly one or two words removed, or the original
        sentence when it has at most two words.
    """
    words = sentence.split()
    if len(words) <= 2:
        return sentence

    num_words_to_remove = random.randint(1, 2)
    # Sample positions, not word values: filtering by value would also delete
    # every duplicate of a chosen word (e.g. "x x x x" would become "").
    positions_to_remove = set(random.sample(range(len(words)), num_words_to_remove))
    return ' '.join(
        word for position, word in enumerate(words)
        if position not in positions_to_remove
    )

def remove_random_characters(sentence):
    """Remove one random character from one randomly chosen word.

    If the chosen word has a single character (or the sentence has no words)
    the sentence is returned unchanged, so a word is never deleted entirely.

    Args:
        sentence: Input text; words are whitespace-separated.

    Returns:
        The sentence with one character removed from one word, words
        re-joined with single spaces, or the original sentence when no
        change was made.
    """
    words = sentence.split()
    if not words:
        return sentence

    # Pick a position rather than a value so duplicates of the chosen word
    # elsewhere in the sentence are left untouched.
    word_index = random.randrange(len(words))
    word = words[word_index]
    if len(word) <= 1:
        return sentence

    char_index = random.randrange(len(word))
    words[word_index] = word[:char_index] + word[char_index + 1:]
    return ' '.join(words)

def repeat_random_word(sentence):
    """Duplicate one randomly chosen word right after its own position.

    Args:
        sentence: Input text; words are whitespace-separated.

    Returns:
        The sentence with one word repeated, words re-joined with single
        spaces, or the original sentence when it contains no words.
    """
    words = sentence.split()
    if not words:
        return sentence

    # Choose the position directly: looking the word up with list.index would
    # always land on its first occurrence, so a duplicated word picked at a
    # later position would be repeated in the wrong place.
    position = random.randrange(len(words))
    words.insert(position + 1, words[position])
    return ' '.join(words)

def replace_with_random_chars(sentence, chars=None):
    """Overwrite the first half of one random word with random characters.

    Simulates a falsely recognised word. Only the word at the chosen position
    is changed, even if the same word occurs elsewhere in the sentence. At
    least one character is always replaced, so single-character words are
    affected too.

    Args:
        sentence: Input text; words are whitespace-separated.
        chars: Optional sequence of replacement characters to sample from.
            Defaults to the module-level ``arabic_chars_list``.

    Returns:
        The modified sentence with words re-joined by single spaces, or the
        original sentence when it contains no words.
    """
    if chars is None:
        chars = arabic_chars_list

    words = sentence.split()
    if not words:
        return sentence

    # Work on a position, not a value, so duplicates are not all rewritten.
    position = random.randrange(len(words))
    word = words[position]
    # len(word) // 2 is 0 for a one-character word; replace at least one char.
    num_chars_to_replace = max(1, len(word) // 2)
    replacement = ''.join(random.choices(chars, k=num_chars_to_replace))
    words[position] = replacement + word[num_chars_to_replace:]
    return ' '.join(words)

def apply_random_functions(sentence):
    """Corrupt ``sentence`` with three distinct, randomly chosen error functions.

    Three of the five error-simulation functions are drawn without
    replacement and applied one after another, each receiving the output of
    the previous one.

    Returns:
        The sentence after the three transformations.
    """
    candidate_transforms = [
        replace_with_random_chars,
        remove_random_space,
        remove_random_words,
        repeat_random_word,
        remove_random_characters,
    ]
    corrupted = sentence
    for transform in random.sample(candidate_transforms, 3):
        corrupted = transform(corrupted)
    return corrupted

def split_and_apply(sentence):
    """Split a text into clauses and corrupt each clause independently.

    The text is split on ASCII punctuation (. ! ? , :) and on the Arabic
    comma, question mark and semicolon, so Arabic-script text is segmented
    the same way as Latin-punctuated text. Empty clauses are dropped, every
    remaining clause is stripped and passed to ``apply_random_functions``,
    and the results are joined with '. '.

    Args:
        sentence: The text to corrupt; may contain several sentences.

    Returns:
        The corrupted clauses joined by '. ' (the original punctuation marks
        are not preserved).
    """
    stripped_clauses = (clause.strip() for clause in re.split(r'[.!?,:،؟؛]', sentence))
    return '. '.join(
        apply_random_functions(clause) for clause in stripped_clauses if clause
    )


In [None]:
# Quick demonstration on a short Arabic paragraph. No random seed is set in the
# visible cells, so the printed result is expected to differ between runs.
input_text = "هذه جملة باللغة العربية. وهذه جملة أخرى تتبعها. وهذه ثالثة، تحتوي على كلمات متعددة. وأخيراً، هذه الجملة الأخيرة في النص."
modified_text = split_and_apply(input_text)
print(modified_text)

Load data

In [None]:
# Read the dataset from a CSV file in the notebook's working directory.
# The cell below reads its "Text" column.
morrocan_summarization_dataset = pd.read_csv("./darija19k.csv")

In [None]:
# Build an ASR-like noisy version of every "Text" entry and store it as a new column.
asr_like_outpt = [
    split_and_apply(text) for text in morrocan_summarization_dataset["Text"]
]
morrocan_summarization_dataset["asr_like_ouput"] = asr_like_outpt