# Speech-to-Text Project for Darija

Ideas:
- Get more data from Twitter, Facebook, etc. (e.g. DarijaBERT)
- Remove background noise, if any

## Import packages and datasets

In [2]:
import os
import pickle
import random
import numpy as np
import pandas as pd
import IPython.display as ipd
from datasets import ClassLabel, load_dataset, load_metric
from sklearn.model_selection import train_test_split
from IPython.display import Audio, Javascript, display, HTML, Image
import torchaudio
import torch

In [7]:
import sys
# Make the locally cloned `seamless_communication` package importable:
# append the folder that CONTAINS the cloned `seamless_communication` folder
# to the module search path. This has to happen before the import below.
sys.path.append('/Users/lailasalhi/Documents/AI4Humanitarian/') # to fill: machine-specific, replace with your own path

import seamless_communication

In [24]:
# Directory holding the train/test CSV files (read with a tab separator).
# to fill: adjust to where the data lives on your machine.
PATH = "../../data/darija/texts/"

# Build the file paths with os.path.join so the trailing "/" in PATH does not
# yield a doubled separator ("texts//train.csv").
train_df = pd.read_csv(os.path.join(PATH, "train.csv"), sep="\t")
test_df = pd.read_csv(os.path.join(PATH, "test.csv"), sep="\t")

In [22]:
# Show the "words" column of the training set as a one-column DataFrame.
print(train_df.loc[:, ["words"]])

                                                  words
0     النار سمعت واحد قال لصاحبه رخص يكون عندنا شويه...
1                     مرام راكان اسمع والدلائل شيء واحد
2                               ممكن نكون جاي في الطريق
3                                       احطه قدام الباب
4                     يدك الزخمات اللي طلبنا منك بطاريه
...                                                 ...
6822                                           مرض مزمن
6823                                       ليه خمسه هنا
6824           الملح الغير ماده اللي كان بها الشيخ زايد
6825  النار سمعت واحد قال لصاحبه رخص يكون عندنا شويه...
6826                         قال لنا انا ما بقيت شو اكل

[6827 rows x 1 columns]


## Finetuning

In [None]:
import argparse
import logging
import os
from pathlib import Path

import torch
from fairseq2.models.nllb.tokenizer import NllbTokenizer
from m4t_scripts.finetune import dataloader, dist_utils, trainer

from seamless_communication.models.unity import (
    UnitTokenizer,
    UnitYModel,
    load_unity_model,
    load_unity_text_tokenizer,
    load_unity_unit_tokenizer,
)

In [None]:
# Fine-tuning driver built on the `m4t_scripts.finetune` helpers
# (`dataloader`, `dist_utils`, `trainer`) imported in the previous cell.
#
# NOTE(review): `args` is not defined in any visible cell. It must exist
# before this cell runs (e.g. an `argparse.Namespace`) and expose:
#   model_name, mode, save_model_to, batch_size, patience, max_epochs,
#   learning_rate, warmup_steps, eval_steps, log_steps,
#   train_dataset, eval_dataset.

# Logger used for the progress messages below. INFO records are only shown
# once logging is configured, e.g. logging.basicConfig(level=logging.INFO).
logger = logging.getLogger("finetune")

device = torch.device("cuda")

# Text and unit tokenizers are loaded for the same model name as the model.
text_tokenizer: NllbTokenizer = load_unity_text_tokenizer(args.model_name)
unit_tokenizer: UnitTokenizer = load_unity_unit_tokenizer(args.model_name)

# The same batch size is used for training and evaluation.
finetune_params = trainer.FinetuneParams(
    finetune_mode=args.mode,
    save_model_path=args.save_model_to,
    device=device,
    train_batch_size=args.batch_size,
    eval_batch_size=args.batch_size,
    patience=args.patience,
    max_epochs=args.max_epochs,
    learning_rate=args.learning_rate,
    warmup_steps=args.warmup_steps,
    eval_steps=args.eval_steps,
    log_steps=args.log_steps,
)
logger.info(f"Finetune params: {finetune_params}")

# The model is loaded in half precision on the device chosen above.
model: UnitYModel = load_unity_model(
    args.model_name, device=finetune_params.device, dtype=torch.float16
)
logger.info(f"Model {model}")

# Sanity checks: the model and the tokenizers must agree on the padding
# index, and the text-to-unit sub-model must be present.
assert model.pad_idx == text_tokenizer.vocab_info.pad_idx
assert model.t2u_model is not None
assert model.t2u_model.pad_idx == unit_tokenizer.vocab_info.pad_idx

# Training data loader; rank/world size come from the distributed helpers.
train_dataloader = dataloader.UnitYDataLoader(
    text_tokenizer=text_tokenizer,
    unit_tokenizer=unit_tokenizer,
    batching_config=dataloader.BatchingConfig(
        batch_size=finetune_params.train_batch_size,
        rank=dist_utils.get_rank(),
        world_size=dist_utils.get_world_size(),
    ),
    dataset_manifest_path=args.train_dataset,
)

# Evaluation data loader, configured the same way on the eval manifest.
eval_dataloader = dataloader.UnitYDataLoader(
    text_tokenizer=text_tokenizer,
    unit_tokenizer=unit_tokenizer,
    batching_config=dataloader.BatchingConfig(
        batch_size=finetune_params.eval_batch_size,
        rank=dist_utils.get_rank(),
        world_size=dist_utils.get_world_size(),
    ),
    dataset_manifest_path=args.eval_dataset,
)

# Assemble the fine-tuning job and start it.
finetune = trainer.UnitYFinetune(
    model=model,
    params=finetune_params,
    train_data_loader=train_dataloader,
    eval_data_loader=eval_dataloader,
)
finetune.run()

## Test of Seamless

In [None]:
from seamless_communication.models.inference import Translator

# Build the Translator from the "seamlessM4T_large" multitask model and the
# "vocoder_36langs" vocoder, using the first CUDA device (cuda:0).
translator = Translator(
    "seamlessM4T_large",
    vocoder_name_or_card="vocoder_36langs",
    device=torch.device("cuda:0"),
)

## Test of Seamless with fine-tuning on DVoice

## Wav2Vec

## Whisper