In [None]:
import os
from pathlib import Path

import pandas as pd
import torch
from torch.utils.data import Dataset
import torchaudio

class ParquetConcatDataset(Dataset):
    """Map-style dataset over the concatenation of all ``*.parquet`` files in a directory.

    Every parquet file directly inside ``parquet_dir`` is read eagerly and the
    frames are concatenated into one in-memory DataFrame (``self.df``). Each
    row references an audio file through its ``filepath`` column; the audio is
    loaded lazily with ``torchaudio.load`` when the row is indexed.

    Args:
        parquet_dir: Directory that is globbed (non-recursively) for
            ``*.parquet`` files.
        meta_dir: Base directory (``str`` or path-like) that each row's
            ``filepath`` value is resolved against. An empty string leaves
            ``filepath`` untouched.
        parse_fn: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If ``parquet_dir`` contains no ``*.parquet`` files.
    """

    def __init__(self, parquet_dir, meta_dir, parse_fn=None):
        self.parquet_dir = Path(parquet_dir)
        self.meta_dir = meta_dir

        # glob() yields files in filesystem-dependent order; sorting keeps the
        # row order (and therefore every dataset index) reproducible.
        self.files = sorted(self.parquet_dir.glob("*.parquet"))
        if not self.files:
            # pd.concat([]) would raise an opaque "No objects to concatenate".
            raise ValueError(f"No .parquet files found in {self.parquet_dir}")

        self.df = pd.concat(
            [pd.read_parquet(f) for f in self.files],
            ignore_index=True,
        )

    def __len__(self):
        """Return the total number of rows across all loaded parquet files."""
        return len(self.df)

    def _resolve_path(self, relative):
        """Join ``meta_dir`` and a row's ``filepath`` into a single ``str`` path."""
        base = os.fspath(self.meta_dir)
        relative = os.fspath(relative)
        if not base:
            return relative
        # Strip leading slashes so os.path.join does not discard ``base``;
        # this keeps "<base>/<relative>" regardless of how either side is
        # written (trailing slash on base, leading slash on relative).
        return os.path.join(base, relative.lstrip("/"))

    def __getitem__(self, idx):
        """Return one sample as a dict of metadata fields plus the loaded audio.

        ``"speech"`` holds whatever ``torchaudio.load`` returns for the file at
        ``"path"``; the remaining entries are copied from the row's columns.
        """
        row = self.df.iloc[idx]
        audio_path = self._resolve_path(row["filepath"])
        return {
            "path": audio_path,
            "speech": torchaudio.load(audio_path),
            "nisqa_mos": row["mos_pred"],
            "nisqa_noi": row["noi_pred"],
            "nisqa_dis": row["dis_pred"],
            "nisqa_col": row["col_pred"],
            "nisqa_loud": row["loud_pred"],
            "nisqa_model_name": row["model"],
            "is_single_speaker": row["is_single_speaker"],
            "text_with_stresses_and_punctuation": row["accent"],
            "rover_text": row["rover"],
            "text_with_punctuation": row["punct"],
            "phonemes": row["phonemes"],
        }



In [2]:
# The parquet metadata and the audio paths are both rooted at the same directory.
ds = ParquetConcatDataset(
    parquet_dir="/data/additional/borodin_sam/openstt_balalaika/",
    meta_dir="/data/additional/borodin_sam/openstt_balalaika/",
)

In [3]:
# Fetch the first sample to inspect the fields the dataset returns.
ds[0]



{'path': '/data/additional/borodin_sam/openstt_balalaika/asr_public_phone_calls_1/0/06/c0ccc52caeab.opus',
 'speech': (tensor([[ 0.0095,  0.0046,  0.0084,  ...,  0.0248,  0.0095, -0.0050]]),
  16000),
 'nisqa_mos': np.float64(4.532845),
 'nisqa_noi': np.float64(4.071508),
 'nisqa_dis': np.float64(4.7008386),
 'nisqa_col': np.float64(4.264933),
 'nisqa_loud': np.float64(4.276836),
 'nisqa_model_name': 'NISQAv2',
 'is_single_speaker': True,
 'text_with_stresses_and_punctuation': 'Н+у, теп+ерь теб+е ост+алось умн+ожить кол+ичество орг+азмов. Пост+авь рек+орд!',
 'rover_text': 'ну теперь тебе осталось умножить количество оргазмов поставь рекорд',
 'text_with_punctuation': 'Ну, теперь тебе осталось умножить количество оргазмов. Поставь рекорд!',
 'phonemes': 'n u   tʲ ɪ pʲ e rʲ   tʲ ɪ bʲ e   ɐ s t a ɫ ə sʲ   ʊ m n o ʐ ɨ tʲ   k ɐ lʲ i t͡ɕ ɪ s t v ə   ɐ r ɡ a z m ə f   p ɐ s t a fʲ   rʲ ɪ k o r t'}

In [17]:
# NOTE(review): read_audio_text_pairs is not defined in the visible cells and the
# execution count jumps from 3 to 17 — presumably it comes from a cell that is
# not shown; confirm this still works after Restart Kernel -> Run All.
temp = read_audio_text_pairs("/data/additional/DATASETS/audio/ruslan/metadata_RUSLAN_22200.csv")

In [18]:
# Display the first entry returned by read_audio_text_pairs.
temp[0]

('/data/additional/DATASETS/audio/ruslan/RUSLAN/000001_RUSLAN.wav',
 'Кого интересуют признания литературного неудачника?')