Connected to torch_env (3.11.9) (Python 3.11.9)

In [None]:
import os
import torch
import torch.nn as nn
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchmetrics.text import WordErrorRate
import wandb

# ===============================
# 1️⃣ WandB
# ===============================
wandb.init(
    project="speech2text",
    name="transformer_asr_run",
    config={
        "lr": 1e-4,
        "batch_size": 8,
        "epochs": 3,
        "d_model": 256,
        "n_head": 8,
        "num_encoders": 3,
        "num_decoders": 3,
        "dropout": 0.1,
        "n_mels": 128,
        "sample_rate": 22050
    }
)
config = wandb.config

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\majid\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mmajidahmady[0m ([33mmajid_ahmady[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
class LJSpeechDataset(Dataset):
    def __init__(self, root, csv_file, transform=None, target_transform=None):
        self.data = pd.read_csv(os.path.join(root, csv_file), delimiter='|', names=['id', 'transcript', 'normalized_transcript', 'path'])
        self.transform = transform
        self.target_transform = target_transform

        # مثال: sos=2, eos=3
        self.sos = 2
        self.eos = 3

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        waveform, sr = torchaudio.load(self.data.iloc[idx]['path'])
        waveform = waveform.squeeze(0)
        transcript = self.data.iloc[idx]['normalized_transcript']
        if self.transform:
            waveform = self.transform(waveform)
        if self.target_transform:
            transcript = self.target_transform(transcript)
        transcript = [self.sos] + transcript + [self.eos]
        transcript = torch.LongTensor(transcript)
        return waveform, transcript

# ===============================
# 3️⃣ Collate_fn برای padding
# ===============================
def collate_fn(batch):
    x, y = zip(*batch)
    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0).unsqueeze(1)
    y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)
    return x, y

# ===============================
# 4️⃣ Transform برای صوت
# ===============================
transform = nn.Sequential(
    torchaudio.transforms.Resample(orig_freq=config.sample_rate, new_freq=16000),
    torchaudio.transforms.MelSpectrogram(n_mels=config.n_mels)
)

# ===============================
# 5️⃣ Target Transform ساده (مثال: حرف → عدد)
# ===============================
vocab = {c:i for i,c in enumerate("abcdefghijklmnopqrstuvwxyz '")}
def target_transform(text):
    return [vocab.get(c, 0) for c in text.lower()]

# ===============================
# 6️⃣ Dataset و DataLoader
# ===============================
root = "./Dataset-LJSpeech"
csv_file = "metadata.csv"
train_set = LJSpeechDataset(root=root, csv_file=csv_file, transform=transform, target_transform=target_transform)
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

# تست shape اولین batch
batch = next(iter(train_loader))
print(batch[0].shape, batch[1].shape)  # [B, 1, n_mels, T], [B, L]


# ===============================
# 3️⃣ مدل Transformer ساده
# ===============================
class DummyTransformerASR(nn.Module):
    def __init__(self, len_vocab, d_model, n_head, num_encoders, num_decoders, dropout):
        super().__init__()
        self.transform = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(),
        )
        self.encoder = nn.Linear(128, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=n_head,
                                          num_encoder_layers=num_encoders,
                                          num_decoder_layers=num_decoders,
                                          dropout=dropout)
        self.cls = nn.Linear(d_model, len_vocab)

    def forward(self, src, tgt):
        # src: [B, T] waveform
        src = self.transform(src)  # [B, n_mels, T']
        src = src.permute(2, 0, 1)  # [T', B, n_mels]
        src = self.encoder(src)
        tgt = tgt.permute(1, 0, 2)  # [L, B, D]
        out = self.transformer(src, tgt)
        out = out.permute(1, 0, 2)
        out = self.cls(out)
        return out

# فرض کنید vocab size = 50
len_vocab = 50
model = DummyTransformerASR(len_vocab=len_vocab, d_model=config.d_model, n_head=config.n_head,
                            num_encoders=config.num_encoders, num_decoders=config.num_decoders,
                            dropout=config.dropout).to(device)

# ===============================
# 4️⃣ Loss و Optimizer
# ===============================
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
metric = WordErrorRate().to(device)

# ===============================
# 5️⃣ حلقه آموزش (نمونه)
# ===============================
for epoch in range(config.epochs):
    model.train()
    for batch_idx, (waveforms, targets) in enumerate(train_loader):
        waveforms, targets = waveforms.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(waveforms, targets)
        loss = loss_fn(outputs.permute(0, 2, 1), targets[:, 1:])  # shift
        loss.backward()
        optimizer.step()
        
        # ===============================
        # 6️⃣ لاگ به WandB
        # ===============================
        wandb.log({"loss": loss.item(), "epoch": epoch+1})

        # WER به صورت نمونه (فرض تبدیل خروجی مدل به متن انجام شده)
        # generates, transcripts = postprocess(outputs, targets)
        # metric.update(generates, transcripts)
        # wer = metric.compute()
        # wandb.log({"WER": wer})

# ===============================
# 7️⃣ ذخیره مدل
# ===============================
torch.save(model.state_dict(), "transformer_asr.pth")
wandb.save("transformer_asr.pth")

# %%



FileNotFoundError: [Errno 2] No such file or directory: './Dataset-LJSpeech\\metadata.csv'

In [None]:
root = 'M:/Git/Sound2Text/Dataset-LJSpeech'

In [None]:
config = wandb.config

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
root = 'M:/Git/Sound2Text/Dataset-LJSpeech'

# ===============================
# 2️⃣ Dataset سفارشی
# ===============================
class LJSpeechDataset(Dataset):
    def __init__(self, root, csv_file, transform=None, target_transform=None):
        self.data = pd.read_csv(os.path.join(root, csv_file), delimiter='|', names=['id', 'transcript', 'normalized_transcript', 'path'])
        self.transform = transform
        self.target_transform = target_transform

        # مثال: sos=2, eos=3
        self.sos = 2
        self.eos = 3

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        waveform, sr = torchaudio.load(self.data.iloc[idx]['path'])
        waveform = waveform.squeeze(0)
        transcript = self.data.iloc[idx]['normalized_transcript']
        if self.transform:
            waveform = self.transform(waveform)
        if self.target_transform:
            transcript = self.target_transform(transcript)
        transcript = [self.sos] + transcript + [self.eos]
        transcript = torch.LongTensor(transcript)
        return waveform, transcript

# ===============================
# 3️⃣ Collate_fn برای padding
# ===============================
def collate_fn(batch):
    x, y = zip(*batch)
    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0).unsqueeze(1)
    y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)
    return x, y

# ===============================
# 4️⃣ Transform برای صوت
# ===============================
transform = nn.Sequential(
    torchaudio.transforms.Resample(orig_freq=config.sample_rate, new_freq=16000),
    torchaudio.transforms.MelSpectrogram(n_mels=config.n_mels)
)

# ===============================
# 5️⃣ Target Transform ساده (مثال: حرف → عدد)
# ===============================
vocab = {c:i for i,c in enumerate("abcdefghijklmnopqrstuvwxyz '")}
def target_transform(text):
    return [vocab.get(c, 0) for c in text.lower()]

# ===============================
# 6️⃣ Dataset و DataLoader
# ===============================
root = f'{root}/metadata.csv'# "./Dataset-LJSpeech"

csv_file = "metadata.csv"
train_set = LJSpeechDataset(root=root, csv_file=csv_file, transform=transform, target_transform=target_transform)
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

# تست shape اولین batch
batch = next(iter(train_loader))
print(batch[0].shape, batch[1].shape)  # [B, 1, n_mels, T], [B, L]

FileNotFoundError: [Errno 2] No such file or directory: 'M:/Git/Sound2Text/Dataset-LJSpeech/metadata.csv\\metadata.csv'

In [None]:
root = f'{root}'# "./Dataset-LJSpeech"

csv_file = "metadata.csv"
train_set = LJSpeechDataset(root=root, csv_file=csv_file, transform=transform, target_transform=target_transform)
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

# تست shape اولین batch
batch = next(iter(train_loader))
print(batch[0].shape, batch[1].shape)  # [B, 1, n_mels, T], [B, L]

FileNotFoundError: [Errno 2] No such file or directory: 'M:/Git/Sound2Text/Dataset-LJSpeech/metadata.csv\\metadata.csv'

In [None]:
root = 'M:/Git/Sound2Text/Dataset-LJSpeech'

In [None]:
csv_file = "metadata.csv"

In [None]:
train_set = LJSpeechDataset(root=root, csv_file=csv_file, transform=transform, target_transform=target_transform)

In [None]:
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# تست shape اولین batch
batch = next(iter(train_loader))
print(batch[0].shape, batch[1].shape)  # [B, 1, n_mels, T], [B, L]

TypeError: Invalid file: nan

In [None]:
batch = next(iter(train_loader))

TypeError: Invalid file: nan

In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1716c2e9890>

In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1716c2e9890>

In [None]:
batch = next(iter(train_loader))

TypeError: Invalid file: nan