In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import numpy as np
import pandas as pd
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
import librosa
import Levenshtein
from tqdm import tqdm

# ======== Настройки ========
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#DEVICE = torch.device("cpu")
print(f"Дивайс обучения: {DEVICE}")
MORSEALP = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ 1234567890#"
SAMPLE_RATE = 8000
N_MELS = 64
LAMBDA_REG = 1
TOME_MASK_PARAM = 5
FREQ_MASK_PARAM = 5
BATCH_SIZE = 16
EPOCHS = 10

def purified_signal_matrix(x_noisy:np.array, lambda_reg: int):
    n = len(x_noisy)
    
    def loss_function(x):
        # |x - x_noisy||2(Вторая норма) + λ *  sum(xi+1 - xi)
        x_norm = x - x_noisy
        second_norm = x_norm  @ x_norm.T
        
        reg = lambda_reg * np.sum(np.diff(x)**2)
        
        return second_norm + reg
    
    def gradient(x):
        main_grad = 2 * (x - x_noisy)
        
        reg_grad = np.zeros(n)
        reg_grad[:-1] += 2 * lambda_reg * (x[:-1] - x[1:])
        reg_grad[1:] += 2 * lambda_reg * (x[1:] - x[:-1])
        return main_grad + reg_grad
    
    x0 = x_noisy.copy()

    res = minimize(loss_function, x0, method='L-BFGS-B', jac=gradient)
    
    return res.x


def mel_augment(spec):
    time_masking = torchaudio.transforms.TimeMasking(time_mask_param=TOME_MASK_PARAM) 
    freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=FREQ_MASK_PARAM)

    spec_tensor = torch.from_numpy(spec) # torch.Size([64, 126])
    augment = time_masking(spec_tensor)
    augment = freq_masking(augment)

    return augment.numpy()


def collate_fn(batch):
    x = torch.stack([item[0] for item in batch])
    y = pad_sequence([item[1] for item in batch], batch_first=True, padding_value=0)
    y_len = torch.stack([item[2] for item in batch])
    return x, y, y_len


# ===== Dataset =====
class MorseDataset(Dataset):
    def __init__(self, audio_paths, messages, mode='train'):
        self.audio_paths = audio_paths
        self.messages = messages
        self.mode = mode
        self.lambda_reg = LAMBDA_REG
        self.sample_rate = SAMPLE_RATE
        self.n_mels = N_MELS

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        path = self.audio_paths[idx]
        message = self.messages[idx]

        audio, sr = librosa.load(path, sr=self.sample_rate)
        audio = purified_signal_matrix(audio, lambda_reg=self.lambda_reg)

        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=self.n_mels)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec = mel_augment(mel_spec)

        std = mel_spec.std()
        mel_spec = (mel_spec - mel_spec.mean()) / (std if std >= 1e-5 else 1e-5)
        mel_spec = torch.FloatTensor(mel_spec).unsqueeze(0)

        if self.mode == 'test':
            return mel_spec

        label = torch.LongTensor([MORSEALP.find(c) + 1 for c in message if MORSEALP.find(c) != -1])
        label_len = torch.LongTensor([len(label)])

        return mel_spec, label, label_len


# ===== Медель =====
class MorseNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(8), nn.MaxPool2d(2),
            nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(16), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(32), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(64), nn.MaxPool2d((2, 1))
        )
        self.conv_size = self.add_module

        self.size = self.get_size()

        
        self.rnn = nn.LSTM(input_size=self.size, hidden_size=128, num_layers=2,
                           bidirectional=True, batch_first=True)
        self.layer = nn.Linear(256, len(MORSEALP) + 1)

    def get_size(self):
        inp = torch.rand(1,1,64,128)
        with torch.no_grad():
            out = self.conv(inp)
        
        return out.size(1) * out.size(2)

    def forward(self, x):
        x = self.conv(x)
        x = nn.Softmax(x)
        b, c, h, t = x.size()
        x = x.permute(0, 3, 2, 1).reshape(b, t, h * c)
        x, h = self.rnn(x)
        
        return self.layer(x)


# === Обучение ===
def train(model, train_loader, val_loader, optimizer, criterion, epochs):
    model.to(DEVICE)
    for epoch in tqdm(range(epochs), desc="Training"):
        model.train()
        total_loss = 0
        for mel, labels, label_lens in train_loader:
            mel, labels, label_lens = mel.to(DEVICE), labels.to(DEVICE), label_lens.to(DEVICE)

            optimizer.zero_grad()

            output = model(mel)
            T = output.size(1)
            N = output.size(0)

            input_lens = torch.full(size=(N,), fill_value=T, dtype=torch.long).to(DEVICE)

            output = output.permute(1, 0, 2)  #(N, T, C) -> (T, N, C)
            flat_labels = torch.cat([labels[i, :label_lens[i]] for i in range(labels.size(0))])
            if T < label_lens.max().item():
                continue

            loss = criterion(output, flat_labels, input_lens, label_lens.view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        val_score = validate(model, val_loader)
        print(f"[Номер эпохи {epoch+1}]\nЛосс за эпоху: {total_loss:.4f}\nМетрика Лихтенштейна: {val_score:.4f}")

# ===== Валидация =====
def validate(model, val_loader):
    model.eval()
    distances = []
    with torch.no_grad():
        for mel, labels, label_lens in val_loader:
            mel = mel.to(DEVICE)
            log_probs = model(mel).log_softmax(2)
            decoded = decoder(log_probs)


            targets = []
            for i in range(labels.size(0)):
                true_chars = [MORSEALP[i - 1] for i in labels[i][:label_lens[i]] if i > 0]
                targets.append("".join(true_chars))

            # Расстояние Левенштейна
            for t, p in zip(targets, decoded):
                distance = Levenshtein.distance(t, p)
                norm_dist = distance / max(len(t), 1)
                distances.append(norm_dist)

    mean_distance = np.mean(distances)
    return mean_distance

# ===== Декодер =====
def decoder(log_mass):
    preds = torch.argmax(log_mass, dim=2).cpu().numpy()
    decoded = []
    for pred in preds:
        prev = -1
        tokens = []
        for p in pred:
            if p != prev and p != 0:
                tokens.append(MORSEALP[p - 1])
            prev = p
        decoded.append("".join(tokens))
    return decoded

Дивайс обучения: cuda


In [2]:
# ======== SETUP ========
audio_dir = Path(os.getcwd()) / 'morse_dataset' / 'morse_dataset'
train_data = pd.read_csv("train.csv")
val_data = pd.read_csv("test.csv")

train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

train_paths = [str(audio_dir / f"{id_}") for id_ in train_data['id']]
val_paths = [str(audio_dir / f"{id_}") for id_ in val_data['id']]

train_dataset = MorseDataset(train_paths, train_data['message'], mode='train')
val_dataset = MorseDataset(val_paths, val_data['message'], mode='val')

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

model = MorseNet().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CTCLoss(blank=0, zero_infinity=True)



In [3]:
for batch in train_dataset:
    tensor1_batch, tensor2_batch, tensor3_batch = batch
    print("tensor1_batch:", tensor1_batch) 
    print("tensor2_batch:", tensor2_batch)  
    print("tensor3_batch:", tensor3_batch)
    #print("tensor3_batch:", tensor4_batch.shape)
    break

tensor1_batch: tensor([[[-1.2132, -0.8879, -0.7573,  ..., -0.8810, -0.9047, -0.7022],
         [-1.2625, -1.0439, -0.8171,  ..., -0.9026, -0.8088, -0.6537],
         [-1.1210, -0.8837, -0.7991,  ..., -0.9774, -0.8377, -0.6510],
         ...,
         [-1.3600, -1.2079, -1.0604,  ..., -1.0058, -1.0339, -1.0167],
         [-1.4935, -1.3644, -1.2531,  ..., -1.1216, -1.0721, -1.0919],
         [-1.2546, -1.3095, -1.3762,  ..., -1.2617, -1.0958, -1.2398]]])
tensor2_batch: tensor([18,  4, 26, 36, 42, 22])
tensor3_batch: tensor([6])


In [8]:
for batch in train_loader:
    tensor1_batch, tensor2_batch, tensor3_batch = batch
    print("tensor1_batch:", tensor1_batch) 
    print("tensor2_batch:", tensor2_batch)  
    print("tensor3_batch:", tensor3_batch)
    #print("tensor3_batch:", tensor4_batch.shape)
    break

tensor1_batch: tensor([[[[-1.0541, -0.7825, -0.9030,  ..., -1.0322, -0.8811, -0.8301],
          [-1.2141, -0.9159, -0.9817,  ..., -0.9520, -0.8866, -0.9231],
          [-1.3195, -0.9746, -0.9710,  ..., -1.0299, -1.0051, -0.9503],
          ...,
          [-1.5111, -1.4181, -1.2645,  ..., -1.1768, -0.9005, -0.9363],
          [-1.8098, -1.6104, -1.3758,  ..., -1.2394, -0.9458, -0.9515],
          [-1.7699, -1.6016, -1.4929,  ..., -1.4902, -1.1644, -0.9715]]],


        [[[-1.1919, -0.9199, -0.8436,  ..., -0.5418, -0.5759, -0.2558],
          [-1.0464, -0.7832, -0.7768,  ..., -0.4769, -0.5402, -0.2700],
          [-1.0556, -0.8439, -0.7895,  ..., -0.4651, -0.4730, -0.2705],
          ...,
          [-1.4193, -1.2527, -0.9879,  ..., -1.2731, -1.1959, -0.9471],
          [-1.7506, -1.4554, -1.2752,  ..., -1.4613, -1.2385, -0.9420],
          [-2.0344, -1.5516, -1.3191,  ..., -1.5833, -1.2439, -0.9166]]],


        [[[-1.7822, -1.5881, -1.5404,  ..., -1.3062, -1.2927, -1.3855],
          [

In [3]:
train(model, train_loader, val_loader, optimizer, criterion, EPOCHS)

Training:  10%|█         | 1/10 [07:03<1:03:29, 423.30s/it]

[Номер эпохи 1]
Лосс за эпоху: 5403.5642
Метрика Лихтенштейна: 0.8575


Training:  20%|██        | 2/10 [12:25<48:31, 363.88s/it]  

[Номер эпохи 2]
Лосс за эпоху: 4341.7236
Метрика Лихтенштейна: 0.7018


Training:  30%|███       | 3/10 [18:06<41:14, 353.48s/it]

[Номер эпохи 3]
Лосс за эпоху: 3504.0994
Метрика Лихтенштейна: 0.5685


Training:  40%|████      | 4/10 [22:18<31:19, 313.32s/it]

[Номер эпохи 4]
Лосс за эпоху: 3014.3972
Метрика Лихтенштейна: 0.5059


Training:  50%|█████     | 5/10 [26:30<24:16, 291.31s/it]

[Номер эпохи 5]
Лосс за эпоху: 2749.2774
Метрика Лихтенштейна: 0.4888


Training:  60%|██████    | 6/10 [30:34<18:20, 275.06s/it]

[Номер эпохи 6]
Лосс за эпоху: 2561.2326
Метрика Лихтенштейна: 0.4616


Training:  70%|███████   | 7/10 [34:46<13:22, 267.57s/it]

[Номер эпохи 7]
Лосс за эпоху: 2414.6891
Метрика Лихтенштейна: 0.4652


Training:  80%|████████  | 8/10 [38:59<08:46, 263.11s/it]

[Номер эпохи 8]
Лосс за эпоху: 2302.3685
Метрика Лихтенштейна: 0.4450


Training:  90%|█████████ | 9/10 [43:11<04:19, 259.55s/it]

[Номер эпохи 9]
Лосс за эпоху: 2207.8196
Метрика Лихтенштейна: 0.4503


Training: 100%|██████████| 10/10 [47:23<00:00, 284.37s/it]

[Номер эпохи 10]
Лосс за эпоху: 2124.2111
Метрика Лихтенштейна: 0.4262





In [62]:
def collate_fn(batch):
    max_len = max([x[0].shape[2] for x in batch])

    inputs = []
    
    for x in batch:
        pad_amount = max_len - x[0].shape[2]
        padded = torch.nn.functional.pad(x[0], (0, pad_amount), "constant", 0)
        inputs.append(padded)
    
    inputs = torch.stack(inputs)
    
    return inputs

In [4]:
test_data = pd.read_csv('test.csv')
test_df = test_data.loc[0:4982]
test_paths = str(audio_dir) + '\\' + test_df['id'].to_numpy()
test_dataset = MorseDataset(test_paths, [""] * len(test_df), mode='test')
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
test_data

Unnamed: 0,id
0,30001.opus
1,30002.opus
2,30003.opus
3,30004.opus
4,30005.opus
...,...
4995,34996.opus
4996,34997.opus
4997,34998.opus
4998,34999.opus


In [74]:
for batch in test_dataset:
    tensor1_batch = batch
    print("tensor1_batch:", tensor1_batch) 
    #print("tensor3_batch:", tensor4_batch.shape)
    break

tensor1_batch: tensor([[[-0.7153, -0.3446, -0.4299,  ..., -0.5348, -0.4746, -0.6873],
         [-0.4614, -0.2721, -0.3467,  ..., -0.4737, -0.4239, -0.8345],
         [-0.5832, -0.2881, -0.2785,  ..., -0.3055, -0.3830, -0.7288],
         ...,
         [-1.7764, -1.7091, -1.7299,  ..., -1.6883, -1.5957, -1.7402],
         [-1.8427, -1.7542, -1.8486,  ..., -1.7320, -1.6841, -1.9052],
         [-1.9164, -1.7880, -1.8117,  ..., -1.7044, -1.7400, -1.8446]]])


In [75]:
for batch in test_loader:
    tensor1_batch = batch
    print("tensor1_batch:", tensor1_batch.shape) 
    #print("tensor3_batch:", tensor4_batch.shape)
    break

tensor1_batch: torch.Size([16, 1, 64, 126])


In [89]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA devices: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA devices: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3080


In [5]:
def print_memory_stats():
    print(f"Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
    print(f"Reserved: {torch.cuda.memory_reserved()/1024**2:.2f} MB")
    print(f"Max allocated: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")

print_memory_stats()

Allocated: 28.89 MB
Reserved: 136.00 MB
Max allocated: 94.15 MB


In [7]:
def inference(model,df = test_df, test_loader = test_loader):
    model.eval()
    predic = []
    torch.cuda.empty_cache()
    with torch.no_grad():

        for batch in tqdm(test_loader, desc="Вывод"):
            batch = batch.to(DEVICE)
            log_probs = model(batch).log_softmax(2)
            decoded = decoder(log_probs)
            predic.extend(decoded)

    df['message'] = predic
    # output_path = output_csv_path or test_csv
    # test_df.to_csv(output_path, index=False)
    print(f"Результаты записаны в {df}")

inference(model, test_loader = test_loader)

Вывод: 100%|██████████| 312/312 [01:09<00:00,  4.46it/s]

Результаты записаны в               id     message
0     30001.opus    ЯЮ8 КЖБШ
1     30002.opus     КЬ 0Ж 9
2     30003.opus     #ЬЭ461Я
3     30004.opus  ЖЖНЖ9РЫНЦ3
4     30005.opus      ЩЦ3ЮЧЬ
...          ...         ...
4978  34979.opus     ЮНЮ7ИЪЫ
4979  34980.opus   Ф7О634БТЫ
4980  34981.opus    ЛГ88Е 6Ъ
4981  34982.opus     П2БСА 8
4982  34983.opus    ЗСА0СЫНВ

[4983 rows x 2 columns]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = predic


In [10]:
test_df.to_csv('filename.csv', index=False)