In [None]:
!unzip -q "/content/audio.zip" -d /content/audio

In [None]:
# Directory searched for "<name_files>.wav" recordings.
# NOTE(review): the nested content/audio suffix presumably mirrors the folder
# layout stored inside audio.zip — confirm after extraction.
audio_dir = "/content/audio/content/audio"
# Metadata table; the code below reads its `emotion` and `name_files` columns.
csv_path  = "/content/df_IEMOCAP_with_text.csv"

In [None]:
import os, glob
import pandas as pd
from sklearn.model_selection import train_test_split

# Emotion codes kept for the spectrogram baseline; ids follow alphabetical order.
target = {"ang","dis","fea","hap","sad","sur", "neu"}
label_map = dict(zip(sorted(target), range(len(target))))

# Read the metadata and drop rows whose emotion is outside the target set.
df = pd.read_csv(csv_path)
keep_mask = df['emotion'].isin(target)
df = df[keep_mask].reset_index(drop=True)

# Pair every row with its wav file; rows without a file on disk are skipped.
file_paths, labels = [], []
for stem, emotion in zip(df['name_files'], df['emotion']):
    wav_fp = os.path.join(audio_dir, f"{stem}.wav")
    if not os.path.isfile(wav_fp):
        continue
    file_paths.append(wav_fp)
    labels.append(label_map[emotion])

print(f"Найдено {len(file_paths)} файлов для обучения")

In [None]:
# Hold out 20% of the collected files for validation. The split is stratified
# on the label ids and uses a fixed random_state so it is repeatable.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    file_paths, labels, test_size=0.2, stratify=labels, random_state=42
)
print("Train/Val:", len(train_paths), "/", len(val_paths))

In [None]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Resample, MelSpectrogram, AmplitudeToDB

class SimpleAudioDataset(Dataset):
    """Dataset yielding ``(log-mel spectrogram, label)`` pairs read from audio files.

    Args:
        paths: sequence of audio file paths.
        labels: sequence of labels aligned with ``paths``.
        sr: sample rate every waveform is brought to before the mel transform.
        n_mels: number of mel bands passed to ``MelSpectrogram``.
    """

    def __init__(self, paths, labels, sr=16000, n_mels=64):
        self.paths, self.labels = paths, labels
        self.sr = sr
        # Kept under its original attribute name; it also seeds the per-rate
        # cache so 48 kHz input reuses this instance.
        self.resampler = Resample(orig_freq=48000, new_freq=sr)
        self._resamplers = {48000: self.resampler}
        self.melspec   = MelSpectrogram(sample_rate=sr, n_mels=n_mels)
        self.db        = AmplitudeToDB()

    def __len__(self):
        return len(self.paths)

    def _resampler_for(self, orig_sr):
        """Return (building and caching on first use) a resampler from ``orig_sr`` to ``self.sr``."""
        resampler = self._resamplers.get(orig_sr)
        if resampler is None:
            resampler = Resample(orig_freq=orig_sr, new_freq=self.sr)
            self._resamplers[orig_sr] = resampler
        return resampler

    def __getitem__(self, i):
        """Return ``(mel_db, label)`` for item ``i``.

        If the file cannot be loaded, the following items are tried in turn
        (wrapping around) and the first loadable one is returned together with
        its own label.

        Raises:
            IndexError: if ``i`` is out of range.
            RuntimeError: if no file in the dataset could be loaded.
        """
        idx = i
        # Bounded retry: at most one attempt per item instead of unbounded recursion.
        for _ in range(max(len(self), 1)):
            path = self.paths[idx]  # an out-of-range index raises IndexError here
            try:
                wav, file_sr = torchaudio.load(path)
            except Exception:
                idx = (idx + 1) % len(self)
                continue
            # Resample from the file's actual rate to the configured target rate.
            if file_sr != self.sr:
                wav = self._resampler_for(file_sr)(wav)
            # Mix multi-channel audio down to a single channel.
            if wav.size(0) > 1:
                wav = wav.mean(0, keepdim=True)
            mel_db = self.db(self.melspec(wav)).squeeze(0)
            return mel_db, self.labels[idx]
        raise RuntimeError(f"none of the {len(self)} audio files could be loaded")

def collate_fn(batch):
    """Collate ``(spectrogram, label)`` pairs into one padded batch.

    Each spectrogram is zero-padded on the right of its last axis up to the
    widest item in the batch, then all are stacked.

    Returns:
        Tuple ``(stacked spectrograms, label tensor)``.
    """
    specs, labs = zip(*batch)
    widths = [spec.shape[-1] for spec in specs]
    widest = max(widths)
    padded = []
    for spec, width in zip(specs, widths):
        padded.append(torch.nn.functional.pad(spec, (0, widest - width)))
    return torch.stack(padded), torch.tensor(labs)

# Options common to both loaders; only shuffling differs between the splits.
_loader_opts = dict(batch_size=16, num_workers=0, collate_fn=collate_fn)

train_set = SimpleAudioDataset(train_paths, train_labels)
val_set = SimpleAudioDataset(val_paths, val_labels)

# Training batches are shuffled; validation keeps file order.
train_loader = DataLoader(train_set, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_set, shuffle=False, **_loader_opts)

In [None]:
# Pull one batch from the training loader to sanity-check the shapes produced
# by collate_fn before training.
batch_mel, batch_lbl = next(iter(train_loader))
print("Batch mel:", batch_mel.shape, "Labels:", batch_lbl.shape)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MelCNN(nn.Module):
    """Small 2-D CNN classifier over spectrogram batches.

    Two conv/ReLU/max-pool stages are followed by global average pooling and a
    two-layer MLP head that produces ``n_classes`` logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Build the conv stages in channel order (1 -> 16 -> 32).
        stages = []
        for c_in, c_out in ((1, 16), (16, 32)):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d((2,2)))
        # Collapse the remaining spatial grid to one value per channel.
        stages.append(nn.AdaptiveAvgPool2d((1,1)))
        stages.append(nn.Flatten())
        self.features = nn.Sequential(*stages)

        head = [nn.Linear(32, 128), nn.ReLU(), nn.Linear(128, n_classes)]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Insert a channel axis at dim 1, extract features, and return the logits."""
        with_channel = torch.unsqueeze(x, 1)
        pooled = self.features(with_channel)
        logits = self.classifier(pooled)
        return logits


In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the output layer from label_map (built from the target emotion set)
# so the class count cannot drift from the label encoding.
model = MelCNN(n_classes=len(label_map)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [None]:
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score

def evaluate(loader):
    """Score the global ``model`` on every batch of ``loader``.

    Puts the model in eval mode, collects arg-max predictions without tracking
    gradients, and returns ``(accuracy, macro-F1)``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for mel, y in loader:
            batch_pred = model(mel.to(device)).argmax(dim=1)
            predicted.extend(batch_pred.cpu().tolist())
            expected.extend(y.cpu().tolist())
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average="macro")
    return accuracy, macro_f1

# Number of full passes over the training loader.
epochs = 5
for ep in range(1, epochs+1):
    # evaluate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {ep}")
    for X, y in loop:
        X, y = X.to(device), y.to(device)
        logits = model(X)
        loss = F.cross_entropy(logits, y)
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the update, so each batch starts from zero.
        optimizer.zero_grad()
        # Show the current batch loss in the progress bar.
        loop.set_postfix(loss=f"{loss.item():.3f}")
    # Validation accuracy and macro-F1 once per epoch.
    acc, f1 = evaluate(val_loader)
    print(f"→ Ep{ep}: Val Acc={acc:.3f}, Macro-F1={f1:.3f}")

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Metadata table and audio folder for this experiment.
CSV_ALL   = "/content/drive/MyDrive/df_IEMOCAP_with_text.csv"
AUDIO_DIR = "/content/drive/MyDrive/iemocap_audio"

df = pd.read_csv(CSV_ALL)

# Emotion classes kept for this experiment.
target = ["ang", "dis", "fea", "hap", "sad", "sur"]
df = df[df["emotion"].isin(target)].reset_index(drop=True)

# Stratified 80/20 split with a fixed seed.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["emotion"],
    random_state=42
)


def _wav_path(fn):
    """Return the path of the recording whose base name is ``fn`` inside AUDIO_DIR."""
    return os.path.join(AUDIO_DIR, fn + ".wav")


# assign() returns new frames, so the column is never written into an object
# derived from `df` (which can trigger pandas' SettingWithCopyWarning).
train_df = train_df.assign(path_to_wav=train_df["name_files"].apply(_wav_path))
val_df = val_df.assign(path_to_wav=val_df["name_files"].apply(_wav_path))

# Report, per split, how many paths exist on disk (True) and how many do not (False).
for split_df, name in [(train_df, "train"), (val_df, "validation")]:
    found_counts = split_df["path_to_wav"].apply(os.path.exists).value_counts()
    print(f"{name} — файлы найдены? \n{found_counts}\n")

ds = DatasetDict({
    "train":      Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df.reset_index(drop=True)),
})

print(ds)
print("Пример записи:", ds["train"][0])


In [None]:
# Keep only rows whose wav file exists on disk, then rebuild the DatasetDict
# from the filtered frames.
train_df = train_df[train_df["path_to_wav"].apply(os.path.exists)].reset_index(drop=True)
val_df   = val_df[  val_df["path_to_wav"].apply(os.path.exists)].reset_index(drop=True)

ds = DatasetDict({
    "train":      Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
})

In [None]:
from tqdm.auto import tqdm
import librosa, os
from datasets import Dataset, DatasetDict
import pandas as pd

def is_loadable(path):
    """Return True if librosa can load ``path`` at 16 kHz, False if loading raises.

    Only ``Exception`` subclasses are treated as "not loadable";
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so a long scan can still
    be interrupted.
    """
    try:
        librosa.load(path, sr=16000)
    except Exception:
        return False
    return True

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Try to load every training file and keep only the rows that load.
print("Train before:", len(train_df))
train_df["ok"] = train_df["path_to_wav"].progress_apply(is_loadable)
train_df = train_df[train_df["ok"]].drop(columns="ok").reset_index(drop=True)
print("Train after :", len(train_df))

# Same check for the validation split.
print("Valid before:", len(val_df))
val_df["ok"] = val_df["path_to_wav"].progress_apply(is_loadable)
val_df = val_df[val_df["ok"]].drop(columns="ok").reset_index(drop=True)
print("Valid after :", len(val_df))

# Rebuild the DatasetDict from the filtered frames.
ds = DatasetDict({
    "train":      Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
})

from transformers import Wav2Vec2FeatureExtractor
fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
# Label ids follow the alphabetical order of the emotions present in the training split.
label_list = sorted(ds["train"].unique("emotion"))
label2id    = {e:i for i,e in enumerate(label_list)}

def preprocess(batch):
    """Turn one dataset row into model inputs.

    Args:
        batch: a single row (``ds.map`` is called with ``batched=False``)
            holding ``path_to_wav`` and ``emotion``.

    Returns:
        dict with ``input_values`` (the feature extractor's output for this
        recording) and ``labels`` (the integer id from ``label2id``).
    """
    speech, sr = librosa.load(batch["path_to_wav"], sr=16000)
    # Request NumPy output directly instead of building a torch tensor only to
    # convert it back with .numpy().
    inputs = fe(speech, sampling_rate=sr, return_tensors="np")
    return {
        "input_values": inputs["input_values"][0],
        "labels":       label2id[batch["emotion"]],
    }

# Apply preprocess row by row in 4 worker processes. The original columns are
# removed so only the keys returned by preprocess remain, and the cache is
# bypassed so the features are recomputed on every run.
ds = ds.map(
    preprocess,
    remove_columns=ds["train"].column_names,
    batched=False,
    num_proc=4,
    load_from_cache_file=False
)

In [None]:
import numpy as np
import torch
from transformers import (
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    TrainerState,
    TrainerControl
)
import evaluate

# Rebinds `model` (previously the MelCNN instance) to a wav2vec2 classifier
# with one output per emotion in label2id.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    label2id=label2id,
    id2label={i: e for e, i in label2id.items()},
)

# `evaluate` here is the module imported in this cell; that import shadows the
# evaluate(loader) helper defined for the CNN experiment.
metric = evaluate.load("f1")

def compute_metrics(pred):
    """Compute macro-averaged F1 from a prediction object.

    Reads ``pred.predictions`` (class scores) and ``pred.label_ids``; the
    predicted class is the arg-max over the last axis.
    """
    return metric.compute(
        predictions=np.argmax(pred.predictions, axis=-1),
        references=pred.label_ids,
        average="macro",
    )

# Training configuration for the plain wav2vec2 run: 3 epochs, batch size 8,
# a log entry every 50 steps, no checkpoints, no external experiment reporting.
training_args = TrainingArguments(
    output_dir="wav2vec2_ser",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="no",
    report_to="none",
)

class ConsoleProgressCallback(TrainerCallback):
    """Print the training loss to stdout each time the Trainer emits a log entry.

    The loss is read in ``on_log`` rather than ``on_step_end``. The Trainer
    fires ``on_step_end`` before it records the log for that step, so reading
    ``state.log_history[-1]`` there returns the previous interval's loss (and
    nothing at all at the first logging step).
    """

    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        # Evaluation and end-of-training summaries carry no "loss" key and are skipped.
        if logs and "loss" in logs:
            print(f"[Step {state.global_step}/{state.max_steps}]  Loss: {logs['loss']:.4f}")

# Fine-tune on the preprocessed splits. The feature extractor is passed as
# `tokenizer`; no explicit data_collator is supplied in this run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset= ds["validation"],
    tokenizer=fe,
    compute_metrics=compute_metrics,
    # The callback class itself (not an instance) is handed to the Trainer.
    callbacks=[ConsoleProgressCallback]
)

trainer.train()
# Final evaluation on the validation split.
trainer.evaluate()

In [None]:
import os
import numpy as np
import torch
import pandas as pd
import librosa
import soundfile as sf

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback, TrainerState, TrainerControl,
    DataCollatorWithPadding,
)
import evaluate

In [None]:
# Metadata table and extracted audio folder on the local runtime disk.
CSV_ALL   = "/content/df_IEMOCAP_with_text.csv"
AUDIO_DIR = "/content/audio/content/audio"

df = pd.read_csv(CSV_ALL)
# Emotion classes kept for this experiment.
target = ["ang", "dis", "fea", "hap", "sad", "sur"]
df = df[df["emotion"].isin(target)].reset_index(drop=True)

# Stratified 80/20 split with a fixed seed.
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["emotion"], random_state=42
)
# assign() returns new frames, so the column is never written into an object
# derived from `df` (which can trigger pandas' SettingWithCopyWarning).
train_df = train_df.assign(
    path_to_wav=train_df["name_files"].map(lambda fn: f"{AUDIO_DIR}/{fn}.wav")
)
val_df = val_df.assign(
    path_to_wav=val_df["name_files"].map(lambda fn: f"{AUDIO_DIR}/{fn}.wav")
)

# Count how many of the derived paths exist on disk (True) or not (False).
print("Train files:", train_df["path_to_wav"].map(os.path.exists).value_counts())
print("Val   files:", val_df["path_to_wav"].map(os.path.exists).value_counts())

Train files: path_to_wav
True     2343
False       1
Name: count, dtype: int64
Val   files: path_to_wav
True     586
False      1
Name: count, dtype: int64


In [None]:
# Wrap both splits as Hugging Face datasets; the index is reset before conversion.
ds = DatasetDict({
    "train":      Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df .reset_index(drop=True)),
})

def is_valid(ex):
    """Return True if soundfile can read the header of ``ex["path_to_wav"]``.

    Only ``Exception`` subclasses count as "invalid"; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so the filter pass can still be interrupted.
    """
    try:
        sf.info(ex["path_to_wav"])
    except Exception:
        return False
    return True

# Drop rows for which is_valid returns False (soundfile could not read the file).
ds = ds.filter(is_valid)

In [None]:
# Feature extractor matching the base checkpoint.
fe = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
# Emotions present in the training split, alphabetically; the position is the class id.
label_list = sorted(ds["train"].unique("emotion"))
id2label    = dict(enumerate(label_list))
label2id    = {emotion: idx for idx, emotion in id2label.items()}

In [None]:
def preprocess(example):
    """Convert one dataset row into model inputs plus two energy statistics.

    Loads ``example["path_to_wav"]`` at 16 kHz and returns a dict with:
      * ``input_values`` - the feature extractor's output for the recording,
      * ``labels``       - the integer id of ``example["emotion"]``,
      * ``prosody``      - float32 array ``[mean, std]`` of the frame-wise RMS.
    """
    waveform, rate = librosa.load(example["path_to_wav"], sr=16_000)

    # Frame-level RMS energy, summarised by its mean and standard deviation.
    frame_rms = librosa.feature.rms(y=waveform)[0]
    energy_stats = np.array([frame_rms.mean(), frame_rms.std()], dtype=np.float32)

    encoded = fe(waveform, sampling_rate=rate, return_tensors="np")

    return dict(
        input_values=encoded["input_values"][0],
        labels=label2id[example["emotion"]],
        prosody=energy_stats,
    )

# Apply preprocess row by row; the original columns are removed so only
# input_values, labels and prosody remain.
ds = ds.map(
    preprocess,
    remove_columns=ds["train"].column_names,
    batched=False
)

# Show the resulting size and one processed example.
print("После map:", len(ds["train"]), "записей; пример:", ds["train"][0])

In [None]:
# Padding collator built on the feature extractor.
# NOTE(review): the Trainer constructed later in this notebook is given
# prosody_data_collator instead, so this object appears to be unused.
data_collator = DataCollatorWithPadding(
    tokenizer=fe,
    padding=True
)

In [None]:
# Fresh wav2vec2 classifier with one output per emotion in label2id.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    label2id=label2id,
    id2label={i: e for e, i in label2id.items()},
)

# F1 metric object used by compute_metrics.
metric = evaluate.load("f1")
def compute_metrics(p):
    """Compute macro-averaged F1 from a prediction object.

    The predicted class is the arg-max of ``p.predictions`` along axis 1;
    references come from ``p.label_ids``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    scores = metric.compute(
        predictions=predicted_ids,
        references=p.label_ids,
        average="macro",
    )
    return scores

In [None]:
from transformers import Wav2Vec2Processor

# Processor for the same checkpoint; it is handed to ProsodyCollator as the padding backend.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

In [None]:
from transformers import DataCollatorWithPadding

class ProsodyCollator:
    """Pad a batch with ``DataCollatorWithPadding`` after removing ``prosody``.

    The ``prosody`` entry of each feature is dropped before padding and is not
    added to the returned batch.
    NOTE(review): presumably it is dropped because the model's forward() has no
    argument to receive it — confirm before forwarding it.

    Args:
        processor: object passed to ``DataCollatorWithPadding`` as ``tokenizer``.
    """

    def __init__(self, processor):
        self.base_collator = DataCollatorWithPadding(tokenizer=processor)

    def __call__(self, features):
        """Return the padded batch built from ``features`` without their ``prosody`` key.

        The incoming dicts are copied rather than mutated, and a feature that
        has no ``prosody`` key is accepted as-is.
        """
        stripped = [
            {key: value for key, value in feature.items() if key != "prosody"}
            for feature in features
        ]
        return self.base_collator(stripped)

# Collator instance handed to the Trainer as data_collator.
prosody_data_collator = ProsodyCollator(processor=processor)

In [None]:
from transformers import ProgressCallback

class ConsoleProgress(TrainerCallback):
    """Print the training loss to stdout each time the Trainer emits a log entry.

    The loss is read in ``on_log`` rather than ``on_step_end``. The Trainer
    fires ``on_step_end`` before it records the log for that step, so reading
    ``state.log_history[-1]`` there returns the previous entry, which may be a
    stale loss or an evaluation record without a loss.
    """

    def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
        # Evaluation and end-of-training summaries carry no "loss" key and are skipped.
        loss = (logs or {}).get("loss")
        if loss is not None:
            print(f"[Step {state.global_step}/{state.max_steps}]  Loss: {loss:.4f}")


# Training configuration for the run whose dataset carries the extra
# `prosody` column: 3 epochs, batch size 8, a log entry every 50 steps,
# evaluation every 200 steps, no checkpoints, no external reporting.
training_args = TrainingArguments(
    output_dir="./wav2vec2_with_prosody",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=50,
    eval_steps=200,
    eval_strategy="steps",
    save_strategy="no",
    report_to="none",
    # NOTE(review): presumably disabled so the `prosody` column reaches the
    # collator instead of being stripped by the Trainer — confirm.
    remove_unused_columns=False,
)

# Fine-tune with the prosody-aware collator, evaluating on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=prosody_data_collator,
    compute_metrics=compute_metrics,
    # Both callbacks are passed as classes, not instances.
    callbacks=[ConsoleProgress, ProgressCallback],
)

trainer.train()
# Final evaluation on the validation split.
trainer.evaluate()