## Bibliotecas Utilizadas

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import os
import librosa
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
       print(os.path.join(dirname, filename))



/kaggle/input/zenodo-sounds/emoUERJ/m04s08.wav
/kaggle/input/zenodo-sounds/emoUERJ/m03n08.wav
/kaggle/input/zenodo-sounds/emoUERJ/w02n01.wav
/kaggle/input/zenodo-sounds/emoUERJ/w03a05.wav
/kaggle/input/zenodo-sounds/emoUERJ/m03h04.wav
/kaggle/input/zenodo-sounds/emoUERJ/m01s07.wav
/kaggle/input/zenodo-sounds/emoUERJ/m02a09.wav
/kaggle/input/zenodo-sounds/emoUERJ/m02n05.wav
/kaggle/input/zenodo-sounds/emoUERJ/w01a01.wav
/kaggle/input/zenodo-sounds/emoUERJ/w01s08.wav
/kaggle/input/zenodo-sounds/emoUERJ/w04s07.wav
/kaggle/input/zenodo-sounds/emoUERJ/m03n03.wav
/kaggle/input/zenodo-sounds/emoUERJ/m01n02.wav
/kaggle/input/zenodo-sounds/emoUERJ/w04a16.wav
/kaggle/input/zenodo-sounds/emoUERJ/m01n09.wav
/kaggle/input/zenodo-sounds/emoUERJ/w04s16.wav
/kaggle/input/zenodo-sounds/emoUERJ/w01h10.wav
/kaggle/input/zenodo-sounds/emoUERJ/w04a11.wav
/kaggle/input/zenodo-sounds/emoUERJ/m04n03.wav
/kaggle/input/zenodo-sounds/emoUERJ/w04s08.wav
/kaggle/input/zenodo-sounds/emoUERJ/m02a08.wav
/kaggle/input

## Criação de DATAFRAME 

In [2]:

data_path = "/kaggle/input/zenodo-sounds/emoUERJ"
file_names = os.listdir(data_path)

rows = []
for f in file_names:
    if f.endswith(".wav"):
        # Ex.: f = "m01a01.wav"
        # Remove extensão
        name_without_ext = os.path.splitext(f)[0]  # "m01a01"
        
        # Emoção na posição 4 (índice 3)
        emotion_char = name_without_ext[3]
        
        # Mapeia o caractere para rótulo completo
        if emotion_char == 'h':
            emotion_label = 'happy'
        elif emotion_char == 'a':
            emotion_label = 'angry'
        elif emotion_char == 's':
            emotion_label = 'sad'
        elif emotion_char == 'n':
            emotion_label = 'neutral'
        else:
            emotion_label = 'unknown'
        
        # Caminho completo do arquivo para posterior uso
        full_path = os.path.join(data_path, f)
        
        rows.append([full_path, emotion_label])

# Cria DataFrame
df = pd.DataFrame(rows, columns=["path", "label"])
df.head(10)


Unnamed: 0,path,label
0,/kaggle/input/zenodo-sounds/emoUERJ/m04s08.wav,sad
1,/kaggle/input/zenodo-sounds/emoUERJ/m03n08.wav,neutral
2,/kaggle/input/zenodo-sounds/emoUERJ/w02n01.wav,neutral
3,/kaggle/input/zenodo-sounds/emoUERJ/w03a05.wav,angry
4,/kaggle/input/zenodo-sounds/emoUERJ/m03h04.wav,happy
5,/kaggle/input/zenodo-sounds/emoUERJ/m01s07.wav,sad
6,/kaggle/input/zenodo-sounds/emoUERJ/m02a09.wav,angry
7,/kaggle/input/zenodo-sounds/emoUERJ/m02n05.wav,neutral
8,/kaggle/input/zenodo-sounds/emoUERJ/w01a01.wav,angry
9,/kaggle/input/zenodo-sounds/emoUERJ/w01s08.wav,sad


## Salvando o DataFrame 

In [3]:
df.to_csv("metadata.csv", index=False)


## Treino e Validação 

In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

print("Tamanho treino:", len(train_df))
print("Tamanho validação:", len(val_df))


Tamanho treino: 301
Tamanho validação: 76


## Carregamento do Modelo

In [5]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

model_name = "facebook/wav2vec2-large-xlsr-53"  # Exemplo
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Defina o número de classes baseado no seu DF
unique_labels = sorted(df['label'].unique().tolist())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_labels)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)


preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Criação de Datasets a partir de DataFrames

In [6]:
from datasets import Dataset

train_dataset_ds = Dataset.from_pandas(train_df)
val_dataset_ds   = Dataset.from_pandas(val_df)


## Extração de Características

In [7]:

def preprocess_function(examples):
    audio_path = examples["path"]
    # Carrega o áudio
    waveform, sr = librosa.load(audio_path, sr=16000, mono=True)
    
    # Extrai features
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    
    # Se seu dataset tiver "label", converta em índice
    label_str = examples["label"]
    label_id = label2id[label_str]
    
    # Monta o dicionário de saída
    out = {
        "input_values": inputs["input_values"][0], 
        "labels": label_id
    }
    # Se o feature_extractor retornar "attention_mask", você pode incluir aqui:
    if "attention_mask" in inputs:
        out["attention_mask"] = inputs["attention_mask"][0]
    
    return out


## função de pré-processamento a cada exemplo dos datasets de treino e validação, transformando os dados brutos para o formato necessário ao modelo

In [8]:
train_dataset_ds = train_dataset_ds.map(preprocess_function)
val_dataset_ds   = val_dataset_ds.map(preprocess_function)                                             


Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

## Início de treinamento do modelo 

In [9]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


# Aplicar a função de pré-processamento aos datasets
train_dataset_ds = train_dataset_ds.map(preprocess_function)
val_dataset_ds   = val_dataset_ds.map(preprocess_function)

# Remover as colunas que não são necessárias (como "path" e "label")
train_dataset_ds = train_dataset_ds.remove_columns(["path", "label"])
val_dataset_ds   = val_dataset_ds.remove_columns(["path", "label"])


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="wav2vec2_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=9,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    logging_steps=10,
    report_to=[],
    save_total_limit=1  # mantém apenas o checkpoint mais recente
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_ds,
    eval_dataset=val_dataset_ds,
    tokenizer=feature_extractor,  # se usar 'trainer.predict'
    compute_metrics=compute_metrics
)

trainer.train()


Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3736,1.299542,0.513158,0.478643
2,1.1933,0.98682,0.605263,0.526798
3,0.9052,0.806108,0.605263,0.549567
4,0.6907,0.806955,0.539474,0.43046
5,0.7409,0.845383,0.618421,0.571713
6,0.6274,0.807916,0.671053,0.644819
7,0.6128,0.5173,0.789474,0.782329
8,0.4608,0.500408,0.828947,0.817804
9,0.5138,0.521481,0.842105,0.836291




TrainOutput(global_step=342, training_loss=0.8045652470393487, metrics={'train_runtime': 591.8201, 'train_samples_per_second': 4.577, 'train_steps_per_second': 0.578, 'total_flos': 4.01268417805559e+17, 'train_loss': 0.8045652470393487, 'epoch': 9.0})

In [10]:
eval_metrics = trainer.evaluate()
print(eval_metrics)
  



{'eval_loss': 0.5214807987213135, 'eval_accuracy': 0.8421052631578947, 'eval_f1': 0.8362914862914862, 'eval_runtime': 6.1456, 'eval_samples_per_second': 12.367, 'eval_steps_per_second': 0.814, 'epoch': 9.0}


## Testando o Modelo Áudio único Locutor 

In [11]:
def predict_emotion(audio_path):
    # Carrega o áudio com 16kHz e mono
    audio, sr = librosa.load(audio_path, sr=16000, mono=True)
    
    # Extrai features com truncation e padding, definindo max_length (ex: 16000 amostras = 1s)
    inputs = feature_extractor(
        audio,
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=16000
    )
    
    # Mova os tensores para o mesmo dispositivo do modelo (ex.: cuda:0)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_id = torch.argmax(logits, dim=-1).item()
    
    return id2label[pred_id]

# Exemplo de uso:
audio_test_path = "/kaggle/input/zenodo-sounds/emoUERJ/w03a04.wav"   
predicted_emotion = predict_emotion(audio_test_path)
print("Emoção detectada:", predicted_emotion)


Emoção detectada: happy


## Testando o Modelo Áudio mais de um  Locutor 

In [12]:
!pip install pyannote.audio huggingface_hub

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.w