In [1]:
# Mount Google Drive at /content/drive; the dataset paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install librosa



In [3]:
import numpy as np
import pandas as pd
import librosa
from scipy.io import wavfile

In [4]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor, AdamW, get_linear_schedule_with_warmup

In [5]:
# Load the data: one CSV whose 'Seg' and 'Label' columns are read by later cells.
df_all_txt = pd.read_csv("/content/drive/MyDrive/Text_Audio_Multimodal/data/df_all_txt.csv")

In [6]:
# Label encoding: replace each emotion name in 'Label' with its integer class id.
LABEL_TO_ID = {
    "neutral": 0,
    "happiness": 1,
    "surprise": 2,
    "sadness": 3,
    "angry": 4,
    "disgust": 5,
    "fear": 6,
}
for emotion_name, class_id in LABEL_TO_ID.items():
    df_all_txt.loc[df_all_txt['Label'] == emotion_name, 'Label'] = class_id

# Fail here, with the offending values, instead of much later when int(label) is
# called on a value that was never mapped. Already-encoded ids are accepted, so
# re-running this cell stays harmless.
_valid_ids = set(LABEL_TO_ID.values())
_unmapped = [value for value in df_all_txt['Label'].unique() if value not in _valid_ids]
if _unmapped:
    raise ValueError(f"Unmapped values in 'Label' column: {_unmapped}")

In [7]:
# Parallel Python lists: wav file stems ('Seg') and their encoded labels ('Label').
all_wav_tmp, all_emotion = (df_all_txt[column].tolist() for column in ("Seg", "Label"))

In [8]:
# Load the Wav2Vec2 processor and a 7-label sequence-classification model
# from the same pretrained checkpoint.
MODEL_CHECKPOINT = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=7)

# Move the model to the GPU; as the cell's last expression this also displays the module.
model.to('cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [9]:
# audio data -> wav2vec input
def audio2input(filepath):
    """Read a WAV file and convert it to Wav2Vec2 ``input_values``.

    The waveform is copied to float32, averaged down to one channel when the
    file is multi-channel, resampled to 16 kHz when it was recorded at another
    rate, and then passed through the module-level ``processor``.

    Args:
        filepath: Path of the ``.wav`` file to read.

    Returns:
        The ``input_values`` produced by ``processor`` with ``return_tensors="pt"``.
    """
    target_sr = 16000
    samplerate, audio = wavfile.read(filepath, mmap=True)

    # Always hand the processor float32 samples, whatever dtype the file stores.
    # np.array copies, so the result no longer refers to the memory-mapped file.
    audio = np.array(audio, dtype=np.float32)

    # scipy returns multi-channel audio as (n_samples, n_channels); average the
    # channels so that resampling and the processor both see a 1-D waveform.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    if samplerate != target_sr:  # Resample to 16000 Hz only when necessary
        audio = librosa.resample(audio, orig_sr=samplerate, target_sr=target_sr)

    # Explicitly state the sampling rate of the samples being passed in
    return processor(audio, sampling_rate=target_sr, return_tensors="pt").input_values

In [None]:
# Build one record per sample: file stem, processed audio and label, all placed on the GPU.
docs = []
for filename, label in zip(all_wav_tmp, all_emotion):
    wav_features = audio2input(f"/content/drive/MyDrive/Text_Audio_Multimodal/data/merged_wav_folder/{filename}.wav").to('cuda')
    # The label is stored as a one-element integer tensor
    target = torch.tensor([int(label)], device='cuda')
    docs.append({'fileName': filename, 'input_values': wav_features, 'label': target})

KeyboardInterrupt: 

In [None]:
# Number of passes over train_list; it is also multiplied by len(train_list) to size the LR schedule.
NUM_EPOCHS = 16

In [None]:
# Train / Test Split
# The first 80% of `docs` (in list order) is used for training and the rest for
# testing. With the full 19374-sample dataset this is 15499 + 3875. The index is
# derived from len(docs) so a shorter `docs` list still yields a non-empty test set.
TRAIN_FRACTION = 0.8
split_index = int(len(docs) * TRAIN_FRACTION)
train_list = docs[:split_index]
test_list = docs[split_index:]
if not train_list:
    raise ValueError("`docs` has too few samples to build a training split")

# Defined for callers that need an explicit loss; the training cell in this
# notebook takes the loss from the model output instead.
criterion = nn.CrossEntropyLoss()
# NOTE(review): transformers' AdamW is reported as deprecated in favour of
# torch.optim.AdamW - confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=2e-4, eps=1e-8)

# One scheduler step is taken per training sample, so the linear decay spans
# len(train_list) * NUM_EPOCHS steps with no warmup.
num_training_steps = len(train_list) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Kept for cells that use `total_steps` as a 1-based step counter for logging.
total_steps = 1

In [None]:
import time

# Training: one optimizer step per sample, repeated for NUM_EPOCHS passes over train_list.
# The step counter is initialised here so that re-running this cell always
# starts counting from 1, independent of state left behind by other cells.
global_step = 1
for epoch in range(NUM_EPOCHS):
    model.train()

    # Record the epoch start time
    start_time = time.time()

    for every_trainlist in train_list:
        # .to() returns the same tensor when it already lives on the model's device
        input_values = every_trainlist['input_values'].to(model.device)
        label = every_trainlist['label'].to(model.device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside its outputs
        outputs = model(input_values, labels=label)
        loss = outputs.loss

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Progress log every 1000 steps; this is the loss of the current sample only
        if global_step % 1000 == 0:
            print(f'Epoch: {epoch + 1} \tStep: {global_step} \tLoss: {loss.item():.4f}')
        global_step += 1

    # Record the epoch end time and compute the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    # Convert the elapsed time to hours/minutes/seconds and print it
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f'Epoch {epoch + 1} completed in {int(hours)}h {int(minutes)}m {int(seconds)}s')


In [None]:
# Save the trained model.
# The whole model object (not just a state_dict) is written to the current working directory.
# NOTE(review): on Colab this location is presumably outside the mounted Drive - confirm
# the file is copied somewhere persistent before the session ends.
torch.save(model, './audio_train_model.pt')