In [1]:
import os

import pandas as pd
import torch, torchaudio
from datasets import load_metric
from IPython.display import display, Audio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer

import utils

In [2]:
# Audio sample rate in samples per second. Segment lengths are compared
# against it (1 s threshold) and divided by it to convert to seconds.
SR = 16000

# Directory of the recording session to process.
folder = './data/jotun 02-05-2023/'

# Root directory under which one sub-folder per speaker is created.
output_dir = './training_data/'

In this notebook I attempt to teach wav2vec2 to recognize my own voice and the voices of my players.

### Preparing the dataset

The goal is to build a torch dataset containing 5 minutes of audio per speaker, made up of speech segments longer than 1 s.

In [12]:
# Load the Audacity project for the recording session. The session path comes
# from the `folder` constant in the configuration cell so that it is defined
# in exactly one place.
project = utils.audacity.get_audacity_object(folder)

# One DataFrame of voice-activity segments per project file.
dfs = []
for file in project['files']:
    filename = file['filename']
    speakerName = utils.get_discord_name(filename)

    # Make sure the per-speaker output folder exists (no-op if it already does).
    targetDir = os.path.join(output_dir, speakerName)
    os.makedirs(targetDir, exist_ok=True)

    fullPath = os.path.join(project['data_path'], filename)
    vadData = utils.prepare.get_vad_data(fullPath)
    df = pd.DataFrame(vadData)
    if df.empty:
        # No segments detected for this file. A frame built from an empty
        # result has no 'start'/'end' columns, so the arithmetic below would
        # raise KeyError; skip the file instead.
        continue

    df['length'] = df['end'] - df['start']
    df['speaker'] = speakerName
    # Keep only segments longer than one second. NOTE(review): this assumes
    # 'start'/'end' are sample offsets at SR samples per second - confirm
    # against utils.prepare.get_vad_data.
    df = df[df['length'] > SR]
    dfs.append(df)

# The per-file index is kept as-is, so index labels repeat across speakers.
segments_df = pd.concat(dfs)
segments_df

Unnamed: 0,start,end,length,speaker
2,3249696,3299296,49600,JadePixie_7138
11,4489760,4544992,55232,JadePixie_7138
12,5837856,5864928,27072,JadePixie_7138
13,5873696,5899744,26048,JadePixie_7138
14,5901344,5941216,39872,JadePixie_7138
...,...,...,...,...
1489,193191968,193212384,20416,Crux_4429
1490,193484320,193596896,112576,Crux_4429
1491,194253344,194291680,38336,Crux_4429
1492,194312224,194329056,16832,Crux_4429


In [14]:
# Number of retained segments per speaker, most frequent first.
segments_df['speaker'].value_counts()

CS12_4510           954
Crux_4429           781
IlMaximuslI_9218    427
JadePixie_7138      291
Marburg42_5566      281
Name: speaker, dtype: int64

In [13]:
# Total retained segment length per speaker, divided by the sample rate to
# express it in seconds.
segments_df.groupby('speaker')['length'].sum().div(SR)

speaker
CS12_4510           3365.272
Crux_4429           1710.092
IlMaximuslI_9218     953.300
JadePixie_7138       658.132
Marburg42_5566       645.564
Name: length, dtype: float64

### Preparing the models

In [7]:
# Feature extractor and classification model are loaded from the same
# pretrained checkpoint so that preprocessing matches the model.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid")

# NOTE(review): the model is loaded with the checkpoint's own classification
# head. Confirm whether the head should be resized for the speakers in this
# dataset (e.g. `num_labels` with `ignore_mismatched_sizes=True`) before
# fine-tuning.
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid")


In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 32

# Hyperparameters for the fine-tuning run.
args = TrainingArguments(
    # Output directory for checkpoints (first positional argument). A plain
    # string literal: there is nothing to interpolate, so no f-prefix.
    "wav2vec2-base-superb-sid-finetuned-ks",
    # Evaluate and save once per epoch; the two strategies are kept identical
    # because `load_best_model_at_end` is enabled below.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    # Gradients are accumulated over 4 steps before each optimizer update.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # The Trainer's `compute_metrics` must report a metric named "accuracy"
    # for best-model selection to work.
    metric_for_best_model="accuracy",
    # NOTE(review): this uploads the trained model to the Hugging Face Hub.
    # Confirm that is intended for a model trained on these speakers' voices
    # and that the target repository's visibility is appropriate.
    push_to_hub=True,
)

In [None]:
# Wire the model, training arguments and feature extractor into a Trainer.
# TODO: the `...` (Ellipsis) values are placeholders - the training set,
# evaluation set and metric function still have to be supplied before this
# cell can be used for training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=...,
    eval_dataset=...,
    tokenizer=feature_extractor,
    compute_metrics=...,
)

In [None]:
# Smoke test: check that the accuracy metric can be loaded.
# `load_metric` is imported together with the other dependencies in the
# notebook's import cell.
load_metric('accuracy')

In [23]:
# Sanity-check the accuracy metric on a toy example in which every prediction
# equals its reference label.
accuracy_metric = load_metric("accuracy")
results = accuracy_metric.compute(
    predictions=[0, 1, 2, 0, 1, 2],
    references=[0, 1, 2, 0, 1, 2],
)
results

{'accuracy': 1.0}