In [1]:
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

In [31]:
# Load the train and test CSV tables.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Use the GPU when torch reports one as available, otherwise run on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A single checkpoint id is used for both the classifier and its feature extractor.
GENDER_CHECKPOINT = 'alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech'
processor = Wav2Vec2FeatureExtractor.from_pretrained(GENDER_CHECKPOINT)
model = AutoModelForAudioClassification.from_pretrained(GENDER_CHECKPOINT).to(DEVICE)

In [25]:
# Put the model into evaluation mode. Because this is the cell's last
# expression, the returned module is also displayed as the cell output.
model.eval()

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=

In [29]:
# Predict one class index per training clip, processing a single file at a time.
preds = []

# Inference only: with autograd disabled, no graph is recorded for each
# forward pass, which saves memory and time without changing the logits.
with torch.no_grad():
    for path in tqdm(train.path, desc='train gender'):
        # The first two characters of the stored path are dropped before it is
        # joined to '../data/' (presumably a leading './' -- confirm in train.csv).
        speech, _ = librosa.load('../data/' + path[2:], sr=16000)
        # The extractor is told the same 16 kHz rate that the audio was loaded at.
        input_value = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True).input_values
        input_value = input_value.to(DEVICE)
        logit = model(input_value).logits
        # Keep the index of the largest logit as a plain Python int.
        preds.append(int(torch.argmax(logit)))

In [30]:
# Attach the predicted class indices to the training ids and write them to CSV
# without the index column.
gender_class = train[['id']].assign(gender=preds)
gender_class.to_csv('../data/train_gender.csv', index=False)

In [33]:
# Predict one class index per test clip, processing a single file at a time.
preds = []

# Inference only: with autograd disabled, no graph is recorded for each
# forward pass, which saves memory and time without changing the logits.
with torch.no_grad():
    for path in tqdm(test.path, desc='test gender'):
        # The first two characters of the stored path are dropped before it is
        # joined to '../data/' (presumably a leading './' -- confirm in test.csv).
        speech, _ = librosa.load('../data/' + path[2:], sr=16000)
        # The extractor is told the same 16 kHz rate that the audio was loaded at.
        input_value = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True).input_values
        input_value = input_value.to(DEVICE)
        logit = model(input_value).logits
        # Keep the index of the largest logit as a plain Python int.
        preds.append(int(torch.argmax(logit)))

In [34]:
# Attach the predicted class indices to the test ids and write them to CSV
# without the index column.
gender_class = test[['id']].assign(gender=preds)
gender_class.to_csv('../data/test_gender.csv', index=False)