In [3]:
!unzip /content/birdsong-recognition-cmu.zip

Archive:  /content/birdsong-recognition-cmu.zip
  inflating: Data_files/test/xc101862.flac  
  inflating: Data_files/test/xc101932.flac  
  inflating: Data_files/test/xc101933.flac  
  inflating: Data_files/test/xc101934.flac  
  inflating: Data_files/test/xc101940.flac  
  inflating: Data_files/test/xc102048.flac  
  inflating: Data_files/test/xc102106.flac  
  inflating: Data_files/test/xc102788.flac  
  inflating: Data_files/test/xc102803.flac  
  inflating: Data_files/test/xc102932.flac  
  inflating: Data_files/test/xc110167.flac  
  inflating: Data_files/test/xc116226.flac  
  inflating: Data_files/test/xc118441.flac  
  inflating: Data_files/test/xc120666.flac  
  inflating: Data_files/test/xc121963.flac  
  inflating: Data_files/test/xc123167.flac  
  inflating: Data_files/test/xc123168.flac  
  inflating: Data_files/test/xc124052.flac  
  inflating: Data_files/test/xc125137.flac  
  inflating: Data_files/test/xc132392.flac  
  inflating: Data_files/test/xc134291.flac  
  infla

# Prepare data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import tqdm

In [2]:
# Genus name -> integer class id. The ids are the positions of the names in
# this alphabetically ordered tuple (0 .. 13).
feature_map = {
    genus_name: class_id
    for class_id, genus_name in enumerate((
        'Acrocephalus', 'Anthus', 'Columba', 'Corvus', 'Emberiza',
        'Motacilla', 'Passer', 'Phylloscopus', 'Pluvialis', 'Poecile',
        'Streptopelia', 'Sylvia', 'Tringa', 'Turdus',
    ))
}

In [3]:
# Load the training labels, derive each recording's FLAC path from its file id,
# and encode the genus name as an integer class id via `feature_map`.
train_df = pd.read_csv('/content/train.csv')
train_df['file_path'] = '/content/Data_files/train/xc' + train_df['file_id'].astype(str) + '.flac'

genus_ids = train_df['genus'].map(feature_map)
# `.map` yields NaN for any name missing from `feature_map`; fail loudly here
# instead of letting NaN labels flow into the encoded dataset.
unmapped_genera = train_df.loc[genus_ids.isna(), 'genus'].unique()
assert len(unmapped_genera) == 0, f"genus values missing from feature_map: {list(unmapped_genera)}"
train_df['genus'] = genus_ids
train_df.head()

Unnamed: 0,file_id,genus,file_path
0,71748,7,/content/Data_files/train/xc71748.flac
1,94958,13,/content/Data_files/train/xc94958.flac
2,125777,4,/content/Data_files/train/xc125777.flac
3,143002,3,/content/Data_files/train/xc143002.flac
4,82715,0,/content/Data_files/train/xc82715.flac


In [4]:
!pip -q install datasets

In [5]:
import datasets

In [4]:
from datasets import Dataset, Audio
import pandas as pd
# Wrap the label table in a Dataset and rename the columns:
# genus -> genre, file_path -> file.
dataset = Dataset.from_pandas(train_df).rename_columns(
    {"genus": "genre", "file_path": "file"}
)
# Add an "audio" column holding the file paths and cast it to the Audio feature;
# indexing a row then returns the path, sample array and sampling rate.
dataset = dataset.add_column("audio", dataset["file"]).cast_column("audio", Audio())
print(dataset[0])

{'file_id': 71748, 'genre': 7, 'file': '/content/Data_files/train/xc71748.flac', 'audio': {'path': '/content/Data_files/train/xc71748.flac', 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
       -0.00036621, -0.00030518]), 'sampling_rate': 44100}}


In [6]:
import numpy as np

# Summary statistics of the first recording's waveform before any processing.
sample = dataset[0]["audio"]
raw_waveform = sample["array"]
raw_mean, raw_variance = np.mean(raw_waveform), np.var(raw_waveform)

print(f"Mean: {raw_mean:.3}, Variance: {raw_variance:.3}")

Mean: 1.52e-05, Variance: 0.000501


# Modeling

In [7]:
from transformers import AutoFeatureExtractor

# Checkpoint used for both the feature extractor and, later, the model.
model_id = "ntu-spml/distilhubert"

# Extractor options: normalise the input values and also return an attention mask.
extractor_options = {"do_normalize": True, "return_attention_mask": True}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Sampling rate configured on the feature extractor; the audio column is
# re-cast to this rate in a later cell.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [9]:
from datasets import Audio

# Re-cast the audio column with the feature extractor's sampling rate so that
# decoded examples report `sampling_rate` instead of the files' native rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [10]:
# Inspect the first example after the audio column was re-cast.
dataset[0]

{'file_id': 71748,
 'genre': 7,
 'file': '/content/Data_files/train/xc71748.flac',
 'audio': {'path': '/content/Data_files/train/xc71748.flac',
  'array': array([-9.40302957e-13,  2.00470703e-13, -1.16861219e-13, ...,
          1.97424181e-03,  1.50410668e-03, -3.00955580e-04]),
  'sampling_rate': 16000}}

In [11]:
import numpy as np

# Same summary statistics as before, now on the re-cast (resampled) waveform.
# `sample` is reused by the feature-extractor check in the next cell.
sample = dataset[0]['audio']
resampled_waveform = sample['array']
resampled_mean, resampled_variance = np.mean(resampled_waveform), np.var(resampled_waveform)

print(f"Mean: {resampled_mean:.3}, Variance: {resampled_variance:.3}")

Mean: 1.52e-05, Variance: 0.000499


In [12]:
# Run the feature extractor on a single waveform, then report which keys it
# returns and the mean/variance of the resulting input values.
inputs = feature_extractor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
)
extracted_values = inputs["input_values"]

print(f"inputs keys: {list(inputs.keys())}")
print(f"Mean: {np.mean(extracted_values):.3}, Variance: {np.var(extracted_values):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: 1.34e-09, Variance: 1.0


In [13]:
max_duration = 30.0  # seconds; multiplied by the sampling rate to get max_length in samples


def preprocess_function(examples):
    """Run the feature extractor over a batch of decoded audio examples.

    Args:
        examples: a batch whose "audio" entry is a sequence of dicts, each
            carrying the waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch, with truncation enabled
        at ``sampling_rate * max_duration`` samples and an attention mask
        requested.
    """
    target_rate = feature_extractor.sampling_rate

    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )

In [14]:
# Display the dataset summary (feature names and row count) before encoding.
dataset

Dataset({
    features: ['file_id', 'genre', 'file', 'audio'],
    num_rows: 54
})

In [15]:
# Encode the dataset in batches of 10 in a single process. The raw audio,
# file id and file path columns are dropped, leaving the label column and
# the columns produced by `preprocess_function`.
columns_to_drop = ["audio", "file_id", "file"]
dataset_encoded = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=10,
    num_proc=1,
    remove_columns=columns_to_drop,
)

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [16]:
# Display the encoded dataset summary to confirm which columns remain.
dataset_encoded

Dataset({
    features: ['genre', 'input_values', 'attention_mask'],
    num_rows: 54
})

In [17]:
# Rename the target column from "genre" to "label".
# Guarded so the cell can be re-run: once the column has been renamed, an
# unconditional rename_column("genre", ...) would raise.
if "genre" in dataset_encoded.column_names:
    dataset_encoded = dataset_encoded.rename_column("genre", "label")

# Start fine-tuning

In [18]:
# Genus name -> integer class id (same mapping as in the data-preparation
# section); ids are the positions in this alphabetically ordered list.
feature_map = dict(zip(
    ['Acrocephalus', 'Anthus', 'Columba', 'Corvus', 'Emberiza',
     'Motacilla', 'Passer', 'Phylloscopus', 'Pluvialis', 'Poecile',
     'Streptopelia', 'Sylvia', 'Tringa', 'Turdus'],
    range(14),
))

In [19]:
from transformers import AutoModelForAudioClassification

# Load the pretrained checkpoint with an audio-classification head.
# The number of classes is taken from `feature_map` rather than a hard-coded
# 14, so the head stays in sync with the label mapping.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=len(feature_map),
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# SECURITY: a Hugging Face access token had been pasted into this cell as a
# comment. It has been removed; treat that token as compromised and revoke it
# in the Hub account settings. Never store tokens in the notebook. Enter the
# token in the login widget below or provide it as a Colab secret (HF_TOKEN).
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from transformers import TrainingArguments

# The run name is the last path component of the checkpoint id.
model_name = model_id.rsplit("/", 1)[-1]

# Hyperparameters kept as notebook-level names.
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-birdsong",
    # optimisation schedule
    num_train_epochs=num_train_epochs,
    learning_rate=0.001,
    warmup_ratio=0.1,
    # batching / precision
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    # evaluation is disabled; a checkpoint is saved every epoch
    evaluation_strategy="no",
    save_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="accuracy",
    # logging and publishing
    logging_steps=5,
    report_to="none",
    push_to_hub=True,
)



In [22]:
# Final check of the training dataset's columns and row count before building the Trainer.
dataset_encoded

Dataset({
    features: ['label', 'input_values', 'attention_mask'],
    num_rows: 54
})

In [23]:
from transformers import Trainer

# Build the trainer from the model, the training arguments and the encoded
# training set (no evaluation set is supplied), then start training.
# NOTE(review): the recorded output shows construction failing with a
# 403 from the Hub while `push_to_hub=True`, so the login token needs
# write access to the target namespace. Confirm before re-running.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded,
    tokenizer=feature_extractor,
)

trainer.train()

  trainer = Trainer(


HfHubHTTPError: (Request ID: Root=1-67d5688b-752704527db791ef3a1f15be;3f5453a7-7e87-4bee-bd0b-fb7f48f9e778)

403 Forbidden: You don't have the rights to create a model under the namespace "Nano2527".
Cannot access content at: https://huggingface.co/api/repos/create.
Make sure your token has the correct permissions.

# Push to the Hub

In [24]:
# Metadata forwarded to `trainer.push_to_hub` in the next cell.
kwargs = dict(
    dataset_tags="birdsong",
    dataset="BirdSong",
    model_name=f"{model_name}-finetuned-birdsong",
    finetuned_from=model_id,
    tasks="bird-song-classification",
)

In [28]:
# Push through the trainer, forwarding the metadata collected in `kwargs`.
# Requires `trainer` to exist in the current kernel, i.e. the training cell
# must have completed in this session; otherwise this raises NameError.
trainer.push_to_hub(**kwargs)

NameError: name 'trainer' is not defined

In [25]:
from transformers import pipeline

# Load a fine-tuned checkpoint from the Hub as an audio-classification pipeline.
# NOTE(review): this repo id is under a different namespace than the one in the
# failed push above. Confirm it is the intended model.
pipe = pipeline(
    task="audio-classification",
    model="Jamvess/distilhubert-finetuned-birdsong",
)

Device set to use cuda:0


In [26]:
# Look at one labelled training example (its `genre` field holds the class id)
# to compare against the pipeline prediction in the next cell.
dataset[2]

{'file_id': 125777,
 'genre': 4,
 'file': '/content/Data_files/train/xc125777.flac',
 'audio': {'path': '/content/Data_files/train/xc125777.flac',
  'array': array([-1.95336057e-07,  1.85430125e-07, -1.65784499e-07, ...,
          1.25919385e-02,  9.13696364e-03,  9.52405669e-03]),
  'sampling_rate': 16000}}

In [27]:
# Sanity check: classify the same training example from its raw waveform.
# The result is a list of {'score', 'label'} dicts with labels of the form 'LABEL_<id>'.
pipe(dataset[2]['audio']['array'])

[{'score': 0.9650967121124268, 'label': 'LABEL_4'},
 {'score': 0.01428588293492794, 'label': 'LABEL_7'},
 {'score': 0.01092238537967205, 'label': 'LABEL_1'},
 {'score': 0.0025958570186048746, 'label': 'LABEL_0'},
 {'score': 0.0016002139309421182, 'label': 'LABEL_9'}]

# Predict on the test set

In [28]:
import pandas as pd

In [29]:
# Load the test index, derive each recording's FLAC path from its file id and
# drop the `genus` column, which is filled with predictions further down.
test_df = (
    pd.read_csv('/content/test.csv')
    .assign(
        file_path=lambda frame: '/content/Data_files/test/xc' + frame['file_id'].astype(str) + '.flac'
    )
    .drop(columns='genus')
)
test_df.head()

Unnamed: 0,file_id,file_path
0,27145,/content/Data_files/test/xc27145.flac
1,101862,/content/Data_files/test/xc101862.flac
2,75092,/content/Data_files/test/xc75092.flac
3,42224,/content/Data_files/test/xc42224.flac
4,101940,/content/Data_files/test/xc101940.flac


In [30]:
import datasets
from datasets import Dataset
from datasets import Audio

In [31]:
# Build the test Dataset the same way as the training one: rename
# file_path -> file, then add an "audio" column from the paths and cast it to
# the Audio feature.
test_dataset = Dataset.from_pandas(test_df).rename_columns({'file_path': 'file'})
test_dataset = test_dataset.add_column("audio", test_dataset["file"]).cast_column("audio", Audio())

In [32]:
# Decode the test audio at the sampling rate of the inference pipeline's own
# feature extractor instead of a hard-coded 16000, so the waveforms passed to
# `pipe` always match what the loaded model expects.
test_dataset = test_dataset.cast_column(
    "audio", Audio(sampling_rate=pipe.feature_extractor.sampling_rate)
)

In [33]:
# Inspect the first test example; `sample` is used by the single-prediction
# cell that follows.
sample = test_dataset[0]
sample

{'file_id': 27145,
 'file': '/content/Data_files/test/xc27145.flac',
 'audio': {'path': '/content/Data_files/test/xc27145.flac',
  'array': array([-3.04912930e-07,  2.72263151e-07, -2.24238363e-07, ...,
         -3.35667355e-05,  3.12080056e-06, -1.26919958e-05]),
  'sampling_rate': 16000}}

In [34]:
# Classify one test waveform to check the pipeline output format before
# looping over the whole test set.
pipe(sample['audio']['array'])

[{'score': 0.9817489385604858, 'label': 'LABEL_3'},
 {'score': 0.004519849084317684, 'label': 'LABEL_2'},
 {'score': 0.0037799146957695484, 'label': 'LABEL_12'},
 {'score': 0.003193098120391369, 'label': 'LABEL_5'},
 {'score': 0.0014414542820304632, 'label': 'LABEL_8'}]

In [35]:
from tqdm import tqdm

# Collect the top-scoring label string (e.g. 'LABEL_3') for every test recording.
ans_label = []
for example in tqdm(test_dataset):
    audio = example['audio']
    # Pass the sampling rate along with the waveform so the pipeline does not
    # have to assume the array is already at the model's rate.
    top_prediction = pipe({'raw': audio['array'], 'sampling_rate': audio['sampling_rate']})[0]
    ans_label.append(top_prediction['label'])

 15%|█▍        | 8/54 [00:03<00:17,  2.66it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 54/54 [00:24<00:00,  2.24it/s]


In [36]:
# Keep only the numeric suffix of each predicted label ('LABEL_3' -> '3').
# `str(...)` with `rsplit(..., 1)[-1]` makes the cell safe to re-run:
# `split('_')[1]` raises IndexError on an already-stripped '3' and
# AttributeError once the next cell has converted the values to int.
ans_label = [str(label).rsplit('_', 1)[-1] for label in ans_label]

In [37]:
# Convert the class ids to integers so they can be looked up in the id -> genus map.
ans_label = list(map(int, ans_label))
print(ans_label)

In [38]:
# Attach the predicted class ids to the test table. This relies on `ans_label`
# having been collected in the same row order as `test_df`.
test_df['genus'] = ans_label

In [39]:
# Integer class id -> genus name (the inverse of the name -> id mapping used
# for training); ids are the positions in this list.
fm = dict(enumerate([
    'Acrocephalus', 'Anthus', 'Columba', 'Corvus', 'Emberiza',
    'Motacilla', 'Passer', 'Phylloscopus', 'Pluvialis', 'Poecile',
    'Streptopelia', 'Sylvia', 'Tringa', 'Turdus',
]))

In [40]:
# Shape the table for submission: drop the helper path column and translate
# the predicted class ids back to genus names via `fm`.
test_df = (
    test_df
    .drop(columns='file_path')
    .assign(genus=lambda frame: frame['genus'].map(fm))
)

In [41]:
# Preview the submission table (file_id and predicted genus name).
test_df.head()

Unnamed: 0,file_id,genus
0,27145,Corvus
1,101862,Poecile
2,75092,Streptopelia
3,42224,Columba
4,101940,Streptopelia


In [42]:
# Write the submission file to the current working directory, without the
# DataFrame index column.
test_df.to_csv('signal_submit_2.csv', index=False)