In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd ./gdrive/MyDrive/ML-KTH/birdclef-2023
!ls

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/ML-KTH/birdclef-2023
Bird.ipynb		  sample_submission.csv  test_soundscapes  train_metadata.csv
eBird_Taxonomy_v2021.csv  test			 train_audio	   wav2vec2-base-finetuned-ks


In [None]:
!pip install evaluate
!pip install -U accelerate
!pip install -U transformers

In [4]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'
# Root directory containing bird sound folders
root_dir = './test'

# Get folder names (bird types)
bird_types = os.listdir(root_dir)
bird_types.sort()  # Ensure consistent order for label assignment
bird_types = bird_types[1:]
# Initialize lists for file paths and labels
audio_files = []
numeric_labels = []

# Label encoder - map bird type to a numeric value
label2id = {bird: idx for idx, bird in enumerate(bird_types)}
print(bird_types)
# Traverse each directory and collect file paths and labels
for bird_type in bird_types:
    bird_folder = os.path.join(root_dir, bird_type)
    for file in os.listdir(bird_folder):
        if file.endswith('.ogg'):
            file_path = os.path.join(bird_folder, file)
            audio_files.append(file_path)
            numeric_labels.append(label2id[bird_type])

import numpy as np

from torch.utils.data import Dataset, DataLoader
import torch
import librosa
def preprocess_function(example, feature_extractor, max_duration=15.0):
    audio_array = example["audio"]["array"]
    inputs = feature_extractor(
        audio_array,
        sampling_rate=16000,
        max_length=int(16000 * max_duration),
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    return inputs

def pad_audio_array(audio_array, target_length):
    # Calculate the number of zeros to add
    padding_length = target_length - audio_array.shape[0]
    if padding_length > 0:
        # Pad with zeros if the array is shorter than the target length
        padded_array = np.pad(audio_array, (0, padding_length), mode='constant')
    else:
        # Truncate the array if it is longer than the target length
        padded_array = audio_array[:target_length]
    return padded_array

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # 'batch' is a list of tuples with (input_values, label)
    input_values = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Stack input values and labels into tensors
    input_values_tensor = torch.stack(input_values)
    labels_tensor = torch.tensor(labels)

    return input_values_tensor.squeeze(1), labels_tensor

class CustomDataset(Dataset):
    def __init__(self, file_paths, labels, label2id, id2label, feature_extractor, target_sampling_rate=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.label2id = label2id
        self.id2label = id2label
        self.target_sampling_rate = target_sampling_rate
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        file_name = file_path.split('/')[-2]
        label = self.label2id[file_name]

        # Load and resample the audio file
        audio_data, sampling_rate = librosa.load(file_path, sr=self.target_sampling_rate)
        audio_data_padded = pad_audio_array(audio_data, 120000)
        example = {"audio": {"array": audio_data_padded, "sampling_rate": self.target_sampling_rate}}
        processed_example = preprocess_function(example, self.feature_extractor)

        # label = self.label2id[self.labels[idx]]
        return processed_example["input_values"], label


labels = bird_types
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}


from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor
from transformers import AutoFeatureExtractor, ASTForAudioClassification
# model_checkpoint = "facebook/wav2vec2-base"
# feature_extractor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

# Assuming audio_files and numeric_labels are already defined
train_files, test_files, train_labels, test_labels = train_test_split(audio_files, numeric_labels, test_size=0.2, random_state=42)
val_files, test_files, val_labels, test_labels = train_test_split(test_files, test_labels, test_size=0.5, random_state=42)

train_dataset = CustomDataset(train_files, train_labels, label2id, id2label, feature_extractor)
val_dataset = CustomDataset(val_files, val_labels, label2id, id2label, feature_extractor)
test_dataset = CustomDataset(test_files, test_labels, label2id, id2label, feature_extractor)

custom_datasets = {"train": train_dataset, "validation": val_dataset, "test": test_dataset}


batch_size = 4
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# from datasets import load_dataset, load_metric
# metric = load_metric("accuracy")
import evaluate
metric = evaluate.load("accuracy")
dataset = custom_datasets
print(dataset['train'])

from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
# model = AutoModelForAudioClassification.from_pretrained(
#     model_checkpoint,
#     num_labels=num_labels,
#     label2id=label2id,
#     id2label=id2label,
# )
model = ASTForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)
import torch.nn as nn

model.classifier = nn.Sequential(
    nn.Linear(768, 20),
    nn.LogSoftmax(dim=1)
)

model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

import numpy as np

# def compute_metrics(eval_pred):
#     """Computes accuracy on a batch of predictions"""
#     predictions = np.argmax(eval_pred.predictions, axis=1)
#     return metric.compute(predictions=predictions, references=eval_pred.label_ids)

def compute_metrics(pred_logits, true_labels):
    preds = np.argmax(pred_logits, axis=1)
    return metric.compute(predictions=preds, references=true_labels)



['blakit1', 'cohmar1', 'colsun2', 'combul2', 'combuz1', 'comsan', 'eaywag1', 'eubeat1', 'gnbcam2', 'greegr', 'hoopoe', 'litegr', 'rbsrob1', 'rerswa1', 'somgre1', 'thrnig1', 'wbrcha2', 'wlwwar', 'woosan']
<__main__.CustomDataset object at 0x7f608431a380>


Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([19]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([19, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
len(train_loader)

1403

In [None]:
def save_checkpoint(model, optimizer, epoch, filename="checkpoint.pth"):
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, filename)



In [8]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
optimizer = AdamW(model.parameters(), lr=3e-5)
criterion = torch.nn.CrossEntropyLoss()  # For classification task

num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    num = 0
    for batch in train_loader:
        optimizer.zero_grad()  # Clear existing gradients

        inputs, labels = batch
        inputs = inputs.squeeze(1)
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)  # Forward pass
        logits = outputs.logits
        loss = criterion(logits, labels)  # Compute loss

        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        num += 1
        print(num)
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

    # Validation step
    model.eval()  # Set model to evaluation mode
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in val_loader:
        with torch.no_grad():
            inputs, labels = batch
            inputs = inputs.squeeze(1)
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            logits = outputs.logits
            loss = criterion(logits, labels)

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += compute_metrics(logits,label_ids)["accuracy"]
            # print(compute_metrics(logits,label_ids))
        # break


    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    avg_val_loss = total_eval_loss / len(val_loader)

    print(f"Validation Loss: {avg_val_loss}, Accuracy: {avg_val_accuracy}")


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [7]:
torch.cuda.empty_cache()

In [21]:
!apt install psmisc

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
psmisc is already the newest version (23.4-2build3).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [22]:
!sudo fuser /dev/nvidia*

/dev/nvidia0:         6050m
/dev/nvidiactl:       6050m
/dev/nvidia-uvm:      6050m


In [None]:
!kill -9 6050
!nvidia-smi

In [13]:

trainer = Trainer(
    model,
    args,
    train_dataset=train_loader,
    eval_dataset=val_loader,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)


In [None]:
import os
import shutil
from pathlib import Path

def get_project_count(folder_path):
    """ Count the number of projects (files and sub-folders) in a folder. """
    return len([name for name in os.listdir(folder_path)])

def copy_top_folders(source_path, destination_path, top_n=20):
    # Ensure the destination path exists
    Path(destination_path).mkdir(parents=True, exist_ok=True)

    # List all folders in the source path
    folders = [os.path.join(source_path, d) for d in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, d))]

    # Count projects in each folder and sort them
    folder_counts = [(folder, get_project_count(folder)) for folder in folders]
    sorted_folders = sorted(folder_counts, key=lambda x: x[1], reverse=True)

    # Copy the top N folders
    for folder, count in sorted_folders[:top_n]:
        destination = os.path.join(destination_path, os.path.basename(folder))
        shutil.copytree(folder, destination)
        print(f"Copied {folder} to {destination}")

# Example usage
source_path = './train_audio'
destination_path = './test'
copy_top_folders(source_path, destination_path)


In [None]:
import os

# Root directory containing bird sound folders
root_dir = './test'

# Get folder names (bird types)
bird_types = os.listdir(root_dir)
bird_types.sort()  # Ensure consistent order for label assignment

# Initialize lists for file paths and labels
audio_files = []
numeric_labels = []

# Label encoder - map bird type to a numeric value
label2id = {bird: idx for idx, bird in enumerate(bird_types)}

# Traverse each directory and collect file paths and labels
for bird_type in bird_types:
    bird_folder = os.path.join(root_dir, bird_type)
    for file in os.listdir(bird_folder):
        if file.endswith('.ogg'):
            file_path = os.path.join(bird_folder, file)
            audio_files.append(file_path)
            numeric_labels.append(label2id[bird_type])

In [None]:
def preprocess_function(example, feature_extractor, max_duration=15.0):
    audio_array = example["audio"]["array"]
    inputs = feature_extractor(
        audio_array,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    return inputs


In [None]:
import numpy as np

from torch.utils.data import Dataset, DataLoader
import torch
import librosa


class CustomDataset(Dataset):
    def __init__(self, file_paths, labels, label2id, id2label, feature_extractor, target_sampling_rate=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.label2id = label2id
        self.id2label = id2label
        self.target_sampling_rate = target_sampling_rate
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        file_name = file_path.split('/')[-2]
        label = self.label2id[file_name]

        # Load and resample the audio file
        audio_data, sampling_rate = librosa.load(file_path, sr=self.target_sampling_rate)
        example = {"audio": {"array": audio_data, "sampling_rate": self.target_sampling_rate}}
        processed_example = preprocess_function(example, self.feature_extractor)
        # label = self.label2id[self.labels[idx]]
        return processed_example, label


labels = bird_types
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}


from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor

feature_extractor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
# Assuming audio_files and numeric_labels are already defined
train_files, test_files, train_labels, test_labels = train_test_split(audio_files, numeric_labels, test_size=0.2, random_state=42)
val_files, test_files, val_labels, test_labels = train_test_split(test_files, test_labels, test_size=0.5, random_state=42)

train_dataset = CustomDataset(train_files, train_labels, label2id, id2label, feature_extractor)
val_dataset = CustomDataset(val_files, val_labels, label2id, id2label, feature_extractor)
test_dataset = CustomDataset(test_files, test_labels, label2id, id2label, feature_extractor)

custom_datasets = {"train": train_dataset, "validation": val_dataset, "test": test_dataset}



model_checkpoint = "facebook/wav2vec2-base"
batch_size = 4
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# from datasets import load_dataset, load_metric
# metric = load_metric("accuracy")
import evaluate
metric = evaluate.load("accuracy")
dataset = custom_datasets
print(dataset['train'])


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

<__main__.CustomDataset object at 0x7fb971473160>


In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'projector.weight', 'classifier.weight', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

ImportError: ignored