In [None]:
!pip install tf-models-official==2.15
!pip install keras==2.15
!pip install tensorflow-addons==0.23.0
!pip install vit-keras

In [7]:
import numpy as np
import pandas as pd
import random
import os
from joblib import Parallel, delayed
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import tensorflow as tf
from tensorflow_addons.optimizers import RectifiedAdam
from tensorflow.keras.layers import Conv2D, UpSampling2D, Input, TimeDistributed, Dropout, BatchNormalization, Dense
from tensorflow.keras.models import Model
from vit_keras import vit
import time

In [3]:
def extract_audio(filename, sr=16000):
    filepath = '/kaggle/input/birdclef-2024/train_audio/' + filename
    audio, _ = librosa.load(filepath, sr=sr)
    return audio.astype(np.float16)

def pad_array(array):
    target_dim = 98
    current_dim = array.shape[0]
    if current_dim < target_dim:
        padding_amount = target_dim - current_dim
        padding_shape = (padding_amount,) + array.shape[1:]
        padding_array = np.zeros(padding_shape, dtype=array.dtype)
        padded_array = np.concatenate((array, padding_array), axis=0)
        if len(padded_array.shape)==1: print(array.shape, padding_array.shape)
        return padded_array.astype(np.float16)
    else:
        truncated_array = array[:target_dim, ...]
        return truncated_array.astype(np.float16)

def augment_data(audio_array, sr, n_fft):
    audio_array = librosa.effects.pitch_shift(y=audio_array.astype(np.float32), sr=sr,
                                              n_steps=np.random.uniform(low=.970, high=1.03),
                                              n_fft=n_fft)
    audio_array = librosa.effects.time_stretch(y=audio_array,
                                               rate=np.random.uniform(low=.975, high=1.025)) #n_fft=16384
    noise_amp = np.random.normal(loc=0.0, scale=0.005)*np.amax(audio_array)
    audio_array = audio_array + noise_amp*np.random.normal(size=audio_array.shape[0])
    shift_range = int(np.random.uniform(low=-5, high=5)*sr*10)
    audio_array = np.roll(audio_array, shift_range)
    return audio_array.astype(np.float16)

# Function to generate augmented data
def generate_augmented_data(df, column, label_column, threshold, sr=16000, n_fft=2048):
    augmented_data = []
    for label, count in df[label_column].value_counts().items():
        if count < threshold:
            # Select rows with this label
            num_needed = threshold-count
            label_df = df[df[label_column] == label]
            num_samples = min(num_needed, len(label_df))
            label_df = label_df.sample(n= num_samples,
                                        random_state=random.randint(0, 10000))

            for _, row in label_df.iterrows():
                npy_array = row[column]
                augmented_npy = augment_data(npy_array, sr, n_fft)
                augmented_row = row.copy()
                augmented_row[column] = augmented_npy
                augmented_row['augmented'] = True
                augmented_data.append(augmented_row)
    # Append the augmented data to the original dataframe
    augmented_df = pd.DataFrame(augmented_data)
    result_df = pd.concat([df, augmented_df], ignore_index=True)
    return result_df

def trim_df_by_label(df, label_column, threshold):
    # Create an empty DataFrame to store the results
    trimmed_df = pd.DataFrame(columns=df.columns)
    # Group by the label column
    grouped = df.groupby(label_column)
    for label, group in grouped:
        if len(group) > threshold:
            sampled_group = group.sample(n=threshold, random_state=1)
        else:
            sampled_group = group
        
        trimmed_df = pd.concat([trimmed_df, sampled_group])
    return trimmed_df

def extract_framed(audio):
    """
    Load audio files from the given DataFrame, extract framed audios,
    and add the framed audios to the given DataFrame

    Parameters:
      dataframe (pd.DataFrame): DataFrame containing audio file information.
      augment (bool): Whether to apply augmentation to the audio.
    """
    window_size_s=2.5
    hop_size_s=2.5
    sr=16000
    frame_length = int(window_size_s * sr)
    frame_step = int(hop_size_s * sr)
    return tf.cast(tf.signal.frame(audio, frame_length, frame_step, pad_end=True), tf.float16)

def compute_mfcc(frame, sr, n_mfcc, hop_length):
    return librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)

def compute_rms(frame, hop_length):
    return librosa.feature.rms(y=frame, hop_length=hop_length)

def extract_features(row):
    n_mfcc=33
    sr=16000
    hop_length=626
    frames = row['npy'].numpy().astype(np.float32)
    mfcc = Parallel(n_jobs=-1, backend='threading')(delayed(compute_mfcc)(frame, sr, n_mfcc, hop_length) for frame in frames)
    mfcc = np.array(mfcc).astype(np.float16)
    rms = Parallel(n_jobs=-1, backend='threading')(delayed(compute_rms)(frame, hop_length) for frame in frames)
    rms = np.array(rms).astype(np.float16)
    row['npy'] = None
    return pd.Series({'mfcc': mfcc, 'rms': rms})

# Function to normalize the entire column data using Min-Max Normalization
def normalize_column(data, scaler):
    stacked = np.concatenate(data)
    original_shape = stacked.shape
    reshaped_stacked = stacked.reshape(-1, original_shape[-1])
    normalized_reshaped = scaler.fit_transform(reshaped_stacked)
    normalized_stacked = normalized_reshaped.reshape(original_shape).astype(np.float16)
    split_indices = np.cumsum([arr.shape[0] for arr in data[:-1]])
    normalized_data = np.split(normalized_stacked, split_indices)
    return normalized_data

# Function to concatenate numpy arrays along a specified axis and replace original values with None
def concat_and_replace(row, axis=0):
    reshaped_long = np.full(row['rms'].shape, row['longitude'])
    reshaped_lat = np.full(row['rms'].shape, row['latitude'])
    concatenated_data = np.concatenate((row['rms'], reshaped_long), axis=-2)
    concatenated_data = np.concatenate((concatenated_data, reshaped_lat), axis=-2)
    concatenated_data = np.concatenate((row['mfcc'], concatenated_data), axis=-2)
    concatenated_data = np.transpose(concatenated_data, (0, 2, 1))
    concatenated_data = np.expand_dims(concatenated_data, axis=-1)
    row['rms'] = None
    row['mfcc'] = None
    row['longitude'] = None
    row['latitude'] = None    
    return concatenated_data.astype(np.float16)

def birds_stratified_split(df, target_col, test_size=0.2):
    class_counts = df[target_col].value_counts()
    low_count_classes = class_counts[class_counts < 2].index.tolist() ### Birds with single counts
    df['train'] = df[target_col].isin(low_count_classes)
    train_df, test_df = train_test_split(df[~df['train']], test_size=test_size, stratify=df[~df['train']][target_col], random_state=42)
    train_df = pd.concat([train_df, df[df['train']]], axis=0).reset_index(drop=True)
    # Remove the 'valid' column
    train_df.drop('train', axis=1, inplace=True)
    test_df.drop('train', axis=1, inplace=True)

    return train_df, test_df

In [4]:
ViT = vit.vit_b16(
    image_size=224,
    activation='sigmoid',
    pretrained=True,
    include_top=True,
    pretrained_top=True,
    classes=182
)

input_shape = (98, 64, 36, 1)
inputs = Input(shape=(input_shape))
batch_size = 20
x = TimeDistributed(Conv2D(30, (9, 9), padding='valid', activation='relu', strides=(2, 1)))(inputs)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
x = TimeDistributed(Conv2D(6, (13, 13), padding='valid', activation='tanh', strides=(1, 1)))(x)
x = Dropout(0.2)(x)
x = tf.keras.layers.Reshape((224, 224, 3))(x)
x = ViT(x)
x = tf.keras.layers.Dense(384, activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dense(79, activation = 'softmax')(x)
model = Model(inputs=inputs, outputs=x)
model.summary()



Downloading data from https://github.com/faustomorales/vit-keras/releases/download/dl/ViT-B_16_imagenet21k+imagenet2012.npz




Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 98, 64, 36, 1)]   0         
                                                                 
 time_distributed (TimeDist  (None, 98, 28, 28, 30)    2460      
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 98, 28, 28, 30)    0         
                                                                 
 batch_normalization (Batch  (None, 98, 28, 28, 30)    120       
 Normalization)                                                  
                                                                 
 time_distributed_1 (TimeDi  (None, 98, 16, 16, 6)     30426     
 stributed)                                                      
                                                             

In [None]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

# # instantiate a distribution strategy
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tpu_strategy = tf.distribute.TPUStrategy(tpu)

# # instantiating the model in the strategy scope creates the model on the TPU
# with tpu_strategy.scope():
#     model = tf.keras.Sequential( … ) # define your model normally
#     model.compile( … )

# # train model normally
# model.fit(training_dataset, epochs=EPOCHS, steps_per_epoch=…)

In [5]:
filtered_df = pd.read_csv('/kaggle/input/filtered-df-final/filtered_df_final.csv')
print(filtered_df['primary_label'].nunique())
print(len(filtered_df))

79
18295


In [3]:
# value_counts = filtered_df['primary_label'].value_counts()
# filtered_categories = value_counts[value_counts > 80].index
# filtered_df = filtered_df[filtered_df['primary_label'].isin(filtered_categories)]
# filtered_df = filtered_df.sample(frac=1).reset_index(drop=True)

# print(filtered_df['primary_label'].nunique())
# print(len(filtered_df))

# filtered_df.to_csv('/kaggle/working/filtered_df_final.csv')

79
18295


In [None]:
# filtered_df = filtered_df.sample(frac=1).reset_index(drop=True)

######################################################################
# interested_species = ['grewar3', 'asikoe2', 'commyn', 'rorpar'] #'blrwar1', 'woosan', 'grnsan'
# filtered_df = filtered_df[filtered_df['primary_label'].isin(interested_species)]

df = filtered_df[['primary_label', 'latitude', 'longitude', 'filename']]
print(f"num of species: {df['primary_label'].nunique()}")
initial_length = len(df)
label_counts = df['primary_label'].value_counts().to_dict()
new_df = pd.DataFrame(columns=[df.columns])

learning_rate = 1e-4
optimizer = RectifiedAdam(lr=learning_rate)
# optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping_callbacks = tf.keras.callbacks.EarlyStopping(patience = 20, restore_best_weights = True, verbose = 1)

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=False)
# Fit and transform the labels
_ = encoder.fit_transform(df['primary_label'].values.reshape(-1, 1))
long_lat_scaler = MinMaxScaler(feature_range=(-1, 1))
_ = long_lat_scaler.fit_transform(df['longitude'].values.reshape(-1, 1))

# Path to save models
output_dir = '/kaggle/working/saved_models/'
os.makedirs(output_dir, exist_ok=True)

train_count = 0
# While current DataFrame length is greater than a certain value
while len(df) > initial_length * 0.005:  # For example, continue until half of the initial length
    start_time = time.time()  # Record start time
    print(f"trained data count: {train_count}")
    print(f"progress through data: {(train_count/initial_length)*100}%")
    new_df = df.sample(frac=1).reset_index(drop=True)
    new_df = pd.DataFrame(columns=df.columns)
    # Iterate through each key in the stored frequency dictionary
    for label, count in label_counts.items():
        # Extract 2% of the data with that value
        sample_size = max(int(count * 0.02), 1)
        extracted_data = df[df['primary_label'] == label].head(sample_size)
        df = df.drop(extracted_data.index)
        new_df = pd.concat([new_df, extracted_data])
    # Keep count of trained data
    train_count += len(new_df)
    # Load, Augment, Pad and Trim Audio Data
    new_df['npy'] = new_df['filename'].apply(extract_audio)
    print(f"extracted npy")
    new_df['augmented'] = False
    threshold = min(max(new_df['primary_label'].value_counts().values.tolist()), 6)
    print(f"Threshold: {threshold}")
    while min(new_df['primary_label'].value_counts().values.tolist())<threshold:
        new_df = generate_augmented_data(new_df, 'npy', 'primary_label', threshold, sr=16000, n_fft=2048)
    new_df = trim_df_by_label(new_df, 'primary_label', threshold)
    print(f"augmented npy")
    new_df['npy'] = new_df['npy'].apply(extract_framed)
    new_df[['mfcc', 'rms']] = new_df.apply(lambda row: extract_features(row), axis=1)
    
    new_df['mfcc'] = new_df['mfcc'].apply(pad_array)
    new_df['rms'] = new_df['rms'].apply(pad_array)
    print(f"extracted mfcc and rms")
    new_df['mfcc'] = normalize_column(new_df['mfcc'].values, MinMaxScaler(feature_range=(-1, 1)))
    new_df['rms'] = normalize_column(new_df['rms'].values, MinMaxScaler(feature_range=(-1, 1)))
    new_df['longitude'] = long_lat_scaler.transform(new_df['longitude'].values.reshape(-1, 1)).astype(np.float16)
    new_df['latitude'] = long_lat_scaler.transform(new_df['latitude'].values.reshape(-1, 1)).astype(np.float16)
    new_df['features'] = new_df.apply(lambda row: concat_and_replace(row, axis=1), axis=1)
    new_df = new_df[['primary_label', 'features']]
    print(f"ready to train")
    # Split Stratified Train and Test Sets
    print(f"Preprocessed data entries for training: {len(new_df)}")
    train_df, test_df = birds_stratified_split(new_df, 'primary_label', 0.3)
    # Fit and transform the labels
    train_labels = encoder.transform(train_df['primary_label'].values.reshape(-1, 1)).astype(np.float16)
    test_labels = encoder.transform(test_df['primary_label'].values.reshape(-1, 1)).astype(np.float16)
    # Convert features to numpy arrays
    train_features = np.stack(train_df['features'].values)
    test_features = np.stack(test_df['features'].values)
    train_features = np.nan_to_num(train_features, nan=0)
    test_features = np.nan_to_num(test_features, nan=0)
    # Train The Model
    history = model.fit(train_features, train_labels, epochs=40, batch_size=28, validation_data=(test_features, test_labels), callbacks = early_stopping_callbacks)
    # Save the model after each chunk
    model_save_path = os.path.join(output_dir, 'bird_clef_24_model.h5')
    model.save(model_save_path)
    print(f'Saved model to {model_save_path}')
    end_time = time.time()  # Record end time
    print(f'Time taken for prev iteration: {(end_time - start_time)/60} mins')

num of species: 79
trained data count: 0
progress through data: 0.0%


  super().__init__(name, **kwargs)
  new_df = pd.concat([new_df, extracted_data])


extracted npy
Threshold: 6


  trimmed_df = pd.concat([trimmed_df, sampled_group])


augmented npy
extracted mfcc and rms
ready to train
Preprocessed data entries for training: 474
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 32: early stopping


  saving_api.save_model(


Saved model to /kaggle/working/saved_models/bird_clef_24_model.h5
Time taken for prev iteration: 11.633314502239227 mins
trained data count: 326
progress through data: 1.7819076250341621%


  new_df = pd.concat([new_df, extracted_data])


extracted npy
Threshold: 6


  trimmed_df = pd.concat([trimmed_df, sampled_group])


augmented npy
