In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa as lb
import librosa.display
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers,models,Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import tensorflow as tf
import os
import gc
from PIL import Image as PIL_Image
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**For the audio classification**
<br>
1) We take audio create 3 channeled image data for the audio
2) Just like how we take rgb in the image we need 3 channels for the audio it would be<br>
       i) Fourier transformation (more of frequency over time)stft<br>
       ii) meft (image displaying frequencey that we humnas can listen to)<br>
       iii)chroma(smaller pitches and notes)

In [2]:

def format_shape(data, target_height=128, target_width=1000):
    data = np.asarray(data, dtype=np.float32)

    
    # 1D -> 2D
    if data.ndim == 1:
        data = data[np.newaxis, :]

    # Min-max normalization
    data=(255 * (data - np.min(data)) / (np.max(data) - np.min(data))).astype(np.uint8)

    # Resize rows to target_height
    rows, cols = data.shape
    if rows < target_height:
        reps = int(np.ceil(target_height / rows))
        data = np.tile(data, (reps, 1))[:target_height, :]
    elif rows > target_height:
        data = data[:target_height, :]

    # Resize columns to target_width
    rows, cols = data.shape
    if cols < target_width:
        pad_width = target_width - cols
        data = np.pad(data, ((0,0), (0, pad_width)), mode="constant")
    elif cols > target_width:
        data = data[:, :target_width]

    return data.astype(np.float32)


In [4]:
#this function will be responsible to convert all the audios into images
def audio_to_image(file=None,max_size=1000,y=None,sr=22050):
    #loading up the image
    if not file is None:
        if y is None:
            y,sr=librosa.load(file,sr=22050)
    y = np.asarray(y, dtype=np.float32)

    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)

    #next channel which mfcc is which is audio graph that is audible to human
    mels_db = librosa.power_to_db(mels, ref=np.max)
    

    
    mels_delta = librosa.feature.delta(mels_db)
    
    mels_delta2 = librosa.feature.delta(mels_db, order=2)

    def normalize(X):
        x_min, x_max = X.min(), X.max()
        if x_max - x_min > 0:
            return (255 * (X - x_min) / (x_max - x_min)).astype(np.uint8)
        return np.zeros_like(X, dtype=np.uint8)
        
    layer0 = format_shape(mels_db)     
    layer1 = format_shape(mels_delta)  
    layer2 = format_shape(mels_delta2)

    #this makes cube by taking 3 images and stacking on top of each other
    final_image = np.dstack([layer0, layer1, layer2]).astype(np.float32)
    return final_image,y,sr
    

In [5]:
def convert_dir_to_audio(file_path, output_dir):
    class_names = [d for d in os.listdir(file_path) if os.path.isdir(os.path.join(file_path, d))]

    for i, class_name in enumerate(class_names):
        class_input_path = os.path.join(file_path, class_name)
        save_folder = os.path.join(output_dir, class_name)
        os.makedirs(save_folder, exist_ok=True)
        
        print(f"--- Processing Class: {class_name} ---")
        
        for file_name in os.listdir(class_input_path):
            if file_name.endswith((".wav", ".m4a")):
                full_file_path = os.path.join(class_input_path, file_name)

                try:
                    # 1. Process Original
                    # Note: We use 'pixel_data' everywhere now
                    pixel_data, y, sr = audio_to_image(full_file_path, y=None)
                    
                    # Save Original
                    im = PIL_Image.fromarray(pixel_data.astype(np.uint8))
                    im.save(os.path.join(save_folder, f"{file_name}_orig.png"))
                    
                    # 2. Process Augmentations
                    for j in range(3): 
                        y_aug = augment_audio(y, sr)
                        aug_pixel_data, _, _ = audio_to_image(file=None, y=y_aug, sr=sr)
                        
                        im_aug = PIL_Image.fromarray(aug_pixel_data.astype(np.uint8))
                        # Use f-string to safely combine string and number j
                        im_aug.save(os.path.join(save_folder, f"{file_name}_aug_{j}.png"))
                        
                        del y_aug, aug_pixel_data 

                    # Cleanup the original data for this file
                    del pixel_data, y
                    
                except Exception as e:
                    print(f"Skipping {file_name}: {e}")
        
        gc.collect()

    return class_names

In [6]:
def augment_audio(y,sr):
    if np.random.random()>0.5:
        y=librosa.effects.pitch_shift(y,sr=sr,n_steps=np.random.uniform(-2,2))
        y = np.asarray(y, dtype=np.float32)
    noise_amp=0.005*np.random.uniform()*np.amax(y)
    noise_amp = 0.005 * np.random.uniform() * np.amax(np.abs(y)) if y.size > 0 else 0.0
    y = y + noise_amp * np.random.normal(size=y.shape).astype(np.float32)
    return y.astype(np.float32)

In [7]:
def split_data(
    x,
    y,
    test_size=0.2,
    val_size=0.5,
    random_state=42,
    stratify=True
):
    x = np.array(x)
    y = np.array(y)

    strat = y if stratify else None

    # Train / temp split
    x_train, x_temp, y_train, y_temp = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=strat
    )

    # Val / test split
    x_val, x_test, y_val, y_test = train_test_split(
        x_temp,
        y_temp,
        test_size=val_size,
        random_state=random_state,
        stratify=(y_temp if stratify else None)
    )

    return x_train, y_train, x_val, y_val, x_test, y_test

In [8]:
def for_single_audio(file):
    img,y,sr=audio_to_image(file)
    img_array=np.expand_dims(img_array, axis=0)

In [9]:
input="/kaggle/input/forest-sound-dataset/forestdataset"
output="/kaggle/working/processed"
class_names=convert_dir_to_audio(input,output)



--- Processing Class: logging ---
--- Processing Class: poaching ---
--- Processing Class: natural sound ---
--- Processing Class: fire ---


  y,sr=librosa.load(file,sr=22050)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [13]:
import shutil

# Zip the folder for easy download
shutil.make_archive("/kaggle/working/forest_processed", 'zip', "/kaggle/working/processed")

'/kaggle/working/forest_processed.zip'

In [42]:
from tensorflow.keras.utils import image_dataset_from_directory, load_img, img_to_array
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

In [43]:
#setting up important var for loading images later on
BATCH_SIZE=32
IMG_SIZE=(224,224)
SEED=42
EXTRACT_PATH="/kaggle/input/forest-sound-spectograph/forest_processed"
selected_class=["natural sound","unnatural"]

In [18]:
#loading up images and dividing them for the neural net
train_data=tf.keras.utils.image_dataset_from_directory(
    EXTRACT_PATH,
    class_names=selected_class,
    validation_split=0.2,
    subset="training",
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="binary" 
)

#validation dataset
validation_data=tf.keras.utils.image_dataset_from_directory(
    EXTRACT_PATH,
    class_names=selected_class,
    validation_split=0.2,
    subset="validation",
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="binary"
)

Found 8016 files belonging to 2 classes.
Using 6413 files for training.


I0000 00:00:1767457556.022727      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1767457556.026719      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Found 8016 files belonging to 2 classes.
Using 1603 files for validation.


In [20]:
#now dividing the validation data set into 
total_number_of_batches_in_validation_data=validation_data.cardinality().numpy() #converts total number of batches in the set and converts the tensor into python integer with .numpy function here
no_of_batches_in_validation_data=total_number_of_batches_in_validation_data//2 #this is a floor division operator


#now from the set of batches of the image creating subset into the validation and the test subset
validation=validation_data.take(no_of_batches_in_validation_data)
test=validation_data.skip(no_of_batches_in_validation_data)

In [21]:
print(f"Class names: {train_data.class_names}")
print("For values:\n")
for i, class_name in enumerate(train_data.class_names):
    print(f"{class_name}:{i}\n")

Class names: ['natural sound', 'unnatural']
For values:

natural sound:0

unnatural:1



In [22]:
#now we take pretrained model and train them for our data

data_augmentation=tf.keras.Sequential([
     tf.keras.layers.RandomTranslation(
        height_factor=0.0,  # no vertical shift by default
        width_factor=0.1,   # shift up to 10% horizontally
        fill_mode='constant'
    ),

    # Random vertical shift (frequency axis)
    tf.keras.layers.RandomTranslation(
        height_factor=0.1,  # shift up to 10% vertically
        width_factor=0.0,
        fill_mode='constant'
    ),

    # Random brightness adjustment (small)
    tf.keras.layers.RandomBrightness(factor=0.1),

    # Random contrast adjustment
    tf.keras.layers.RandomContrast(factor=0.1),

])



In [28]:
mobile=tf.keras.applications.EfficientNetB0(
    input_shape=(224,224,3),
    include_top=False,
    weights="imagenet"
)
mobile.trainable=False

model=models.Sequential([
    tf.keras.layers.Input(shape=(128,1000,3)),
    data_augmentation,
    tf.keras.layers.Resizing(224,224),

    #mobile net takes values from -1 to 1 so
    tf.keras.layers.Lambda(tf.keras.applications.efficientnet.preprocess_input),
    mobile,

    tf.keras.layers.GlobalAveragePooling2D(),
  
    tf.keras.layers.Dense(256, activation="relu"),

    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(1,activation="sigmoid")
    
    
    
    
])


model.compile(optimizer="adam",
             loss="binary_crossentropy",
             metrics=["accuracy"])

history=model.fit(
    train_data,
    validation_data=validation,
    epochs= 20,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy", #checks for val_accuracy
            patience=5,#wait tills 5 epochs
            restore_best_weights=True,#uses best weight
        ),
        ModelCheckpoint(
            "best_model.keras",#givesbest model according to val_accuracy
            monitor="val_accuracy",
            save_best_only=True,
            verbose=1 #prints only certain line of epoch for 1 and for 0 is silence and for 2 is every line
        )
    ]
)

test_loss,test_acc=model.evaluate(validation)
print(f"Accuracy: {test_acc:4f}")

Epoch 1/20


E0000 00:00:1767458956.925758      55 meta_optimizer.cc:967] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/sequential_4_1/efficientnetb0_1/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


[1m200/201[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 76ms/step - accuracy: 0.7492 - loss: 0.5007
Epoch 1: val_accuracy improved from -inf to 0.80375, saving model to best_model.keras
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 100ms/step - accuracy: 0.7495 - loss: 0.5002 - val_accuracy: 0.8037 - val_loss: 0.4074
Epoch 2/20
[1m200/201[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 78ms/step - accuracy: 0.7965 - loss: 0.4238
Epoch 2: val_accuracy improved from 0.80375 to 0.81375, saving model to best_model.keras
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 90ms/step - accuracy: 0.7966 - loss: 0.4236 - val_accuracy: 0.8138 - val_loss: 0.3970
Epoch 3/20
[1m200/201[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 76ms/step - accuracy: 0.8058 - loss: 0.4092
Epoch 3: val_accuracy improved from 0.81375 to 0.81625, saving model to best_model.keras
[1m201/201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 88ms/step 

In [29]:
result=model.evaluate(test)
print(f"For the unseen test data of the entire training this model has accuracy of {result[1]:.4f}\\")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.8909 - loss: 0.2703
For the unseen test data of the entire training this model has accuracy of 0.8842\


natural 4252
unnatural 3764

model accuracy = 88.48

In [30]:
os.makedirs("/kaggle/working/models",exist_ok=True)
model.save("/kaggle/working/models/audio_forest.keras")
import shutil

# Zip the folder for easy download
shutil.make_archive("/kaggle/working/models", 'zip', "/kaggle/working/models")


'/kaggle/working/models.zip'

**No of files:**
<br>
1) Fire=336<br>
2) Logging=455<br>
3) Natural=1063<br>
4) Poaching=531<br>
5) Total= 2385<br>
So, threshold accuracy=(531/2385)*100%=22.26%

In [44]:
selected_class=["fire","logging", "poaching"]

In [45]:
#loading up images and dividing them for the neural net
train_data=tf.keras.utils.image_dataset_from_directory(
    EXTRACT_PATH,
    class_names=selected_class,
    validation_split=0.2,
    subset="training",
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int" 
)

#validation dataset
validation_data=tf.keras.utils.image_dataset_from_directory(
    EXTRACT_PATH,
    class_names=selected_class,
    validation_split=0.2,
    subset="validation",
    seed=SEED,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int"
)

Found 3764 files belonging to 3 classes.
Using 3012 files for training.
Found 3764 files belonging to 3 classes.
Using 752 files for validation.


In [46]:
#now dividing the validation data set into 
total_number_of_batches_in_validation_data=validation_data.cardinality().numpy() #converts total number of batches in the set and converts the tensor into python integer with .numpy function here
no_of_batches_in_validation_data=total_number_of_batches_in_validation_data//2 #this is a floor division operator


#now from the set of batches of the image creating subset into the validation and the test subset
validation=validation_data.take(no_of_batches_in_validation_data)
test=validation_data.skip(no_of_batches_in_validation_data)

In [47]:
print(f"Class names: {train_data.class_names}")
print("For values:\n")
for i, class_name in enumerate(train_data.class_names):
    print(f"{class_name}:{i}\n")

Class names: ['fire', 'logging', 'poaching']
For values:

fire:0

logging:1

poaching:2



In [48]:
mobile2=tf.keras.applications.EfficientNetB0(
    input_shape=(224,224,3),
    include_top=False,
    weights="imagenet"
)
mobile2.trainable=False

model=models.Sequential([
    tf.keras.layers.Input(shape=(128,1000,3)),
    data_augmentation,
    tf.keras.layers.Resizing(224,224),

    #mobile net takes values from -1 to 1 so
    tf.keras.layers.Lambda(tf.keras.applications.efficientnet.preprocess_input),
    mobile2,

    tf.keras.layers.GlobalAveragePooling2D(),
  
    tf.keras.layers.Dense(256, activation="relu"),

    tf.keras.layers.Dropout(0.5),
    
    tf.keras.layers.Dense(3,activation="softmax")
    
    
    
    
])


model.compile(optimizer="adam",
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

history=model.fit(
    train_data,
    validation_data=validation,
    epochs= 20,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy", #checks for val_accuracy
            patience=5,#wait tills 5 epochs
            restore_best_weights=True,#uses best weight
        ),
        ModelCheckpoint(
            "best_model.keras",#givesbest model according to val_accuracy
            monitor="val_accuracy",
            save_best_only=True,
            verbose=1 #prints only certain line of epoch for 1 and for 0 is silence and for 2 is every line
        )
    ]
)

test_loss,test_acc=model.evaluate(validation)
print(f"Accuracy: {test_acc:4f}")

Epoch 1/20


E0000 00:00:1767461190.548493      55 meta_optimizer.cc:967] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inStatefulPartitionedCall/sequential_9_1/efficientnetb0_1/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


[1m94/95[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 79ms/step - accuracy: 0.6993 - loss: 0.6823
Epoch 1: val_accuracy improved from -inf to 0.81510, saving model to best_model.keras
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 121ms/step - accuracy: 0.7003 - loss: 0.6802 - val_accuracy: 0.8151 - val_loss: 0.5361
Epoch 2/20
[1m94/95[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 80ms/step - accuracy: 0.8098 - loss: 0.4634
Epoch 2: val_accuracy improved from 0.81510 to 0.86198, saving model to best_model.keras
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 96ms/step - accuracy: 0.8098 - loss: 0.4632 - val_accuracy: 0.8620 - val_loss: 0.3935
Epoch 3/20
[1m94/95[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 78ms/step - accuracy: 0.8277 - loss: 0.4067
Epoch 3: val_accuracy did not improve from 0.86198
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 87ms/step - accuracy: 0.8277 - loss: 0.4065 - val_accuracy: 0.

In [49]:
result=model.evaluate(test)
print(f"For the unseen test data of the entire training this model has accuracy of {result[1]:.4f}\\")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 110ms/step - accuracy: 0.8803 - loss: 0.3052
For the unseen test data of the entire training this model has accuracy of 0.8750\


fire - 1344
poaching - 600
logging - 1820

total = 3764

threshold accuracy = 48.35

model 2 accuracy = 87.50

In [51]:
os.makedirs("/kaggle/working/models",exist_ok=True)
model.save("/kaggle/working/models/audio_multi_classification.keras")
import shutil

# Zip the folder for easy download
shutil.make_archive("/kaggle/working/maheshdalle", 'zip', "/kaggle/working/models")


'/kaggle/working/maheshdalle.zip'