In [2]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import torch 
from torch.utils.data import Dataset, DataLoader, random_split, Subset

In [3]:
audio_dir = "/Users/manidatta/Documents/Neural Nets and Deep Learning/Project/fsd50k/FSD50K.dev_audio_16k/"
output_path = "processed_data.npz"


SAMPLE_RATE = 22050
N_MELS = 128
SPEC_SHAPE = (128, 128)
duration = 5

df = pd.read_csv("FSD50K_Data.csv")

risk_encoder = LabelEncoder()
df['risklevel_encoded'] = risk_encoder.fit_transform(df['risk_level'])

location_encoder = LabelEncoder()
df['location_encoded'] = location_encoder.fit_transform(df['location'])


X = []
locations = []
y = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    file_path = os.path.join(audio_dir, f"{row['fname']}.wav")
    if not os.path.isfile(file_path):
        continue

    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE,duration=duration)

    if len(audio) < sr * duration:
        audio = np.pad(audio, (0,sr*duration - len(audio)))
        
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=N_MELS)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    mel_db = librosa.util.fix_length(mel_db, size=216, axis=1)

    mel_db = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-9) # Normalize

    X.append(mel_db)
    y.append(row['risklevel_encoded'])
    locations.append(row["location_encoded"])


X = np.array(X).astype(np.float32)
y = np.array(y)
locations = np.array(locations)

# Save
np.savez(output_path, X=X, y=y, locations=locations, risklevel_classes = risk_encoder.classes_, location_classes = location_encoder.classes_)

print(f"Saved preprocessed data to {output_path}")

100%|██████████| 10800/10800 [00:39<00:00, 273.42it/s]


Saved preprocessed data to processed_data.npz


### Loading the data

In [4]:
class ESC50Data(Dataset):
    def __init__(self,data_path):
        data = np.load(data_path,allow_pickle=True)
        self.X_audio = data['X']
        self.location = data['locations']
        self.y = data['y']
        self.location_classes = data['location_classes']
        self.risklevel_classes = data['risklevel_classes']

    def __len__(self):
        return len(self.X_audio)
    
    def __getitem__(self, index):
        x = self.X_audio[index]
        y = self.y[index]
        location = self.location[index]
        # converting the spectogram into float tensor with shape(channel dimension,128,128)
        # usually spectograms has 1 channel like gray scale images
        x = torch.tensor(x).unsqueeze(0).float()
        # converting the encoded locations into tensor
        locations = torch.tensor(location).long()
        # converting the encoded labels into long tensor
        y = torch.tensor(y).long()

        return x,y, locations
    
# Loading the dataset
data = ESC50Data("/Users/manidatta/Documents/Neural Nets and Deep Learning/Project/processed_data.npz")

np.random.seed(42)

# Generating the suffled indices
indices = np.random.permutation(len(data))
# splitting the data into train,val and test 
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_idx = indices[:train_size]
val_idx = indices[train_size:train_size + val_size]
test_idx = indices[train_size + val_size:]

train_set = Subset(data, train_idx)
val_set = Subset(data, val_idx)
test_set = Subset(data, test_idx)

## Model

In [5]:
import torch
import torch.nn as nn

class RiskLevelClassifier(nn.Module):
    def __init__(self, n_locations,n_mels =128, spec_len=216,n_classes=3,hidden_dim=128, n_heads = 4, n_layers=2):
        super().__init__()

        # Each incoming spectogram has the dimension spec_len x n_mels
        # diving the spectogram into 16 x 16 batches and passing the resulted tokens as the sequence to the encoder

        self.patch_embedded = nn.Conv2d(1,hidden_dim, kernel_size=(16,16),stride=(16,16))
        num_patches = (n_mels // 16) * (spec_len // 16)

        self.postional_embedding = nn.Parameter(torch.randn(1,num_patches,hidden_dim))

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim,nhead=n_heads,dim_feedforward=hidden_dim*2, batch_first=True),
            num_layers=n_layers
        )

        self.location_embeddding = nn.Embedding(n_locations,hidden_dim)


        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim,n_classes)
        )

    # feed forward network
    def forward(self, x, loc):
        batch_size = x.size(0)

        patches = self.patch_embedded(x) # [batch_size, hidden_size,H,W] H and W are the number of patches along x and y 

        patches = patches.flatten(2).transpose(1,2) # flattening the last two dimensions gives the number of total patches

        # adding the positional encoding in order for the encoder to identify the patch
        patches = patches + self.postional_embedding

        # passing the patches to the encoder
        audio = self.transformer_encoder(patches)

        # taking average across the patches 
        audio = audio.mean(dim=1)

        # adding the embedding to the locations
        location = self.location_embeddding(loc)

        # fusioning of both audio features and locations
        fusion = torch.cat([audio,location],dim=1)

        output = self.classifier(fusion)

        return output

In [7]:
# Trained the model and performing the hyperparameter tuning using validation data
import optuna
import torch.optim as optim
# Initializing the  Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

global_best_model_state = None
global_best_accuracy = 0  

def optimization(trial):
    global global_best_model_state, global_best_accuracy 
    print(f"Running Trail {trial.number} \n")
    # defining different hyperparameters
    learning_rate = trial.suggest_loguniform("learning_rate",1e-3, 5e-3)
    hidden_dimension= trial.suggest_categorical("hidden_dimension",[128,256,512])
    n_heads = trial.suggest_categorical("n_heads",[4,8])
    batch_size = trial.suggest_categorical("batch_size",[32,64])
    optimizers = trial.suggest_categorical("optimizer",["Adam","SGD","AdamW"])
    
    print(f"Selected Hyperparameters: learning rate={learning_rate},hidden dimension={hidden_dimension}, optimizer={optimizers}, heads={n_heads},batch size={batch_size}")

    # Device config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    n_locations = len(location_encoder.classes_)
    
    # defining the model
    model = RiskLevelClassifier(hidden_dim=hidden_dimension,n_locations=n_locations).to(device)

    # Loss 
    criterion = nn.CrossEntropyLoss()

    # initializing the selected optimizer
    if optimizers =="Adam":
        optimizer = optim.Adam(model.parameters(),lr=learning_rate,weight_decay=1e-4)
    elif optimizers =="SGD":
        optimizer = optim.SGD(model.parameters(),lr=learning_rate,momentum=0.9)
    elif optimizers =="AdamW":
        optimizer = optim.AdamW(model.parameters(),lr=learning_rate,weight_decay=1e-4)
    
    # loading data for different batch sizes
    train_data = DataLoader(train_set,batch_size=batch_size, shuffle=True)
    val_data = DataLoader(val_set,batch_size=batch_size,shuffle=False)

    best_validation_accuracy = 0
    best_model_state = None
    # Training for five epochs
    for epoch in range(5):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
    
        for spectrograms, labels, locations in train_data:
            spectrograms = spectrograms.to(device)
            locations = locations.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(spectrograms, locations)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * spectrograms.size(0)
            _, predicted = outputs.max(1)
            train_total += labels.size(0)
            train_correct += predicted.eq(labels).sum().item()
    

        # Validation
        model.eval()
        val_loss =0
        val_correct, val_total = 0,0

        with torch.no_grad():
            for spectrograms, labels, locations in val_data:
                spectrograms = spectrograms.to(device)
                locations = locations.to(device)
                labels = labels.to(device)

                outputs = model(spectrograms, locations)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * spectrograms.size(0)
                _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()

        # calculating the training and validation loss per each batch 
        average_train_loss = train_loss / train_total
        average_val_loss = val_loss / val_total

        # calculating the validation accuracy and train accuracy
        validation_accuracy = val_correct/val_total
        train_accuracy = train_correct/ train_total
        print(f"Epoch [{epoch+1}/{5}] - Train Loss: {average_train_loss:.4f},train Acc: {train_accuracy:.4f}, Val Loss: {average_val_loss:.4f}, Val Acc: {validation_accuracy:.4f}")

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            best_model_state = model.state_dict()

        # Early Stopping if Validation accuracy is Not Improving
        # reporting the loss for pruning
        trial.report(validation_accuracy,epoch)

        # Stopping the epoch loop early if the validation is not improving much
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    print(f"Trial {trial.number} Completed!  Val Acc: {best_validation_accuracy:.4f}\n")
    if best_validation_accuracy > global_best_accuracy:
        global_best_accuracy = best_validation_accuracy
        global_best_model_state = best_model_state
    # returning the validation loss
    return best_validation_accuracy

# Running the hyperparameter tuning with optuna
# aim is to minimize the validation loss
study = optuna.create_study(direction="maximize")
# Running 10 hyperparameter trials
study.optimize(optimization,n_trials=10)

print("Best Hyperparameters:",study.best_params)


# Saving the Best Model
if global_best_model_state is not None:
    torch.save(global_best_model_state, "best_multimodal_classifier.pth")
    print("Best model saved successfully!")
else:
    print("No best model found to save.")


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-04-24 21:23:16,931] A new study created in memory with name: no-name-3a6ba43c-ce56-47ff-a322-cc67aab15a72
  learning_rate = trial.suggest_loguniform("learning_rate",1e-3, 5e-3)


Running Trail 0 

Selected Hyperparameters: learning rate=0.0023796694633943785,hidden dimension=256, optimizer=Adam, heads=4,batch size=32
Epoch [1/5] - Train Loss: 0.3599,train Acc: 0.8414, Val Loss: 0.2884, Val Acc: 0.8630
Epoch [2/5] - Train Loss: 0.3295,train Acc: 0.8481, Val Loss: 0.3161, Val Acc: 0.8657
Epoch [3/5] - Train Loss: 0.3207,train Acc: 0.8512, Val Loss: 0.2824, Val Acc: 0.8722
Epoch [4/5] - Train Loss: 0.3119,train Acc: 0.8501, Val Loss: 0.2830, Val Acc: 0.8657


[I 2025-04-24 21:25:26,919] Trial 0 finished with value: 0.8722222222222222 and parameters: {'learning_rate': 0.0023796694633943785, 'hidden_dimension': 256, 'n_heads': 4, 'batch_size': 32, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.8722222222222222.


Epoch [5/5] - Train Loss: 0.3101,train Acc: 0.8554, Val Loss: 0.2830, Val Acc: 0.8602
Trial 0 Completed!  Val Acc: 0.8722

Running Trail 1 

Selected Hyperparameters: learning rate=0.004142081295742737,hidden dimension=128, optimizer=Adam, heads=8,batch size=32


  learning_rate = trial.suggest_loguniform("learning_rate",1e-3, 5e-3)


Epoch [1/5] - Train Loss: 0.3634,train Acc: 0.8407, Val Loss: 0.2972, Val Acc: 0.8657
Epoch [2/5] - Train Loss: 0.3165,train Acc: 0.8537, Val Loss: 0.3007, Val Acc: 0.8454
Epoch [3/5] - Train Loss: 0.3212,train Acc: 0.8501, Val Loss: 0.2859, Val Acc: 0.8676
Epoch [4/5] - Train Loss: 0.3116,train Acc: 0.8500, Val Loss: 0.2807, Val Acc: 0.8639


[I 2025-04-24 21:26:42,490] Trial 1 finished with value: 0.8675925925925926 and parameters: {'learning_rate': 0.004142081295742737, 'hidden_dimension': 128, 'n_heads': 8, 'batch_size': 32, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.8722222222222222.


Epoch [5/5] - Train Loss: 0.3084,train Acc: 0.8543, Val Loss: 0.2728, Val Acc: 0.8676
Trial 1 Completed!  Val Acc: 0.8676

Running Trail 2 

Selected Hyperparameters: learning rate=0.0015836849234370496,hidden dimension=512, optimizer=SGD, heads=4,batch size=64
Epoch [1/5] - Train Loss: 0.5445,train Acc: 0.8049, Val Loss: 0.3473, Val Acc: 0.8620
Epoch [2/5] - Train Loss: 0.3431,train Acc: 0.8507, Val Loss: 0.3065, Val Acc: 0.8426
Epoch [3/5] - Train Loss: 0.3220,train Acc: 0.8512, Val Loss: 0.2882, Val Acc: 0.8731
Epoch [4/5] - Train Loss: 0.3171,train Acc: 0.8542, Val Loss: 0.2910, Val Acc: 0.8602


[I 2025-04-24 21:31:27,204] Trial 2 finished with value: 0.8731481481481481 and parameters: {'learning_rate': 0.0015836849234370496, 'hidden_dimension': 512, 'n_heads': 4, 'batch_size': 64, 'optimizer': 'SGD'}. Best is trial 2 with value: 0.8731481481481481.


Epoch [5/5] - Train Loss: 0.3121,train Acc: 0.8519, Val Loss: 0.2816, Val Acc: 0.8620
Trial 2 Completed!  Val Acc: 0.8731

Running Trail 3 

Selected Hyperparameters: learning rate=0.0032717950490971375,hidden dimension=128, optimizer=Adam, heads=4,batch size=32
Epoch [1/5] - Train Loss: 0.3719,train Acc: 0.8394, Val Loss: 0.2903, Val Acc: 0.8676
Epoch [2/5] - Train Loss: 0.3171,train Acc: 0.8538, Val Loss: 0.2963, Val Acc: 0.8667
Epoch [3/5] - Train Loss: 0.3080,train Acc: 0.8562, Val Loss: 0.2753, Val Acc: 0.8648
Epoch [4/5] - Train Loss: 0.3063,train Acc: 0.8556, Val Loss: 0.2760, Val Acc: 0.8741


[I 2025-04-24 21:32:33,831] Trial 3 finished with value: 0.8740740740740741 and parameters: {'learning_rate': 0.0032717950490971375, 'hidden_dimension': 128, 'n_heads': 4, 'batch_size': 32, 'optimizer': 'Adam'}. Best is trial 3 with value: 0.8740740740740741.


Epoch [5/5] - Train Loss: 0.3048,train Acc: 0.8575, Val Loss: 0.2814, Val Acc: 0.8722
Trial 3 Completed!  Val Acc: 0.8741

Running Trail 4 

Selected Hyperparameters: learning rate=0.0021149315292142185,hidden dimension=512, optimizer=SGD, heads=8,batch size=32
Epoch [1/5] - Train Loss: 0.4193,train Acc: 0.8331, Val Loss: 0.3077, Val Acc: 0.8676
Epoch [2/5] - Train Loss: 0.3196,train Acc: 0.8493, Val Loss: 0.2841, Val Acc: 0.8630
Epoch [3/5] - Train Loss: 0.3137,train Acc: 0.8528, Val Loss: 0.2771, Val Acc: 0.8657
Epoch [4/5] - Train Loss: 0.3093,train Acc: 0.8532, Val Loss: 0.2786, Val Acc: 0.8602


[I 2025-04-24 21:37:08,959] Trial 4 finished with value: 0.8703703703703703 and parameters: {'learning_rate': 0.0021149315292142185, 'hidden_dimension': 512, 'n_heads': 8, 'batch_size': 32, 'optimizer': 'SGD'}. Best is trial 3 with value: 0.8740740740740741.


Epoch [5/5] - Train Loss: 0.3025,train Acc: 0.8545, Val Loss: 0.2738, Val Acc: 0.8704
Trial 4 Completed!  Val Acc: 0.8704

Running Trail 5 

Selected Hyperparameters: learning rate=0.0021822709586366627,hidden dimension=256, optimizer=SGD, heads=4,batch size=32


[I 2025-04-24 21:37:35,023] Trial 5 pruned. 


Epoch [1/5] - Train Loss: 0.4733,train Acc: 0.8208, Val Loss: 0.3338, Val Acc: 0.8583
Running Trail 6 

Selected Hyperparameters: learning rate=0.0015889094334850258,hidden dimension=256, optimizer=AdamW, heads=8,batch size=32


[I 2025-04-24 21:38:01,147] Trial 6 pruned. 


Epoch [1/5] - Train Loss: 0.3605,train Acc: 0.8449, Val Loss: 0.2941, Val Acc: 0.8611
Running Trail 7 

Selected Hyperparameters: learning rate=0.0018783542446182497,hidden dimension=128, optimizer=AdamW, heads=8,batch size=64


[I 2025-04-24 21:38:16,001] Trial 7 pruned. 


Epoch [1/5] - Train Loss: 0.3913,train Acc: 0.8341, Val Loss: 0.2893, Val Acc: 0.8574
Running Trail 8 

Selected Hyperparameters: learning rate=0.004805201212798715,hidden dimension=256, optimizer=Adam, heads=4,batch size=64
Epoch [1/5] - Train Loss: 0.3867,train Acc: 0.8309, Val Loss: 0.2869, Val Acc: 0.8722
Epoch [2/5] - Train Loss: 0.3263,train Acc: 0.8507, Val Loss: 0.2805, Val Acc: 0.8704
Epoch [3/5] - Train Loss: 0.3174,train Acc: 0.8495, Val Loss: 0.2871, Val Acc: 0.8519
Epoch [4/5] - Train Loss: 0.3120,train Acc: 0.8532, Val Loss: 0.2746, Val Acc: 0.8694


[I 2025-04-24 21:40:24,906] Trial 8 finished with value: 0.8722222222222222 and parameters: {'learning_rate': 0.004805201212798715, 'hidden_dimension': 256, 'n_heads': 4, 'batch_size': 64, 'optimizer': 'Adam'}. Best is trial 3 with value: 0.8740740740740741.


Epoch [5/5] - Train Loss: 0.3125,train Acc: 0.8558, Val Loss: 0.2806, Val Acc: 0.8574
Trial 8 Completed!  Val Acc: 0.8722

Running Trail 9 

Selected Hyperparameters: learning rate=0.0020023276319776724,hidden dimension=512, optimizer=AdamW, heads=4,batch size=32


[I 2025-04-24 21:41:19,946] Trial 9 pruned. 


Epoch [1/5] - Train Loss: 0.3734,train Acc: 0.8354, Val Loss: 0.3124, Val Acc: 0.8648
Best Hyperparameters: {'learning_rate': 0.0032717950490971375, 'hidden_dimension': 128, 'n_heads': 4, 'batch_size': 32, 'optimizer': 'Adam'}
Best model saved successfully!


In [9]:
n_locations = len(location_encoder.classes_)

best_model = RiskLevelClassifier(hidden_dim=study.best_params["hidden_dimension"],n_locations=n_locations).to(device)

best_model.load_state_dict(torch.load("best_multimodal_classifier.pth"))
best_model.eval()

test_data = DataLoader(val_set,batch_size=study.best_params["batch_size"],shuffle=False)

predictions = []

optimizers = study.best_params["optimizer"]


if optimizers =="Adam":
    optimizer = optim.Adam(best_model.parameters(),lr=study.best_params["learning_rate"],weight_decay=1e-4)
elif optimizers =="SGD":
    optimizer = optim.SGD(best_model.parameters(),lr=study.best_params["learning_rate"],momentum=0.9)
elif optimizers =="AdamW":
    optimizer = optim.AdamW(best_model.parameters(),lr=study.best_params["learning_rate"],weight_decay=1e-4)

criterion = nn.CrossEntropyLoss()

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for spectrograms, labels, locations in test_data:
        spectrograms = spectrograms.to(device)
        locations = locations.to(device)
        labels = labels.to(device)

        outputs = best_model(spectrograms, locations)
        loss = criterion(outputs, labels)

        test_loss += loss.item() * spectrograms.size(0)
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()
    
    test_accuracy = test_correct/ test_total
    print("Test Accuracy:",test_accuracy)


predicted

Test Accuracy: 0.8722222222222222


tensor([2, 1, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2])

## Explanation Generator 

In [72]:
data = pd.read_csv("FSD50K_Data.csv")

In [73]:
data.head()

Unnamed: 0,fname,label,location,risk_level
0,128802,Acoustic_guitar,Home,Normal
1,8462,Acoustic_guitar,Home,Normal
2,41750,Acoustic_guitar,Home,Normal
3,252380,Acoustic_guitar,Home,Normal
4,18483,Acoustic_guitar,Home,Normal


In [93]:
import random

def generate_explanation(label, location, risk):
    danger_templates = [
        f"A {label} sound in the {location} signals a critical situation. Take action immediately.",
        f"Warning: {label} noise detected at the {location}. It's classified as dangerous.",
        f"Emergency alert! A {label} was heard in the {location}. Respond now.",
        f"A potentially life-threatening {label} sound occurred in the {location}. Urgent attention required.",
        f"The {location} is experiencing a dangerous {label} event. Please evacuate or seek shelter.",
        f"Detected a hazardous {label} at the {location}. Stay away from the area.",
        f"Critical: {label} identified at the {location}. Engage emergency protocols.",
        f"Authorities have flagged the {label} sound at the {location} as highly dangerous.",
        f"A high-risk {label} sound was reported in the {location}. Monitor the situation closely.",
        f"The presence of {label} in the {location} may indicate a serious threat.",
        f"A {label} was recorded in the {location}, requiring urgent investigation.",
        f"Serious warning: A {label} sound was picked up in the {location}. Act fast.",
        f"Immediate action advised: {label} sound from the {location} poses severe risk.",
        f"A {label} sound was confirmed at the {location}, triggering an emergency response.",
        f"Risk alert: Dangerous {label} sounds at the {location} demand immediate attention.",
    ]

    threat_templates = [
        f"A {label} was heard at the {location}, indicating a potential threat.",
        f"Sound monitoring flagged a {label} in the {location}. Stay alert.",
        f"A suspicious {label} sound emerged from the {location}. Further review needed.",
        f"{label} noise in the {location} may indicate unusual activity. Exercise caution.",
        f"A potentially concerning {label} was picked up in the {location}.",
        f"Monitor the {location} closely after detecting a {label} sound.",
        f"Attention: {label} activity in the {location} may suggest a developing situation.",
        f"A {label} was detected in the {location}, possibly signaling a minor threat.",
        f"Advisory: {label} sounds reported from the {location}. Take precautions.",
        f"The {location} registered a {label} sound that may need follow-up.",
        f"A {label} sound could indicate early signs of a threat at the {location}.",
        f"{label} occurrence at the {location} warrants increased observation.",
        f"Initial detection of {label} in the {location} could precede an escalation.",
        f"The presence of a {label} at the {location} should not be ignored.",
        f"Unusual {label} activity reported in the {location}. Monitor for updates.",
    ]

    safe_templates = [
        f"A {label} sound was recorded at the {location}, but it poses no threat.",
        f"Routine {label} noise was detected in the {location}. No action needed.",
        f"Normal acoustic activity: {label} sound observed in the {location}.",
        f"The sound of {label} in the {location} is consistent with safe conditions.",
        f"{label} detected in the {location} is categorized as safe.",
        f"The environment at the {location} is secure despite the {label} sound.",
        f"A {label} was picked up in the {location}, classified as non-threatening.",
        f"Low-risk {label} activity in the {location} detected. Situation is stable.",
        f"The {location} experienced a typical {label} sound event.",
        f"A {label} sound is heard in the {location}, but it's nothing to worry about.",
        f"Acoustic scan shows normal {label} presence in the {location}.",
        f"Sound levels at the {location}, including {label}, are within safe bounds.",
        f"The {label} sound in the {location} is part of usual background activity.",
        f"{label} was identified in the {location}, with no associated risk.",
        f"Everything is calm in the {location} despite the detection of {label}.",
    ]

    if risk == "Danger":
        return random.choice(danger_templates)
    elif risk == "Potential Threat":
        return random.choice(threat_templates)
    else:
        return random.choice(safe_templates)

def generate_input_text(label, location, risk):
    return f"Audio: {label} | Location: {location} | Risk: {risk}"

In [94]:
data['input_text'] = data.apply(lambda row: generate_input_text(row['label'], row['location'], row['risk_level']), axis=1)
data['target_text'] = data.apply(lambda row: generate_explanation(row['label'], row['location'], row['risk_level']), axis=1)

# Saving the  the prepared data
data[['input_text', 'target_text']].to_csv("explanation_generation_data.csv", index=False)

In [95]:
from datasets import Dataset

# reading the data
data_text = pd.read_csv("explanation_generation_data.csv")


In [96]:
data_text.head()

Unnamed: 0,input_text,target_text
0,Audio: Acoustic_guitar | Location: Home | Risk...,Acoustic scan shows normal Acoustic_guitar pre...
1,Audio: Acoustic_guitar | Location: Home | Risk...,Routine Acoustic_guitar noise was detected in ...
2,Audio: Acoustic_guitar | Location: Home | Risk...,A Acoustic_guitar sound was recorded at the Ho...
3,Audio: Acoustic_guitar | Location: Home | Risk...,"Sound levels at the Home, including Acoustic_g..."
4,Audio: Acoustic_guitar | Location: Home | Risk...,Acoustic_guitar detected in the Home is catego...


In [97]:
# Applying the tokenization for the data
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import evaluate 
from sklearn.model_selection import train_test_split
import torch
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


def preprocess(text):
    inputs= tokenizer(text["input_text"],max_length =128, truncation=True,padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(text["target_text"],max_length =64, truncation=True,padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs


# Converting the current pandas dataframe into hugging face dataset Object, the format used by the hugging face trainer and transfomer models
dataset = Dataset.from_pandas(data_text[['input_text', 'target_text']])

# splitting the data into train, validation and test sets

train_data_text = dataset.select(train_idx.tolist())
val_data_text = dataset.select(val_idx.tolist())
test_data_text = dataset.select(test_idx.tolist())

# applying the preprocess function to all the data
preprocessed_data_train = train_data_text.map(preprocess,batched=True)
preprocessed_data_val = val_data_text.map(preprocess,batched=True)
preprocessed_data_test = test_data_text.map(preprocess,batched=True)

# loading the BERTScore
bertscore = evaluate.load("bertscore")

# function to compute the metrics 
def eval_metrics(predictions):
    pred, labels = predictions
    # pred is the token ID's predicted by the model
    # If predictions are logits, use argmax (not needed if you're using generate)
    if isinstance(pred, tuple):
        pred = pred[0]
        pred = torch.argmax(torch.tensor(pred), dim=-1)

    # Flatten if the elements inside are still lists (e.g. shape: [batch_size, sequence_len])
    pred = [p.tolist() if isinstance(p, torch.Tensor) else p for p in pred]
    labels = [l.tolist() if isinstance(l, torch.Tensor) else l for l in labels]

    # converting the token ID's into human readable textual format
    pred_decoded = tokenizer.batch_decode(pred,skip_special_tokens = True)
    labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens= True)


    results = bertscore.compute(predictions =pred_decoded, references= labels_decoded, lang="en")

    return {
        "precision": sum(results["precision"]) / len(results["precision"]),
        "recall": sum(results["recall"]) / len(results["recall"]),
        "f1 score" : sum(results["f1"]) / len(results["f1"])
    }

Map: 100%|██████████| 8640/8640 [00:00<00:00, 9523.93 examples/s]
Map: 100%|██████████| 1080/1080 [00:00<00:00, 10533.48 examples/s]
Map: 100%|██████████| 1080/1080 [00:00<00:00, 10508.70 examples/s]


In [98]:
from transformers import Seq2SeqTrainingArguments

# declaring the training arguments
train_arguments =Seq2SeqTrainingArguments(
    output_dir = "./explanation_model",
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 2,
    num_train_epochs = 5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    predict_with_generate = True,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10
)

In [99]:
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Ensures no CUDA fallback
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # optional, disable memory limit (but not enough alone)

In [100]:
device= torch.device("mps")

model = model.to(device)

In [101]:
# Data Loader which is used to load the data in the form of batches
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [102]:
from transformers import Trainer

class MyTrainer(Trainer):
    def _prepare_inputs(self, inputs):
        return {k: v.to("mps") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}


In [103]:
# declaring the trainer

small_eval_dataset = preprocessed_data_val.select(range(300))
#small_train_dataset = preprocessed_data_train.select(range(300))
trainer = MyTrainer(
    model=model,
    args = train_arguments,
    train_dataset = preprocessed_data_train,
    eval_dataset = small_eval_dataset,
    tokenizer=tokenizer,
    data_collator  = data_collator,
    compute_metrics = eval_metrics
)

  trainer = MyTrainer(


In [104]:
# Training the model
trainer.train()
trainer.save_model("./explanation_model_final")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1 score
1,0.3229,0.207532,0.94734,0.950382,0.948835
2,0.1907,0.107456,0.972006,0.973913,0.972946
3,0.1358,0.081559,0.975602,0.977326,0.976452
4,0.1205,0.071515,0.97906,0.980137,0.979589
5,0.1176,0.069098,0.979477,0.980568,0.980012


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [105]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("./explanation_model_final")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

input_text = "Audio: Wind_instrument_and_woodwind_instrument | Location: School | Risk: Danger"
inputs = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)


# Generate output
outputs = model.generate(
    inputs["input_ids"],
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    max_new_tokens=60
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


A Wind_instrument_and_woodwind_instrument sound was recorded at the School, triggering an emergency response.
