In [1]:
!pip install torch librosa numpy sounddevice


Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.1


In [2]:
import torch
import numpy as np
import librosa
from torch import nn
import torch
import numpy as np
import librosa
from torch import nn
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns

class MFCC_LSTM(nn.Module):
    def __init__(self, input_size=13, hidden_size=128, num_layers=2, num_classes=8, dropout=0.3):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,      # 13 MFCC features
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.attention = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1)
        )

        self.fc1 = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(hidden_size, num_classes)
        )

    def attention_net(self, lstm_output):
        attn_weights = self.attention(lstm_output)
        soft_attn_weights = torch.softmax(attn_weights, 1)
        context = torch.sum(lstm_output * soft_attn_weights, 1)
        return context

    def forward(self, x):
        # x shape: (batch_size, 13, 200)
        # Transpose to (batch_size, 200, 13) for LSTM
        x = x.transpose(1, 2)

        lstm_out, _ = self.lstm(x)

        # Apply attention
        attn_out = self.attention_net(lstm_out)

        # Fully connected layers
        out = self.fc1(attn_out)
        out = self.fc2(out)
        return out

In [3]:
# Unique classes and their corresponding class IDs:
#    classID             class
# 0        0   air_conditioner
# 1        1          car_horn
# 2        2  children_playing
# 3        3          dog_bark
# 4        4          drilling
# 5        5     engine_idling
# 6        6          gun_shot
# 7        7        jackhammer
# 8        8             siren
# 9        9      street_music

# Audio and model parameters
SAMPLE_RATE = 22050  # Sampling rate for MFCC extraction
MFCC_FEATURES = 13   # Number of MFCC features
TIME_STEPS = 200     # Number of time steps in each MFCC frame
CHUNK_DURATION = 2.0 # Duration (in seconds) of each audio chunk to classify

In [4]:
def load_model(model_path, device='cpu'):
    model = MFCC_LSTM(input_size=MFCC_FEATURES, hidden_size=128, num_layers=2, num_classes=7)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

# Extract MFCC features from a 4-second audio chunk
def extract_mfcc(audio_chunk):
    mfcc = librosa.feature.mfcc(y=audio_chunk, sr=SAMPLE_RATE, n_mfcc=MFCC_FEATURES)
    # Pad or truncate to TIME_STEPS
    if mfcc.shape[1] < TIME_STEPS:
        pad_width = ((0, 0), (0, TIME_STEPS - mfcc.shape[1]))
        mfcc = np.pad(mfcc, pad_width, mode='constant')
    else:
        mfcc = mfcc[:, :TIME_STEPS]
    return mfcc

# Perform prediction on a single 4-second audio segment
def detect_audio_segment(audio_segment, model, device='cpu'):
    mfcc_features = extract_mfcc(audio_segment)
    features_tensor = torch.FloatTensor(mfcc_features).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(features_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(outputs, dim=1).item()
        confidence = probabilities[0][predicted_class].item()

    label_map = {0: 0, 1: 2, 2: 3, 3: 4, 4: 6, 5: 8, 6: 9}
    label_map = {
        0: 'air_conditioner',
        1: 'children_playing',
        2: 'dog_bark',
        3: 'drilling',
        4: 'gun_shot',
        5: 'siren',
        6: 'street_music'
    }
    original_label = label_map[predicted_class]
    return original_label, confidence

# Process the audio file in 4-second chunks
def process_audio_file_in_chunks(audio_path, model, device='cpu'):
    chunk_samples = int(CHUNK_DURATION * SAMPLE_RATE)

    # Load only the needed part of the audio each time
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    num_chunks = len(y) // chunk_samples

    for i in range(num_chunks):
        start = i * chunk_samples
        end = start + chunk_samples
        audio_chunk = y[start:end]

        # Classify this 4-second chunk
        label, confidence = detect_audio_segment(audio_chunk, model, device)
        print(f"Chunk {i+1}: Detected Class = {label}, Confidence = {confidence:.2%}")


In [5]:

# Load model and start processing
model_path = '/content/best_model.pth'  # Replace with your actual model path
audio_path = '/content/gun_asmr.wav'  # Replace with your actual audio file path
device = 'cpu'  # Change to 'cuda' if using GPU

model = load_model(model_path, device)
process_audio_file_in_chunks(audio_path, model, device)


  model.load_state_dict(torch.load(model_path, map_location=device))


Chunk 1: Detected Class = gun_shot, Confidence = 99.83%
Chunk 2: Detected Class = dog_bark, Confidence = 98.84%
Chunk 3: Detected Class = gun_shot, Confidence = 49.77%
Chunk 4: Detected Class = gun_shot, Confidence = 99.51%
Chunk 5: Detected Class = drilling, Confidence = 68.18%
Chunk 6: Detected Class = drilling, Confidence = 41.38%
Chunk 7: Detected Class = gun_shot, Confidence = 83.25%
Chunk 8: Detected Class = gun_shot, Confidence = 94.98%
Chunk 9: Detected Class = gun_shot, Confidence = 98.29%
Chunk 10: Detected Class = dog_bark, Confidence = 69.88%
Chunk 11: Detected Class = gun_shot, Confidence = 61.14%
Chunk 12: Detected Class = dog_bark, Confidence = 70.94%
Chunk 13: Detected Class = street_music, Confidence = 92.30%
Chunk 14: Detected Class = gun_shot, Confidence = 100.00%
Chunk 15: Detected Class = gun_shot, Confidence = 100.00%
Chunk 16: Detected Class = gun_shot, Confidence = 100.00%
Chunk 17: Detected Class = gun_shot, Confidence = 100.00%
Chunk 18: Detected Class = gun_s

In [6]:
import torch
import numpy as np
import librosa
import smtplib
from torch import nn
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# Model parameters
# SAMPLE_RATE = 22050  # Sampling rate for MFCC extraction
# MFCC_FEATURES = 13   # Number of MFCC features
# TIME_STEPS = 200     # Number of time steps in each MFCC frame
# CHUNK_DURATION = 2.0 # Duration (in seconds) of each audio chunk to classify
GUNSHOT_THRESHOLD = 0.8  # Confidence threshold for detecting a gunshot sound
live_video_link = "http://127.0.0.1:5000/"  # Replace with your actual video link

def send_email(sender_email, receiver_email, email_password, video_link):
    """Send an email alert for gunshot detection with a link to CCTV footage."""
    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = receiver_email
    msg['Subject'] = "Gunshot Detection Alert"

    # HTML body with a clickable link
    body = f"""
    <html>
        <body>
            <p>Gunshot detected in the audio surveillance. Please check the CCTV footage.</p>
            <p><a href="{live_video_link}" target="_blank">View CCTV Footage</a></p>
        </body>
    </html>
    """
    msg.attach(MIMEText(body, 'html'))  # Set the MIME type to 'html' for rendering HTML content

    # Send the email
    try:
        with smtplib.SMTP('smtp.gmail.com', 587) as server:
            server.starttls()  # Secure the connection
            server.login(sender_email, email_password)
            server.send_message(msg)
        print("Email alert sent successfully!")
    except Exception as e:
        print(f"Failed to send email: {e}")

# Process audio file
def process_audio_file_in_chunks_for_email(audio_path, model, device='cpu', sender_email=None, receiver_email=None, email_password=None):
    chunk_samples = int(CHUNK_DURATION * SAMPLE_RATE)
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    num_chunks = len(y) // chunk_samples

    for i in range(num_chunks):
        start = i * chunk_samples
        end = start + chunk_samples
        audio_chunk = y[start:end]

        # Classify the audio chunk
        label, confidence = detect_audio_segment(audio_chunk, model, device)
        print(f"Chunk {i+1}: Detected Class = {label}, Confidence = {confidence:.2%}")

        # If gunshot is detected with high confidence, send an email
        if label == 'gun_shot' and confidence >= GUNSHOT_THRESHOLD:
            print("Gunshot detected with high confidence. Sending email alert...")
            send_email(sender_email, receiver_email, email_password, live_video_link)
            break  # Stop further processing once email is sent

# Usage example
model_path = '/content/best_model.pth'  # Replace with your model path
audio_path = '/content/gun_asmr.wav'  # Replace with your audio file path
sender_email = "manas.divekar76@gmail.com"  # Replace with your email
receiver_email = "me.atharvajadhav@gmail.com"  # Replace with receiver's email
email_password = "jemu imks maqm kaow"  # Replace with your email password

device = 'cpu'  # Change to 'cuda' if using a GPU

In [10]:

# Process audio file
def process_audio_file_in_chunks_for_email_lmt(audio_path, model, device='cpu', sender_email=None, receiver_email=None, email_password=None):
    chunk_samples = int(CHUNK_DURATION * SAMPLE_RATE)
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    num_chunks = len(y) // chunk_samples

    # Track gunshot detections within a rolling window of the last 6 chunks
    recent_gunshots = [0] * 6  # A list to track gunshot detections in last 6 chunks
    gunshot_count = 0  # Initialize gunshot counter

    for i in range(num_chunks):
        start = i * chunk_samples
        end = start + chunk_samples
        audio_chunk = y[start:end]

        # Classify the audio chunk
        label, confidence = detect_audio_segment(audio_chunk, model, device)
        print(f"Chunk {i+1}: Detected Class = {label}, Confidence = {confidence:.2%}")

        # Check if gunshot is detected with high confidence
        if label == 'gun_shot' and confidence >= GUNSHOT_THRESHOLD:
            recent_gunshots[i % 6] = 1  # Mark this chunk as having detected a gunshot
        else:
            recent_gunshots[i % 6] = 0  # Mark this chunk as not having a gunshot

        # Update gunshot count based on the rolling window
        gunshot_count = sum(recent_gunshots)

        # If there are 3 gunshots in the last 6 chunks, send an email
        if gunshot_count >= 3:
            print("3 gunshots detected within the last 6 chunks. Sending email alert...")
            send_email(sender_email, receiver_email, email_password, live_video_link)
            recent_gunshots = [0] * 6  # Reset recent detections after sending an email
            gunshot_count = 0  # Reset gunshot counter

In [11]:

model = load_model(model_path, device)
process_audio_file_in_chunks_for_email_lmt(audio_path, model, device, sender_email, receiver_email, email_password)


  model.load_state_dict(torch.load(model_path, map_location=device))


Chunk 1: Detected Class = gun_shot, Confidence = 99.83%
Chunk 2: Detected Class = dog_bark, Confidence = 98.84%
Chunk 3: Detected Class = gun_shot, Confidence = 49.77%
Chunk 4: Detected Class = gun_shot, Confidence = 99.51%
Chunk 5: Detected Class = drilling, Confidence = 68.18%
Chunk 6: Detected Class = drilling, Confidence = 41.38%
Chunk 7: Detected Class = gun_shot, Confidence = 83.25%
Chunk 8: Detected Class = gun_shot, Confidence = 94.98%
3 gunshots detected within the last 6 chunks. Sending email alert...
Email alert sent successfully!
Chunk 9: Detected Class = gun_shot, Confidence = 98.29%
Chunk 10: Detected Class = dog_bark, Confidence = 69.88%
Chunk 11: Detected Class = gun_shot, Confidence = 61.14%
Chunk 12: Detected Class = dog_bark, Confidence = 70.94%
Chunk 13: Detected Class = street_music, Confidence = 92.30%
Chunk 14: Detected Class = gun_shot, Confidence = 100.00%
Chunk 15: Detected Class = gun_shot, Confidence = 100.00%
Chunk 16: Detected Class = gun_shot, Confidence 