In [7]:
import cv2
import torch
from torchvision import transforms, models 
from PIL import Image
import time

import torch.nn as nn
import os
import numpy as np
import librosa
import pandas as pd
import wave

# Preprocessing pipeline applied to each frame before it is given to the model:
# resize to 224x224, convert to a tensor, then normalise every channel.
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_preprocessing_steps)

class CustomVGG16(nn.Module):
    """VGG16-based classifier with an extra BatchNorm/ReLU pair and a new head.

    The convolutional layers are taken from torchvision's ``vgg16``. A
    ``BatchNorm2d(512)`` followed by a ``ReLU`` is inserted in front of the
    layer at index 24 of that stack, and the classifier is rebuilt so that
    its final layer produces ``num_classes`` outputs.

    Args:
        num_classes: Size of the final linear layer's output.
        dropout_rate: Dropout probability used after both hidden layers of
            the classifier.
    """

    def __init__(self, num_classes=6, dropout_rate=0.7):
        super().__init__()

        backbone = models.vgg16(pretrained=True)
        # Explicitly flag every parameter from features[8] onwards as trainable.
        for weight in backbone.features[8:].parameters():
            weight.requires_grad = True

        conv_layers = list(backbone.features.children())
        leading_layers = conv_layers[:24]
        trailing_layers = conv_layers[24:]
        self.features = nn.Sequential(
            *leading_layers,
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            *trailing_layers,
        )
        self.avgpool = backbone.avgpool

        hidden_units = 4096
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, hidden_units),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_units, num_classes),
        )

    def forward(self, x):
        """Run ``x`` through features, pooling and classifier; return the logits."""
        pooled = self.avgpool(self.features(x))
        flattened = torch.flatten(pooled, 1)
        return self.classifier(flattened)

# Build the network and restore the weights stored in the checkpoint file.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model = CustomVGG16(num_classes=6)
_checkpoint_state = torch.load('best_model_checkpoint.pth', map_location='cpu')
model.load_state_dict(_checkpoint_state)
del _checkpoint_state  # the weights now live in the model; drop the extra copy
model.to('cpu')
model.eval()  # switch to inference mode before any forward pass

def classify_frames_using_vgg(num_frames_to_process=100, camera_index=1):
    """Benchmark the VGG model's forward-pass latency on live camera frames.

    Reads up to ``num_frames_to_process`` frames from the capture device,
    preprocesses each one with the module-level ``transform`` and runs it
    through the module-level ``model``. Only the forward pass is timed; frame
    capture and preprocessing are excluded. The mean latency is printed.

    Args:
        num_frames_to_process: Maximum number of frames to time.
        camera_index: OpenCV capture device index. Defaults to 1, the device
            this benchmark has always used.

    Returns:
        None. Results (or a diagnostic message) are printed.
    """
    cap = cv2.VideoCapture(camera_index)
    total_inference_time = 0.0
    num_frames = 0

    try:
        if not cap.isOpened():
            print("Unable to open video feed.")
            return

        while num_frames < num_frames_to_process:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR; the PIL/torchvision pipeline expects RGB.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame)
            batch_t = torch.unsqueeze(transform(pil_image), 0).to('cpu')

            # perf_counter is monotonic and high-resolution, which suits
            # short-interval measurements better than wall-clock time.
            start_time = time.perf_counter()
            with torch.no_grad():
                model(batch_t)
            total_inference_time += (time.perf_counter() - start_time) * 1000  # ms

            num_frames += 1
    finally:
        # Always hand the camera back, even if preprocessing or inference fails.
        cap.release()

    if num_frames == 0:
        # Avoid dividing by zero when the device produced no frames at all.
        print("No frames were read from the video feed; nothing to average.")
        return

    average_inference_time = total_inference_time / num_frames

    print("for VGG16 model:")
    print(f"Avg Inference Time over {num_frames} frames: {average_inference_time:.2f} ms")

# Run the frame-processing function with its default arguments.
classify_frames_using_vgg()

for VGG16 model:
Avg Inference Time over 100 frames: 273.94 ms


In [20]:
#final one
import cv2
import torch
from torchvision import transforms, models 
from PIL import Image
import torch.nn as nn
import os
import numpy as np
import librosa
import pandas as pd
import wave
import pyaudio
import time
from collections import Counter
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Frame preprocessing: resize to 224x224, convert to a tensor, then normalise
# each channel with the given mean and standard deviation.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=_channel_mean, std=_channel_std),
    ]
)

class CustomVGG16(nn.Module):
    """Image classifier built on torchvision's VGG16 feature extractor.

    Differences from the stock network, as constructed here:
      * a ``BatchNorm2d(512)`` + ``ReLU`` pair is inserted before the layer
        at index 24 of ``vgg16().features``;
      * the classifier is replaced by a three-layer head ending in
        ``num_classes`` outputs, with dropout after each hidden layer.

    Args:
        num_classes: Number of outputs of the last linear layer.
        dropout_rate: Probability passed to both ``nn.Dropout`` layers.
    """

    def __init__(self, num_classes=6, dropout_rate=0.7):
        super().__init__()

        vgg = models.vgg16(pretrained=True)
        # Mark all parameters from features[8] onwards as requiring gradients.
        for p in vgg.features[8:].parameters():
            p.requires_grad = True

        stock_layers = list(vgg.features.children())
        extra_layers = [nn.BatchNorm2d(512), nn.ReLU(inplace=True)]
        self.features = nn.Sequential(
            *(stock_layers[:24] + extra_layers + stock_layers[24:])
        )
        self.avgpool = vgg.avgpool

        flattened_size = 512 * 7 * 7
        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 4096),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(dropout_rate),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return the classifier output for input batch ``x``."""
        feature_maps = self.features(x)
        pooled = self.avgpool(feature_maps)
        return self.classifier(torch.flatten(pooled, 1))



# Instantiate the image classifier and restore its weights from disk (CPU only).
# NOTE(review): torch.load unpickles the checkpoint; load trusted files only.
model = CustomVGG16(num_classes=6)
_vgg_state = torch.load('best_model_checkpoint.pth', map_location='cpu')
model.load_state_dict(_vgg_state)
del _vgg_state  # parameters are copied into the model; release the loaded dict
model.to('cpu')
model.eval()  # inference mode for all later forward passes

# Category names, looked up by the class index predicted by the image model.
class_names = [
    'Automotive_commercial',
    'Entertainment_commercial',
    'Food_commercial',
    'Healthcare_commercial',
    'Insurance_commercial',
    'Technology_Electronics_commercial',
]



# Load the pre-computed audio feature table used to fit the label encoder
# and the feature scaler that are reused at inference time.
features_df = pd.read_csv('features.csv')

# Every column except the last is treated as a feature; the last column is the label.
X = features_df.iloc[:, :-1].values  # features
y = features_df.iloc[:, -1].values  # labels

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Encode the labels as integer class indices; the fitted encoder is kept so
# predicted indices can be mapped back to the original label values later.
y = label_encoder.fit_transform(y)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.int64)

# Split the dataset into training, validation, and testing sets:
# 40% is held out first, then halved, giving a 60/20/20 split.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation and test splits.
# NOTE(review): X_* are tensors going in; StandardScaler presumably returns
# NumPy arrays, so X_* and y_* end up as different container types — confirm
# this is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Recurrent classifier used for the audio feature vectors
class RNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the linear layer's output for the final time step of ``x``."""
        sequence_output, _final_hidden = self.rnn(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Model dimensions are derived from the training split.
input_size = X_train.shape[1]  # number of feature columns
hidden_size = 64
num_classes = len(np.unique(y_train))  # distinct encoded labels present in the training split

# NOTE(review): model2 is created here with freshly initialised weights. No
# training loop or load_state_dict() call for it is visible in this cell, yet
# it is used for inference in record_and_classify_audio() — confirm trained
# weights are restored before relying on its predictions.
model2 = RNN(input_size, hidden_size, num_classes)

def extract_features(file_name):
    """Compute a fixed-length feature vector summarising one audio file.

    The file is loaded with librosa and each feature below is averaged over
    time; the averages are concatenated in this order: 40 MFCCs, chroma STFT,
    spectral contrast, spectral centroid, zero-crossing rate, spectral
    rolloff. The order must stay stable because the fitted scaler and the
    model consume the columns positionally.

    Args:
        file_name: Path of the audio file to analyse.

    Returns:
        A 1-D ``numpy.ndarray`` with the concatenated feature means, or
        ``None`` if loading or feature computation raised an exception (the
        reason is printed).
    """
    def _time_average(feature_matrix):
        # Transposing and averaging over axis 0 averages over the original
        # array's last axis, leaving one value per feature row.
        return np.mean(feature_matrix.T, axis=0)

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # Combine all features into a 1D array
        features = np.hstack([
            # MFCC (Mel-frequency cepstral coefficients)
            _time_average(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)),
            # Chroma feature
            _time_average(librosa.feature.chroma_stft(y=audio, sr=sample_rate)),
            # Spectral contrast
            _time_average(librosa.feature.spectral_contrast(y=audio, sr=sample_rate)),
            # Spectral centroid
            _time_average(librosa.feature.spectral_centroid(y=audio, sr=sample_rate)),
            # Zero-crossing rate
            _time_average(librosa.feature.zero_crossing_rate(y=audio)),
            # Spectral rolloff
            _time_average(librosa.feature.spectral_rolloff(y=audio, sr=sample_rate)),
        ])

        return features
    except Exception as e:
        # Best-effort by design: report the failure (including its cause, so
        # it can actually be diagnosed) and let the caller skip this file.
        print(f"Error encountered while parsing file: {file_name} ({type(e).__name__}: {e})")
        return None

def slide_and_record_additional_audio(existing_frames, seconds_to_record=10,
                                      sample_rate=44100, chunk_size=1024):
    """Advance the rolling audio window by ``seconds_to_record`` seconds.

    Drops the oldest chunks from ``existing_frames`` and records the same
    number of new chunks from the default input device, appending them to
    the end of the window.

    Args:
        existing_frames: List of raw audio chunks (``bytes``) forming the
            current window. It is not modified.
        seconds_to_record: Amount of audio, in seconds, to drop and re-record.
        sample_rate: Sampling rate in Hz used for recording.
        chunk_size: Number of samples read per chunk.

    Returns:
        A new list holding the remaining old chunks followed by the newly
        recorded ones.
    """
    chunks_to_replace = int(sample_rate / chunk_size * seconds_to_record)

    # Slicing produces a copy, so the caller's list is left untouched.
    window = existing_frames[chunks_to_replace:]

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        try:
            # Record new audio to append
            for _ in range(chunks_to_replace):
                window.append(stream.read(chunk_size))
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    return window

def record_and_classify_audio():
    """Continuously classify a rolling 60-second microphone window.

    Records an initial 60 seconds of mono 16-bit audio, then loops forever:
    the current window is written to ``recorded_audio.wav``, features are
    extracted and scaled with the module-level ``scaler``, and ``model2``
    predicts a label. A label of ``0.0`` is treated as a commercial and
    triggers ``classify_frames_using_vgg()``; a label of ``1.0`` is treated
    as the end of a commercial and only reported. The window is then
    advanced via ``slide_and_record_additional_audio`` and the loop sleeps
    for 3 seconds.

    The function never returns on its own; interrupt it to stop.
    """
    audio_duration = 60  # seconds
    sample_rate = 44100
    chunk_size = 1024
    output_audio_file = "recorded_audio.wav"

    # Record the initial window, then release the input device. Later window
    # updates open their own stream, so keeping this one open for the life of
    # the loop would only leak the handle.
    p = pyaudio.PyAudio()
    try:
        sample_width = p.get_sample_size(pyaudio.paInt16)
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                        input=True, frames_per_buffer=chunk_size)
        try:
            audio_frames = [
                stream.read(chunk_size)
                for _ in range(int(sample_rate / chunk_size * audio_duration))
            ]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Inference mode only needs to be set once, not on every iteration.
    model2.eval()

    # Main loop for continuous prediction and handling commercials
    while True:
        # Save the recorded audio to a WAV file
        with wave.open(output_audio_file, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(sample_width)
            wf.setframerate(sample_rate)
            wf.writeframes(b''.join(audio_frames))

        # Extract features from audio file
        recorded_features = extract_features(output_audio_file)
        if recorded_features is not None:
            recorded_df = pd.DataFrame([recorded_features])
            recorded_df = scaler.transform(recorded_df)
            recorded_tensor = torch.tensor(recorded_df, dtype=torch.float32)

            # Time only the RNN forward pass.
            start_time_rnn = time.time()
            with torch.no_grad():
                # unsqueeze(1) adds a sequence axis of length 1 for the RNN.
                predicted_output = model2(recorded_tensor.unsqueeze(1))
            end_time_rnn = time.time()

            rnn_inference_time = (end_time_rnn - start_time_rnn) * 1000  # Convert to milliseconds
            print(f"RNN Inference Time: {rnn_inference_time:.2f} ms")

            _, predicted_class = torch.max(predicted_output, 1)
            predicted_class_label = label_encoder.inverse_transform(predicted_class.numpy())
            print(f"Predicted Label: {predicted_class_label[0]}")

            # Handle the case where a commercial is detected
            if predicted_class_label[0] == 0.0:
                print("Commercial detected. Starting frame classification...")
                classify_frames_using_vgg()

            # Handle the case where a non-commercial is detected. Frame
            # classification is deliberately NOT started here: the message
            # announces that it stops.
            elif predicted_class_label[0] == 1.0:
                print("End of commercial detected. Stopping frame classification...")
        else:
            print("Error in feature extraction!")

        # Slide the audio window for next classification
        audio_frames = slide_and_record_additional_audio(audio_frames)

        time.sleep(3)  # Sleep before next cycle

def classify_frames_using_vgg():
    """Classify and display live camera frames for up to 10 seconds.

    Frames are read from capture device 0, preprocessed with the module-level
    ``transform`` and classified by the module-level ``model``. Each frame is
    shown in a window with its predicted label drawn on it. After every 10
    classified frames the most frequent label among them is printed. The loop
    ends when a frame cannot be read, 'q' is pressed, or 10 seconds have
    elapsed; the mean forward-pass time is then printed.

    Returns:
        None. Results (or a diagnostic message) are printed.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Unable to open video feed.")
        return

    vgg_inference_times = []  # per-frame forward-pass times in milliseconds
    labels_buffer = []
    start_time = time.time()

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR; the PIL/torchvision pipeline expects RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)
            batch_t = torch.unsqueeze(transform(pil_image), 0).to('cpu')

            inference_start_time = time.perf_counter()
            with torch.no_grad():
                out = model(batch_t)
            inference_time = (time.perf_counter() - inference_start_time) * 1000  # ms
            vgg_inference_times.append(inference_time)

            _, predicted = torch.max(out, 1)
            label = class_names[predicted.item()]
            labels_buffer.append(label)

            if len(labels_buffer) == 10:
                most_common_label = Counter(labels_buffer).most_common(1)[0][0]
                print(f"The commercial belongs to the category: {most_common_label}")
                labels_buffer.clear()

            # Draw on the captured frame as-is: it is already BGR, which is
            # what imshow expects, so converting it again would swap the red
            # and blue channels on screen.
            cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.imshow('Frame Classification', frame)

            # Check if 'q' is pressed or 10 seconds have passed
            if cv2.waitKey(1) & 0xFF == ord('q') or (time.time() - start_time) >= 10:
                break
    finally:
        # Release the camera and close the window even if a step above fails.
        cap.release()
        cv2.destroyAllWindows()

    if not vgg_inference_times:
        # Avoid dividing by zero when the device produced no frames at all.
        print("No frames were read from the video feed; nothing to average.")
        return

    # Compute and print average VGG inference time
    avg_inference_time = sum(vgg_inference_times) / len(vgg_inference_times)
    print(f"Average VGG Inference Time: {avg_inference_time:.2f} ms")
# Entry point: start the audio-monitoring loop, which runs until interrupted.
if __name__ == '__main__':
    record_and_classify_audio()




RNN Inference Time: 0.27 ms
Predicted Label: 0.0
Commercial detected. Starting frame classification...
The commercial belongs to the category: Technology_Electronics_commercial
The commercial belongs to the category: Technology_Electronics_commercial
The commercial belongs to the category: Technology_Electronics_commercial
Average VGG Inference Time: 275.98 ms
RNN Inference Time: 0.27 ms
Predicted Label: 0.0
Commercial detected. Starting frame classification...
The commercial belongs to the category: Technology_Electronics_commercial
The commercial belongs to the category: Technology_Electronics_commercial
The commercial belongs to the category: Technology_Electronics_commercial
Average VGG Inference Time: 282.51 ms


KeyboardInterrupt: 