In [1]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# VGGish framing configuration: audio is cut into fixed-length windows.
SAMPLE_RATE = 16_000  # Hz
FRAME_LENGTH = 0.96  # seconds per window
NUM_SAMPLES = int(SAMPLE_RATE * FRAME_LENGTH)  # samples contained in one window

In [3]:
def _load_mono(path, sample_rate):
    """Read one audio file and return a mono float32 waveform at ``sample_rate`` Hz."""
    audio, native_sr = sf.read(path, dtype='float32')
    # soundfile returns (frames, channels) for multi-channel files; downmix so
    # that resampling and framing operate on a 1-D signal.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if native_sr != sample_rate:
        # Keyword arguments are required by current librosa releases and are
        # accepted by older ones as well.
        audio = librosa.resample(audio, orig_sr=native_sr, target_sr=sample_rate)
    return np.asarray(audio, dtype=np.float32)


def _split_into_frames(audio, num_samples):
    """Zero-pad ``audio`` to a multiple of ``num_samples`` and reshape to (n_frames, num_samples)."""
    n_frames = int(np.ceil(len(audio) / num_samples))
    padded = np.pad(audio, (0, n_frames * num_samples - len(audio)), mode='constant')
    return padded.reshape(n_frames, num_samples)


def waveform_generator(files, labels, batch_size, sample_rate=SAMPLE_RATE, frame_length=FRAME_LENGTH):
    """Endlessly yield ``(frames, labels)`` batches of fixed-length waveform frames.

    Each file is read, downmixed to mono, resampled to ``sample_rate`` and cut
    into non-overlapping frames of ``frame_length`` seconds; the last frame of a
    file is zero-padded. Every frame inherits the label of its source file.

    Note that ``batch_size`` counts *files*, not frames: the number of rows in a
    yielded batch depends on how long the files in that batch are.

    Args:
        files: Sequence of audio file paths.
        labels: Sequence of labels, aligned with ``files``.
        batch_size: Number of files consumed per yielded batch (must be > 0).
        sample_rate: Target sample rate in Hz.
        frame_length: Frame duration in seconds.

    Yields:
        Tuple ``(x_batch, y_batch)`` where ``x_batch`` is a float32 array of
        shape ``(n_frames, frame_samples)`` and ``y_batch`` holds one label per
        frame. Batches in which no file produced a frame are skipped.

    Raises:
        ValueError: If ``files`` is empty, ``batch_size`` is not positive, or
            the frame size works out to zero samples.
        RuntimeError: If a complete pass over ``files`` produced no frames
            (raised instead of looping forever).
    """
    # Derive the frame size from the arguments so that overriding sample_rate
    # or frame_length actually takes effect.
    num_samples = int(round(frame_length * sample_rate))
    if num_samples <= 0:
        raise ValueError("frame_length * sample_rate must give at least one sample")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(files) == 0:
        raise ValueError("files must contain at least one path")

    while True:
        produced_any = False
        for start in range(0, len(files), batch_size):
            x_batch = []
            y_batch = []
            end = min(start + batch_size, len(files))
            for i in range(start, end):
                try:
                    frames = _split_into_frames(_load_mono(files[i], sample_rate), num_samples)
                except Exception as e:
                    # Best effort: report unreadable files and keep going.
                    print(f"Error loading {files[i]}: {e}")
                    continue
                x_batch.extend(frames)
                y_batch.extend([labels[i]] * len(frames))
            if not x_batch:
                # Every file in this batch failed or was empty; an empty array
                # is not a usable training batch.
                continue
            produced_any = True
            yield np.asarray(x_batch, dtype=np.float32), np.asarray(y_batch)
        if not produced_any:
            raise RuntimeError("No audio frames could be produced from any of the given files")

def load_files(directory, label):
    """List the regular files directly inside ``directory`` and label them.

    Entries are returned in sorted order: ``os.listdir`` makes no ordering
    guarantee, and a stable order is needed for a seeded train/test split to be
    reproducible across runs and machines.

    Args:
        directory: Folder to scan (not recursive; sub-directories are skipped).
        label: Label assigned to every file found.

    Returns:
        Tuple ``(files, labels)``: the full paths of the files and a list
        containing ``label`` once per file.
    """
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    files = [path for path in candidates if os.path.isfile(path)]
    labels = [label] * len(files)
    return files, labels

def create_vggish_model():
    """Assemble and compile a binary classifier on top of a frozen VGGish hub layer.

    The network is the TF-Hub VGGish module (``trainable=False``) followed by
    Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid). It is compiled
    with the Adam optimizer, binary cross-entropy loss and an accuracy metric.

    No input shape is declared, so the returned model is not built until it is
    first called on data.

    NOTE(review): confirm that the hub module accepts the batched
    (n_frames, samples) arrays fed to it by the training generator.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    embedding = hub.KerasLayer("https://tfhub.dev/google/vggish/1", trainable=False)

    classifier = tf.keras.Sequential()
    classifier.add(embedding)
    # Two hidden ReLU layers, then a single sigmoid unit for the binary output.
    for units in (128, 64):
        classifier.add(tf.keras.layers.Dense(units, activation='relu'))
    classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

In [4]:
# Dataset folders, one per class
porn_dir = r'A:\AI DB\LSPD\Audio\porn'
non_porn_dir = r'A:\AI DB\LSPD\Audio\normal'

# Collect paths: label 1 for the porn folder, label 0 for the normal folder
print("Loading file paths")
porn_files, porn_labels = load_files(porn_dir, 1)
non_porn_files, non_porn_labels = load_files(non_porn_dir, 0)
print("File paths loaded")

# Merge both classes, then make a stratified 85/15 train/test split
files = np.array([*porn_files, *non_porn_files])
labels = np.array([*porn_labels, *non_porn_labels])
X_train_files, X_test_files, y_train, y_test = train_test_split(
    files,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)


Loading file paths
File paths loaded


In [5]:
# Build and compile the VGGish-based binary classifier (see create_vggish_model).
model = create_vggish_model()

In [6]:
# NOTE(review): the recorded output of this cell is a ValueError because the model
# has not been built yet; summary() only works after build() or after the model has
# been called on a batch of data.
model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.

In [7]:
# Training parameters
batch_size = 32  # files per generator batch (each file expands to several frames)

# Use ceil division so that one epoch is exactly one pass over the files.
# Floor division gives 0 steps when there are fewer files than batch_size and
# otherwise drops the trailing partial batch, which makes the endless
# generators drift so that each epoch validates on a different subset.
steps_per_epoch = int(np.ceil(len(X_train_files) / batch_size))
validation_steps = int(np.ceil(len(X_test_files) / batch_size))

# Train model using the generators
print("Begin training")
train_gen = waveform_generator(X_train_files, y_train, batch_size)
val_gen = waveform_generator(X_test_files, y_test, batch_size)

# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(
    train_gen,
    epochs=15,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
)


Begin training


KeyboardInterrupt: 

In [None]:
# Evaluate model
print("Evaluating model")
# Use a fresh generator: val_gen has already been advanced by model.fit (or left
# mid-pass by an interrupted run), so evaluating on it would start at an
# arbitrary position in the held-out files instead of at the beginning.
eval_gen = waveform_generator(X_test_files, y_test, batch_size)
loss, accuracy = model.evaluate(eval_gen, steps=validation_steps)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')