In [4]:
import numpy as np
import soundfile as sf
import librosa 
import math
from glob import glob
import argparse
import os
import sys

In [67]:
class RawClip3(object):
    """Loads audio clips from disk, applies a rolling window, and
    extracts features from each sample."""
    featureFuncs = ['tonnetz', 'spectral_rolloff', 'spectral_contrast',
                    'spectral_bandwidth', 'spectral_flatness', 'mfcc',
                    'chroma_cqt', 'chroma_cens', 'melspectrogram']

    def __init__(self, sourcefile, Y_class=None):
        self.y, self.sr = sf.read(sourcefile)
        self.laughs = None
        self.Y_class = Y_class

    def resample(self, rate, channel):
        return librosa.resample(self.y.T[channel], self.sr, rate)

    def amp(self, rate=22050, n_fft=2048, channel=0):
        D = librosa.amplitude_to_db(librosa.magphase(librosa.stft(
            self.resample(rate, channel), n_fft=n_fft))[0], ref=np.max)
        return D

    def _extract_feature(self, func):
        method = getattr(librosa.feature, func)

        # Construct params for each 'class' of features
        params = {'y': self.raw}
        if 'mfcc' in func:
            params['sr'] = self.sr
            params['n_mfcc'] = 128
        if 'chroma' in func:
            params['sr'] = self.sr

        feature = method(**params)

        return feature

    def _split_features_into_windows(self, data, duration):
        # Apply a moving window
        windows = []

        # Pad the rightmost edge by repeating frames, simplifies stretching
        # the model predictions to the original audio later on.
        data = np.pad(data, [[0, duration], [0, 0]], mode='edge')
        for i in range(data.shape[0] - duration):
            windows.append(data[i:i + duration])

        return np.array(windows)

    def build_features(self, duration=30, milSamplesPerChunk=10):
        # Extract features, one chunk at a time (to reduce memory required)
        # Tip: about 65 million samples for a normal-length episode
        # 10 million samples results in around 1.5GB to 2GB memory use
        features = []

        chunkLen = milSamplesPerChunk * 1000000
        numChunks = math.ceil(self.y.shape[0] / chunkLen)

        for i in range(numChunks):
            # Set raw to the current chunk, for _extract_feature
            self.raw = self.y.T[0][i * chunkLen:(i + 1) * chunkLen]

            # For this chunk, run all of our feature extraction functions
            # Each returned array is in the shape (features, steps)
            # Use concatenate to combine (allfeatures, steps)
            chunkFeatures = np.concatenate(
                list(
                    map(self._extract_feature, self.featureFuncs)
                )
            )
            features.append(chunkFeatures)

        # Transform to be consistent with our LSTM expected input
        features = np.concatenate(features, axis=1).T
        # Combine our chunks along the time-step axis.
        features = self._split_features_into_windows(features, duration)

        return features

In [120]:
class LaughRemover(object):
    """Contains the logic to apply predictions as audio transformations"""
    def __init__(self, kerasModel=None, kerasModelFile=None):
        import keras
        assert kerasModel or kerasModelFile
        if kerasModel:
            self.model = kerasModel
        elif kerasModelFile:
            self.model = keras.models.load_model(filepath=kerasModelFile)

    def remove_laughs(self, infile, outfile):
        """Remove laughs from a single sound file"""
        rc = RawClip3(infile)
        rc.laughs = self.model.predict(rc.build_features())
        self._apply_laughs_array(rc.y, rc.sr, outfile, rc.laughs[:, 0])
        return rc

    def batch_remove_laughs(self, indir : str, outdir: str, batch_size: int=32):
        """Remove laughs from all files in a directory"""
        # If indir == outdir, processes files in-place 
        batch_of_features = []
        for filename in os.listdir(indir):
            rc = RawClip3(os.path.join(indir, filename))
            features = rc.build_features()
            # Need to add some form of padding to each file so that it can be batched for keras.
            # Then need to unpad so that original file duration is restored.
            # Right now, it just loads the model once, and runs all the files through it one-by-one. 
            rc.laughs = self.model.predict(features)
            self._apply_laughs_array(rc.y, rc.sr, os.path.join(outdir, filename), rc.laughs[:, 1])

    def _apply_laughs_array(self, y, sr, outfile, laughs):
        y.T[0] = self._apply_frames_to_samples(frames=laughs, samples=y.T[0])

        y.T[1] = self._apply_frames_to_samples(frames=laughs, samples=y.T[1])

        sf.write(outfile, y, sr) 

    def _apply_frames_to_samples(self, frames, samples, exp=1, period=15):
        # Apply a rolling average to smooth the laugh/notlaugh sections
        frames = np.convolve(frames, np.ones((period,)) / period, mode='same')
        # Each frame = default 512 samples, so expand over that period
        frames = np.repeat(frames, librosa.core.frames_to_samples(1))
        # Trim excess padding off the rightmost end
        frames = frames[:len(samples)]
        # Finally, apply audio volume change
        return samples * (frames ** exp)

In [121]:
def do_mute_laughs(sourceFile, outFile, model):
    params = {}
    if type(model) == str:
        params['kerasModelFile'] = model
    else:
        params['kerasModel'] = model

    laughr = LaughRemover(**params)

    arr=laughr.remove_laughs(sourceFile, outFile)

    return arr  

In [33]:
from keras.models import load_model
Model = load_model('model.h5')

print(Model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_1 (Batc  (None, 30, 296)          1184      
 hNormalization)                                                 
                                                                 
 lstm_1 (LSTM)               (None, 30, 64)            92416     
                                                                 
 lstm_2 (LSTM)               (None, 30, 64)            33024     
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                      

In [73]:
def print_params(sourceFile, outFile, model):
    params = {}
    if type(model) == str:
        params['kerasModelFile'] = model
    else:
        params['kerasModel'] = model

    print(params.items())

In [111]:
print_params(sourceFile='mn.wav',
                outFile='output.wav',
                model=Model)



dict_items([('kerasModel', <keras.engine.sequential.Sequential object at 0x00000276F3B925F0>)])


In [122]:
arr = do_mute_laughs(sourceFile='02-ff.wav',
                outFile='output.wav',
                model=Model)



In [123]:
arr

<__main__.RawClip3 at 0x2768f88aad0>

In [124]:
arr = do_mute_laughs(sourceFile='mn.wav',
                outFile='mnnn.wav',
                model=Model)



In [8]:
y, sr = librosa.load('compiled.wav')

def avg_amplitude(y, sr):
    second = []
    for s in range(0,len(y),sr):
        second.append( y[s:s+sr].mean() )
    print(s)

avg_amplitude(y,sr)

1940400


In [37]:
import librosa
import numpy as np
import librosa.display
import scipy
from matplotlib import pyplot as plt
from scipy.signal import butter, filtfilt

# Define the laughter frequency range (100 Hz to 500 Hz)
laughter_min_freq = 100
laughter_max_freq = 500

# Define the window and hop sizes for the STFT
window_size = 2048
hop_size = 512

# Load the first audio file
audio1, sr1 = librosa.load('12-ff.wav', sr=None)

# Apply a bandpass filter to isolate the laughter frequency range
b, a = scipy.signal.butter(2, [laughter_min_freq/(sr1/2), laughter_max_freq/(sr1/2)], btype='band')
audio1_filtered = scipy.signal.filtfilt(b, a, audio1)

# Calculate the STFT of the filtered audio signal
stft1 = librosa.stft(audio1_filtered, n_fft=window_size, hop_length=hop_size)

# Calculate the spectral energy within the laughter frequency range at each time frame of the STFT
laughter_energy1 = np.sum(np.abs(stft1[(laughter_min_freq*window_size//sr1):(laughter_max_freq*window_size//sr1), :])**2, axis=0)

# Compute the total energy and peak amplitude of the laughter waveform
total_energy1 = np.sum(laughter_energy1)
peak_amplitude1 = np.max(audio1_filtered)

# Normalize the total energy and peak amplitude values by the duration of the audio file
duration1 = len(audio1)/sr1
normalized_energy1 = total_energy1/duration1
normalized_amplitude1 = peak_amplitude1

# Load the second audio file
audio2, sr2 = librosa.load('08-ff.wav', sr=None)

# Apply the same processing to the second audio file
audio2_filtered = scipy.signal.filtfilt(b, a, audio2)
stft2 = librosa.stft(audio2_filtered, n_fft=window_size, hop_length=hop_size)
laughter_energy2 = np.sum(np.abs(stft2[(laughter_min_freq*window_size//sr2):(laughter_max_freq*window_size//sr2), :])**2, axis=0)
total_energy2 = np.sum(laughter_energy2)
peak_amplitude2 = np.max(audio2_filtered)
duration2 = len(audio2)/sr2
normalized_energy2 = total_energy2/duration2
normalized_amplitude2 = peak_amplitude2

# Compare the normalized intensity and frequency values between the two audio files
print(f"Normalized energy of file 1: {normalized_energy1:.2f}")
print(f"Normalized energy of file 2: {normalized_energy2:.2f}")
print(f"Normalized amplitude of file 1: {normalized_amplitude1:.2f}")
print(f"Normalized amplitude of file 2: {normalized_amplitude2:.2f}")

Normalized energy of file 1: 26136.44
Normalized energy of file 2: 17356.37
Normalized amplitude of file 1: 0.20
Normalized amplitude of file 2: 0.17


In [45]:
import librosa
import numpy as np

def laughter_score(audio_file):
    # Load audio file and extract features
    y, sr = librosa.load(audio_file)
    laughter_segments = librosa.effects.split(y, top_db=20)
    laughter_duration = librosa.get_duration(y)
    num_laugh_segments = len(laughter_segments)
    laugh_variability = np.std(librosa.feature.rms(y=librosa.effects.trim(y)[0]))
    laughs_per_minute = num_laugh_segments / (laughter_duration / 60)
    max_loudness = np.max(y)
    energy = np.sum(y ** 2)

    # Assign weights to each feature
    duration_weight = 0.2
    num_laugh_weight = 0.1
    variability_weight = 0.2
    laughs_per_min_weight = 0.1
    max_loudness_weight = 0.2
    energy_weight = 0.2

    # Normalize each feature to a score between 0 and 10
    duration_score = (laughter_duration / 20) * duration_weight
    num_laugh_score = (num_laugh_segments / 5) * num_laugh_weight
    variability_score = (laugh_variability / 0.15) * variability_weight
    laughs_per_min_score = (laughs_per_minute / 5) * laughs_per_min_weight
    max_loudness_score = (max_loudness / 2) * max_loudness_weight
    energy_score = (energy / 2) * energy_weight

    # Combine normalized scores into final score
    final_score = (duration_score + num_laugh_score + variability_score + laughs_per_min_score + max_loudness_score + energy_score) * 10 / (duration_weight + num_laugh_weight + variability_weight + laughs_per_min_weight + max_loudness_weight + energy_weight)

    return final_score



In [46]:
print(laughter_score('output.wav'))

307.78302942180903


  1.0093950e-09  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  laughter_duration = librosa.get_duration(y)


In [19]:
import librosa
import numpy as np

# Load the audio file
audio, sr = librosa.load('compiled.wav')

# Detect onsets in the laughter segment
onset_env = librosa.onset.onset_strength(audio, sr=sr)
onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr)

# Extract the laughter segments
segments = []
for i in range(len(onset_frames)):
    start = onset_frames[i]
    if i == len(onset_frames) - 1:
        end = audio.size
    else:
        end = onset_frames[i+1]
    segment = audio[start:end]
    segments.append(segment)

# Compute the intensity of each laugh segment
intensities = []
for segment in segments:
    energy = np.sum(segment**2)
    rms_energy = np.sqrt(energy / segment.size)
    intensity = 10 * np.log10(energy / segment.size)
    intensities.append(intensity)

# Normalize the intensities to a range of 0 to 1
max_intensity = np.max(intensities)
norm_intensities = intensities / max_intensity

# Adjust the length of the weighting array to match the number of laughs
w = np.ones(len(norm_intensities))
w /= np.sum(w)

# Compute the weighted average of the normalized intensities
score = np.dot(w, norm_intensities)

print(score)

2.5129751925799706


  onset_env = librosa.onset.onset_strength(audio, sr=sr)


In [29]:
import os
import numpy as np
import pandas as pd
import librosa

# Define a function to calculate the intensity of laughter using RMS value and duration
def calculate_intensity(audio_file):
    y, sr = librosa.load(audio_file, sr=None)  # Load the audio file
    y, _ = librosa.effects.trim(y)  # Remove silent portions of the signal
    
    # Calculate the RMS value of the signal
    rms = np.sqrt(np.mean(y ** 2))
    
    # Calculate the duration of the laughter in seconds
    duration = librosa.get_duration(y=y, sr=sr)
    
    # Calculate the intensity of the laughter by normalizing the RMS value with the duration
    intensity = rms / duration
    
    return intensity*10000

# Example usage
intensity = calculate_intensity('compiled.wav')
print(f'Intensity of laughter: {intensity}')


Intensity of laughter: 8.811465228085329
