In [None]:
from gtts import gTTS
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd
import speech_recognition as sr

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Preprocess the audio
def preprocess_audio(audio):
    """Return `audio` rescaled by `librosa.util.normalize` (default settings)."""
    return librosa.util.normalize(audio)

# Add noise to the audio
def add_noise(audio, noise_level=0.02, rng=None):
    """Add zero-mean Gaussian noise to `audio` and clip to [-1, 1].

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        noise_level: Standard deviation of the noise.
        rng: Optional seed or `np.random.Generator` for reproducible noise.
            When omitted, NumPy's global random state is used as before.

    Returns:
        Array of the same shape as `audio`, clipped to the valid range.
    """
    samples = np.asarray(audio)
    if rng is None:
        # Keep honouring np.random.seed() for existing callers.
        noise = np.random.normal(0, noise_level, samples.shape)
    else:
        noise = np.random.default_rng(rng).normal(0, noise_level, samples.shape)
    return np.clip(samples + noise, -1, 1)  # Ensure valid audio range

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Main function to run the attack
def main():
    """Random-noise baseline: synthesize a sentence, add noise, transcribe both.

    Writes the synthesized speech, the re-saved original and the noisy copy
    to the working directory, displays both players and prints both
    transcriptions.
    """
    text = "Hello, this is a test sentence for adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # Add noise to create adversarial audio, then save and play it
    noisy_audio = add_noise(audio, noise_level=0.02)
    save_audio(noisy_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for adversarial attack experiments
Adversarial Audio Transcription: hello this is a test sentence for adversarial attack experiments


In [None]:
!pip install gtts


Collecting gtts
  Downloading gTTS-2.5.2-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.2


In [None]:
!pip install speechrecognition

Collecting speechrecognition
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: speechrecognition
Successfully installed speechrecognition-3.10.4


In [None]:
from gtts import gTTS
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd
import speech_recognition as sr
from scipy.optimize import minimize

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Preprocess the audio
def preprocess_audio(audio):
    """Return `audio` rescaled by `librosa.util.normalize` (default settings)."""
    return librosa.util.normalize(audio)

# Objective function for optimization
def objective_function(perturbation, audio, target_text):
    """Score how far the recognizer's output is from `target_text`.

    The perturbed audio is written to 'temp_adversarial.wav', transcribed,
    and compared with the target as text.

    Args:
        perturbation: Additive perturbation with the same shape as `audio`.
        audio: Clean waveform.
        target_text: Transcription the attack is trying to obtain.

    Returns:
        Float in [0, 1]: 0.0 when the transcription equals the target
        (ignoring case and extra whitespace), growing as they diverge.
    """
    from difflib import SequenceMatcher

    perturbation = np.asarray(perturbation, dtype=np.float32)
    adversarial_audio = np.clip(audio + perturbation, -1, 1)  # Ensure valid audio range

    # Save and transcribe adversarial audio
    save_audio(adversarial_audio, 'temp_adversarial.wav')
    adversarial_transcription = transcribe_audio_google('temp_adversarial.wav')

    # Strings cannot be subtracted as arrays (that raised UFuncTypeError);
    # compare them with a sequence-similarity ratio instead.
    wanted = " ".join(str(target_text).lower().split())
    heard = " ".join(str(adversarial_transcription).lower().split())
    similarity = SequenceMatcher(None, wanted, heard, autojunk=False).ratio()
    return 1.0 - similarity

# Generate perturbations using optimization
def perturb_audio(audio, target_text, max_queries=20, bound=0.2, seed=None):
    """Search for a bounded perturbation that lowers `objective_function`.

    The objective is a black-box score from a remote recognizer, and it is
    piecewise constant in the samples. A finite-difference gradient method
    therefore needs one recognizer call per audio sample for every gradient
    and still sees a zero slope. This uses a query-budgeted random search
    instead and keeps the best candidate found.

    Args:
        audio: Clean waveform.
        target_text: Transcription the attack is trying to obtain.
        max_queries: Upper limit on `objective_function` calls.
        bound: Maximum absolute perturbation per sample (0.2 matches the
            previous optimizer bounds).
        seed: Optional seed for reproducible candidates.

    Returns:
        `audio` plus the best perturbation found, clipped to [-1, 1]. If no
        candidate beats the unperturbed signal, that signal is returned.
    """
    audio = np.asarray(audio, dtype=np.float32)
    best_perturbation = np.zeros_like(audio)
    if audio.size == 0 or max_queries < 1:
        return np.clip(audio, -1, 1)

    rng = np.random.default_rng(seed)
    best_loss = objective_function(best_perturbation, audio, target_text)
    for _ in range(max_queries - 1):
        if best_loss <= 0:
            break  # recognizer already outputs the target
        candidate = rng.uniform(-bound, bound, size=audio.shape).astype(np.float32)
        candidate_loss = objective_function(candidate, audio, target_text)
        if candidate_loss < best_loss:
            best_loss, best_perturbation = candidate_loss, candidate

    return np.clip(audio + best_perturbation, -1, 1)  # Ensure valid audio range

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Main function to run the attack
def main():
    """Targeted attack demo: perturb synthesized speech toward `target_text`.

    Writes the synthesized speech, the re-saved original and the perturbed
    copy to the working directory, displays both players and prints both
    transcriptions.
    """
    original_text = "Hello, this is a test sentence for adversarial attack experiments."
    target_text = "Hello World"  # Desired output transcription

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(original_text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # NOTE: the saved original is the un-normalized signal, while the attack
    # below starts from the normalized one.
    audio = preprocess_audio(audio)

    # Generate adversarial examples
    adversarial_audio = perturb_audio(audio, target_text)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('<U11'), dtype('<U64')) -> None

In [None]:
!pip install gtts librosa soundfile scipy SpeechRecognition

from gtts import gTTS
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd
import speech_recognition as sr
from scipy.optimize import minimize

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Preprocess the audio
def preprocess_audio(audio):
    """Return `audio` rescaled by `librosa.util.normalize` (default settings)."""
    return librosa.util.normalize(audio)

# Hardcoded psychoacoustic masking threshold
def calculate_masking_threshold(audio, level=0.02):
    """Return a constant per-sample threshold shaped like `audio`.

    This is a placeholder, not a psychoacoustic model: every sample gets the
    same `level`, regardless of the signal's content.

    Args:
        audio: Waveform whose shape the threshold should match.
        level: Threshold value for every sample; defaults to 0.02, the value
            that used to be hard-coded.

    Returns:
        float32 array with the shape of `audio`, filled with `level`.
    """
    threshold = np.full_like(audio, level, dtype=np.float32)
    return threshold

# Objective function for optimization
def objective_function(perturbation, audio, target_text, threshold):
    """Score how far the recognizer's output is from `target_text`.

    The perturbed audio is written to 'temp_adversarial.wav', transcribed,
    and compared with the target as text.

    Args:
        perturbation: Additive perturbation with the same shape as `audio`.
        audio: Clean waveform.
        target_text: Transcription the attack is trying to obtain.
        threshold: Per-sample magnitude limit for the perturbation, or None
            to leave the perturbation unclamped.

    Returns:
        Float in [0, 1]: 0.0 when the transcription equals the target
        (ignoring case and extra whitespace), growing as they diverge.
    """
    from difflib import SequenceMatcher

    perturbation = np.asarray(perturbation, dtype=np.float32)  # Ensure correct type
    if threshold is not None:
        # The threshold was previously accepted but never applied.
        limit = np.abs(np.asarray(threshold, dtype=np.float32))
        perturbation = np.clip(perturbation, -limit, limit)
    adversarial_audio = np.clip(audio + perturbation, -1, 1)  # Ensure valid audio range

    # Save and transcribe adversarial audio
    save_audio(adversarial_audio, 'temp_adversarial.wav')
    adversarial_transcription = transcribe_audio_google('temp_adversarial.wav')

    # Comparing lengths alone scores any equally long sentence as a perfect
    # match; compare the text itself.
    wanted = " ".join(str(target_text).lower().split())
    heard = " ".join(str(adversarial_transcription).lower().split())
    similarity = SequenceMatcher(None, wanted, heard, autojunk=False).ratio()
    return 1.0 - similarity

# Generate perturbations using optimization
def perturb_audio(audio, target_text, threshold, max_queries=20, seed=None):
    """Search for a threshold-bounded perturbation that lowers the objective.

    The objective is a black-box score from a remote recognizer, and it is
    piecewise constant in the samples. A finite-difference gradient method
    therefore needs one recognizer call per audio sample for every gradient
    and still sees a zero slope. This uses a query-budgeted random search
    instead and keeps the best candidate found.

    Args:
        audio: Clean waveform.
        target_text: Transcription the attack is trying to obtain.
        threshold: Per-sample magnitude limit (scalar or array broadcastable
            to `audio`). None falls back to 0.02, the previous fixed bound.
        max_queries: Upper limit on `objective_function` calls.
        seed: Optional seed for reproducible candidates.

    Returns:
        `audio` plus the best perturbation found, clipped to [-1, 1]. If no
        candidate beats the unperturbed signal, that signal is returned.
    """
    audio = np.asarray(audio, dtype=np.float32)
    best_perturbation = np.zeros_like(audio)
    if audio.size == 0 or max_queries < 1:
        return np.clip(audio, -1, 1)

    if threshold is None:
        limit = np.full_like(audio, 0.02)
    else:
        limit = np.broadcast_to(
            np.abs(np.asarray(threshold, dtype=np.float32)), audio.shape
        )

    rng = np.random.default_rng(seed)
    best_loss = objective_function(best_perturbation, audio, target_text, threshold)
    for _ in range(max_queries - 1):
        if best_loss <= 0:
            break  # recognizer already outputs the target
        candidate = (rng.uniform(-1.0, 1.0, size=audio.shape) * limit).astype(np.float32)
        candidate_loss = objective_function(candidate, audio, target_text, threshold)
        if candidate_loss < best_loss:
            best_loss, best_perturbation = candidate_loss, candidate

    return np.clip(audio + best_perturbation, -1, 1)  # Ensure valid audio range

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Main function to run the attack
def main():
    """Threshold-bounded targeted attack demo on synthesized speech.

    Writes the synthesized speech, the re-saved original and the perturbed
    copy to the working directory, displays both players and prints both
    transcriptions.
    """
    text = "Hello, this is a test sentence for adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    target_text = 'Hello World'  # Desired output transcription

    # NOTE: the saved original is the un-normalized signal, while the attack
    # below starts from the normalized one.
    audio = preprocess_audio(audio)

    # Constant placeholder threshold (not a real psychoacoustic model)
    threshold = calculate_masking_threshold(audio)

    # Generate adversarial examples
    adversarial_audio = perturb_audio(audio, target_text, threshold)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


KeyboardInterrupt: 

In [None]:
!pip install gtts librosa soundfile tensorflow speechrecognition numpy

import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf
import speech_recognition as sr
from gtts import gTTS
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Define a simple audio classification model for demonstration
def build_model(input_shape):
    """Assemble and compile a small 1-D convolutional classifier.

    Stack: Conv1D(32, kernel 3, ReLU) -> MaxPooling1D(2) -> Flatten ->
    Dense(64, ReLU) -> Dense(10, softmax). Compiled with Adam, sparse
    categorical cross-entropy and an accuracy metric. The returned model is
    untrained.
    """
    keras_layers = tf.keras.layers
    stack = [
        keras_layers.Input(shape=input_shape),
        keras_layers.Conv1D(32, 3, activation='relu'),
        keras_layers.MaxPooling1D(pool_size=2),
        keras_layers.Flatten(),
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dense(10, activation='softmax'),  # Assuming 10 classes
    ]
    classifier = tf.keras.Sequential(stack)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# FGSM attack implementation
def fgsm_attack(model, audio, label, epsilon=0.01):
    """Take one signed-gradient step of size `epsilon` on `audio`.

    The loss is sparse categorical cross-entropy between `label` and the
    model's prediction for `audio` (with a leading batch axis added). The
    input is moved in the direction that increases that loss and clipped to
    [-1, 1].

    Returns:
        The perturbed waveform as a NumPy array.
    """
    waveform = tf.convert_to_tensor(audio, dtype=tf.float32)

    with tf.GradientTape() as recorder:
        # Plain tensors are not tracked unless explicitly watched.
        recorder.watch(waveform)
        batch = tf.expand_dims(waveform, axis=0)
        probabilities = model(batch)
        loss_value = tf.keras.losses.sparse_categorical_crossentropy(label, probabilities)

    input_gradient = recorder.gradient(loss_value, waveform)
    step = epsilon * tf.sign(input_gradient)
    bounded = tf.clip_by_value(waveform + step, -1, 1)  # Ensure valid audio range
    return bounded.numpy()

# Main function to run the attack
def main():
    """FGSM demo on synthesized speech against a freshly built classifier.

    The classifier is built here and never trained, so the gradients come
    from randomly initialised weights rather than from the speech recognizer
    whose transcriptions are printed at the end.
    """
    text = "Hello, this is a test sentence for adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # Define model input shape and build model
    input_shape = (audio.shape[0], 1)
    model = build_model(input_shape)

    # Assuming the label for the original audio is 0 (placeholder)
    label = 0

    # Perform FGSM attack
    epsilon = 0.01  # Perturbation magnitude
    adversarial_audio = fgsm_attack(model, audio, label, epsilon)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for adversarial attack experiments
Adversarial Audio Transcription: hello this is a test sentence for adversarial attack experiments


In [None]:
!pip install gtts librosa soundfile tensorflow SpeechRecognition numpy

import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf
import speech_recognition as sr
from gtts import gTTS
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Define a simple audio classification model for demonstration
def build_model(input_shape):
    """Assemble and compile a small 1-D convolutional classifier.

    Stack: Conv1D(32, kernel 3, ReLU) -> MaxPooling1D(2) -> Flatten ->
    Dense(64, ReLU) -> Dense(10, softmax). Compiled with Adam, sparse
    categorical cross-entropy and an accuracy metric. The returned model is
    untrained.
    """
    keras_layers = tf.keras.layers
    stack = [
        keras_layers.Input(shape=input_shape),
        keras_layers.Conv1D(32, 3, activation='relu'),
        keras_layers.MaxPooling1D(pool_size=2),
        keras_layers.Flatten(),
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dense(10, activation='softmax'),  # Assuming 10 classes
    ]
    classifier = tf.keras.Sequential(stack)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Gradient Descent for adversarial audio generation
def gradient_descent_attack(model, audio, target_label, epsilon=0.01, iterations=100, max_perturbation=None):
    """Iteratively push `audio` toward `target_label` with Adam.

    Args:
        model: Classifier called on the waveform with a leading batch axis.
        audio: Clean waveform.
        target_label: Class index whose cross-entropy loss is minimized.
        epsilon: Adam learning rate. It sets the step size, not a bound on
            the total perturbation.
        iterations: Number of optimizer steps; 0 returns the clipped input.
        max_perturbation: Optional bound on |adversarial - audio| enforced
            after every step. None (default) leaves the drift unbounded, as
            before.

    Returns:
        The optimized waveform as a NumPy array clipped to [-1, 1].
    """
    original = tf.convert_to_tensor(audio, dtype=tf.float32)
    adversarial = tf.Variable(original)
    optimizer = tf.keras.optimizers.Adam(learning_rate=epsilon)

    for _ in range(iterations):
        # Variables are tracked by the tape automatically.
        with tf.GradientTape() as tape:
            prediction = model(tf.expand_dims(adversarial, axis=0))
            loss = tf.keras.losses.sparse_categorical_crossentropy(target_label, prediction)
        gradient = tape.gradient(loss, adversarial)
        optimizer.apply_gradients([(gradient, adversarial)])
        if max_perturbation is not None:
            adversarial.assign(tf.clip_by_value(
                adversarial, original - max_perturbation, original + max_perturbation
            ))

    # Convert once after the loop; this also covers iterations == 0, where
    # the result used to be unbound.
    return np.clip(adversarial.numpy(), -1, 1)  # Ensure valid audio range

# Main function to run the attack
def main():
    """Iterative gradient attack demo against a freshly built classifier.

    The classifier is built here and never trained, so the gradients come
    from randomly initialised weights rather than from the speech recognizer
    whose transcriptions are printed at the end.
    """
    text = "Hello, this is a test sentence for adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # Define model input shape and build model
    input_shape = (audio.shape[0], 1)
    model = build_model(input_shape)

    # Assuming the label for the original audio is 0 (placeholder)
    target_label = 0

    # Perform gradient descent attack
    epsilon = 0.01  # Used by the attack as the Adam learning rate
    iterations = 100
    adversarial_audio = gradient_descent_attack(model, audio, target_label, epsilon, iterations)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for adversarial attack experiments
Adversarial Audio Transcription: hello this is a test sentence for adversarial a pack experiment


In [None]:
# !pip install gtts librosa soundfile tensorflow SpeechRecognition numpy

import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf
import speech_recognition as sr
from gtts import gTTS
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Define a simple audio classification model for demonstration
def build_model(input_shape):
    """Assemble and compile a small 1-D convolutional classifier.

    Stack: Conv1D(32, kernel 3, ReLU) -> MaxPooling1D(2) -> Flatten ->
    Dense(64, ReLU) -> Dense(10, softmax). Compiled with Adam, sparse
    categorical cross-entropy and an accuracy metric. The returned model is
    untrained.
    """
    keras_layers = tf.keras.layers
    stack = [
        keras_layers.Input(shape=input_shape),
        keras_layers.Conv1D(32, 3, activation='relu'),
        keras_layers.MaxPooling1D(pool_size=2),
        keras_layers.Flatten(),
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dense(10, activation='softmax'),  # Assuming 10 classes
    ]
    classifier = tf.keras.Sequential(stack)
    classifier.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Gradient Descent for adversarial audio generation
def gradient_descent_attack(model, audio, target_label, epsilon=0.001, iterations=100, regularization_strength=0.1):
    """Iteratively push `audio` toward `target_label`, penalizing the change.

    Args:
        model: Classifier called on the waveform with a leading batch axis.
        audio: Clean waveform.
        target_label: Class index whose cross-entropy loss is minimized.
        epsilon: Adam learning rate. It sets the step size, not a bound on
            the total perturbation.
        iterations: Number of optimizer steps; 0 returns the clipped input.
        regularization_strength: Weight of the squared-L2 penalty on the
            perturbation (adversarial minus original).

    Returns:
        The optimized waveform as a NumPy array clipped to [-1, 1].
    """
    original = tf.convert_to_tensor(audio, dtype=tf.float32)
    adversarial = tf.Variable(original)
    optimizer = tf.keras.optimizers.Adam(learning_rate=epsilon)

    for _ in range(iterations):
        # Variables are tracked by the tape automatically.
        with tf.GradientTape() as tape:
            prediction = model(tf.expand_dims(adversarial, axis=0))
            loss = tf.keras.losses.sparse_categorical_crossentropy(target_label, prediction)

            # Penalize the perturbation, not the signal itself: squaring the
            # raw waveform rewards silencing the audio.
            perturbation = adversarial - original
            regularization_loss = regularization_strength * tf.reduce_sum(tf.square(perturbation))
            total_loss = loss + regularization_loss

        gradient = tape.gradient(total_loss, adversarial)
        optimizer.apply_gradients([(gradient, adversarial)])

    # Convert once after the loop; this also covers iterations == 0, where
    # the result used to be unbound.
    return np.clip(adversarial.numpy(), -1, 1)  # Ensure valid audio range

# Main function to run the attack
def main():
    """Regularized gradient attack demo against a freshly built classifier.

    The classifier is built here and never trained, so the gradients come
    from randomly initialised weights rather than from the speech recognizer
    whose transcriptions are printed at the end.
    """
    text = "Hello, this is a test sentence for adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # Define model input shape and build model
    input_shape = (audio.shape[0], 1)
    model = build_model(input_shape)

    # Assuming the label for the original audio is 0 (placeholder)
    target_label = 0

    # Perform gradient descent attack
    epsilon = 0.001  # Used by the attack as the Adam learning rate
    iterations = 100
    regularization_strength = 0.1  # Regularization to control perturbation magnitude
    adversarial_audio = gradient_descent_attack(model, audio, target_label, epsilon, iterations, regularization_strength)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for adversarial attack experiments
Adversarial Audio Transcription: Google Speech Recognition could not understand the audio


In [None]:
# !pip install gtts librosa soundfile numpy scipy scikit-learn SpeechRecognition

import numpy as np
import librosa
import soundfile as sf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from gtts import gTTS
import speech_recognition as sr
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# PCA-based adversarial attack
def pca_adversarial_attack(audio, n_components=1, perturbation_strength=0.01, random_state=None):
    """Add Gaussian noise to `audio` in a standardized, PCA-projected space.

    The waveform is treated as a single feature column (shape (n, 1)), so
    PCA has exactly one component and the projection is a signed copy of
    the standardized signal. In effect this adds Gaussian noise whose
    standard deviation is `perturbation_strength` times the signal's
    standard deviation.

    Args:
        audio: 1-D waveform; anything `np.asarray` accepts.
        n_components: Passed to `PCA`. With one feature column only 1 is
            valid; larger values are rejected by scikit-learn.
        perturbation_strength: Noise standard deviation in PCA space.
        random_state: Optional seed for reproducible noise. When omitted,
            NumPy's global random state is used as before.

    Returns:
        1-D perturbed waveform clipped to [-1, 1].
    """
    samples = np.asarray(audio).reshape(-1, 1)

    # Standardize the audio data
    scaler = StandardScaler()
    standardized = scaler.fit_transform(samples)

    # Apply PCA
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(standardized)

    # Generate perturbations in PCA space
    if random_state is None:
        noise = np.random.normal(loc=0.0, scale=perturbation_strength, size=projected.shape)
    else:
        noise = np.random.default_rng(random_state).normal(
            loc=0.0, scale=perturbation_strength, size=projected.shape
        )

    # Add perturbations, reconstruct, and undo the standardization
    reconstructed = pca.inverse_transform(projected + noise)
    adversarial_audio = scaler.inverse_transform(reconstructed).flatten()
    return np.clip(adversarial_audio, -1, 1)  # Ensure valid audio range

# Main function to run the attack
def main():
    """PCA-noise demo: synthesize a sentence, perturb it, transcribe both.

    Writes the synthesized speech, the re-saved original and the perturbed
    copy to the working directory, displays both players and prints both
    transcriptions.
    """
    text = "Hello, this is a test sentence for PCA-based adversarial attack experiments."

    # gTTS emits MP3 data, so give the synthesized file a matching extension.
    generate_spoken_text_audio(text, 'test_audio.mp3')

    # Avoid reusing the name `sr`, which is the speech_recognition alias at
    # module level; the returned rate is not needed here.
    audio, _sample_rate = load_audio('test_audio.mp3')

    # Save and play the original audio
    save_audio(audio, 'original_audio.wav')
    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    # Perform PCA-based adversarial attack
    adversarial_audio = pca_adversarial_attack(audio, n_components=1, perturbation_strength=0.01)

    # Save and play the adversarial audio
    save_audio(adversarial_audio, 'adversarial_audio.wav')
    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe the original and adversarial audio
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for PCA based adversarial attack experiments
Adversarial Audio Transcription: hello this is a test sentence for PCA based adversarial attack experiments


In [None]:
# !pip install gtts librosa soundfile numpy scipy scikit-learn SpeechRecognition

import numpy as np
import librosa
import soundfile as sf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from gtts import gTTS
import speech_recognition as sr
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path, lang='en'):
    """Synthesize `text` with gTTS and write the result to `file_path`.

    Args:
        text: Sentence to speak.
        file_path: Destination path handed to `gTTS.save`.
        lang: Language code forwarded to gTTS; defaults to 'en', the value
            that used to be hard-coded.

    Note:
        gTTS produces MP3 data; the file extension does not change the
        encoding, so a ``.wav`` name still holds MP3 bytes.
    """
    speech = gTTS(text, lang=lang)
    speech.save(file_path)

# Load the audio file
def load_audio(file_path, sample_rate=16000):
    """Decode `file_path` with librosa, resampled to `sample_rate` Hz.

    Args:
        file_path: Audio file to read.
        sample_rate: Target rate passed to `librosa.load` as `sr`; defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The `(samples, rate)` pair exactly as `librosa.load` returns it.
    """
    # Local names deliberately avoid `sr`, the speech_recognition module alias.
    samples, rate = librosa.load(file_path, sr=sample_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write `audio` to `file_path` through soundfile.

    Args:
        audio: Waveform samples; anything `np.asarray` accepts.
        file_path: Destination file.
        sample_rate: Rate in Hz stored in the file. Defaults to 16000, the
            value that used to be hard-coded.

    Samples are cast to float32 and, with a printed warning, clipped to
    [-1, 1] when any value falls outside that range.
    """
    # asarray also accepts lists, which have no `.dtype` to inspect.
    samples = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise on an empty array.
    if samples.size and (samples.min() < -1 or samples.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        samples = np.clip(samples, -1, 1)

    sf.write(file_path, samples, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe an audio file with `Recognizer.recognize_google`.

    Returns the recognized text, or one of two fixed fallback sentences when
    the speech is not understood or the request fails. Callers cannot tell
    those sentences apart from a real transcription by type alone.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as wav_source:
        recording = engine.record(wav_source)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an `IPython.display.Audio` widget for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# PCA-based adversarial attack
def pca_adversarial_attack(audio, n_components=10, perturbation_strength=0.1):
    """Perturb a waveform with Gaussian noise added in standardized PCA space.

    The signal is reshaped into a single column (one feature), so the
    requested `n_components` is always capped at min(n_samples, 1) before
    the PCA is fitted.

    Args:
        audio: NumPy array of samples; it is flattened into one column.
        n_components: Requested number of principal components.
        perturbation_strength: Standard deviation of the Gaussian noise,
            applied in the scaler's standardized coordinates.

    Returns:
        Flattened perturbed waveform, clipped to [-1, 1].
    """
    column = audio.reshape(-1, 1)

    standardizer = StandardScaler()
    standardized = standardizer.fit_transform(column)

    # PCA cannot keep more components than min(n_samples, n_features).
    n_components = min(n_components, *standardized.shape)

    projector = PCA(n_components=n_components)
    projected = projector.fit_transform(standardized)

    # Gaussian noise drawn directly in the projected coordinates.
    noise = np.random.normal(loc=0.0, scale=perturbation_strength, size=projected.shape)

    # Undo the projection first, then the standardization, to return to sample values.
    back_in_scaled_space = projector.inverse_transform(projected + noise)
    restored = standardizer.inverse_transform(back_in_scaled_space).flatten()

    return np.clip(restored, -1, 1)

# Main function to run the attack
def main():
    """Run the PCA-noise demo end to end: synthesize, perturb, play, transcribe."""
    tts_path = 'test_audio.wav'
    clean_path = 'original_audio.wav'
    attacked_path = 'adversarial_audio.wav'

    sentence = "Hello, this is a test sentence for PCA-based adversarial attack experiments."
    generate_spoken_text_audio(sentence, tts_path)

    # The returned sample rate is not used by the rest of the demo.
    waveform, _ = load_audio(tts_path)

    # Persist and audition the unmodified signal.
    save_audio(waveform, clean_path)
    print("Original Audio:")
    display(play_audio(clean_path))

    # Perturb the signal, then persist and audition the result.
    attacked = pca_adversarial_attack(waveform, n_components=10, perturbation_strength=0.1)
    save_audio(attacked, attacked_path)
    print("Adversarial Audio:")
    display(play_audio(attacked_path))

    # Transcribe both files first so the two results are printed together.
    clean_text = transcribe_audio_google(clean_path)
    attacked_text = transcribe_audio_google(attacked_path)
    print(f'Original Audio Transcription: {clean_text}')
    print(f'Adversarial Audio Transcription: {attacked_text}')

# Entry point: run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for PCA based adversarial attack experiments
Adversarial Audio Transcription: hello this is a test sentence for PCA based adversarial attack experiments


In [None]:
# !pip install gtts librosa soundfile numpy scikit-learn SpeechRecognition

import numpy as np
import librosa
import soundfile as sf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from gtts import gTTS
import speech_recognition as sr
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path):
    """Synthesize `text` as English speech with gTTS and store it at `file_path`.

    NOTE(review): gTTS is understood to emit MP3-encoded data regardless of
    the extension of `file_path` -- confirm if a true WAV file is required.
    """
    gTTS(text, lang='en').save(file_path)

# Load the audio file
def load_audio(file_path):
    """Decode `file_path` with librosa, resampling to 16 kHz.

    Returns:
        The `(samples, sample_rate)` pair produced by `librosa.load`.
    """
    target_rate = 16000  # fixed rate used for this demonstration
    samples, rate = librosa.load(file_path, sr=target_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write a waveform to `file_path` as float32 samples.

    Args:
        audio: Array-like of samples, nominally in [-1, 1]. Values outside
            that range are clipped (with a printed warning).
        file_path: Destination path handed to `sf.write`.
        sample_rate: Sample rate in Hz recorded in the file. Defaults to
            16000, the rate `load_audio` resamples to.
    """
    # np.asarray also accepts plain sequences and does not copy float32 arrays.
    audio = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise ValueError on an empty array.
    if audio.size and (audio.min() < -1 or audio.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        audio = np.clip(audio, -1, 1)

    sf.write(file_path, audio, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe the audio file at `file_path` with Google Speech Recognition.

    Returns:
        The recognized text, or a fixed explanatory message when the audio
        cannot be understood or the service request fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as audio_file:
        recording = recognizer.record(audio_file)

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"
    return transcript

# Play the audio
def play_audio(file_path):
    """Build an IPython audio player object for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Perform PCA-based adversarial attack
def pca_adversarial_attack(audio, perturbation_strength=0.01, n_components=10):
    """Perturb a waveform with uniform noise added in standardized PCA space.

    The signal is reshaped into a single column (one feature), so the
    number of components actually fitted is min(n_components, n_samples, 1).

    Args:
        audio: NumPy array of samples; it is flattened into one column.
        perturbation_strength: Half-width of the uniform noise interval,
            applied in the scaler's standardized coordinates.
        n_components: Requested number of principal components.

    Returns:
        Flattened perturbed waveform, clipped to [-1, 1].
    """
    column = audio.reshape(-1, 1)

    standardizer = StandardScaler()
    standardized = standardizer.fit_transform(column)

    # PCA cannot keep more components than either dimension of the data.
    sample_count, feature_count = standardized.shape
    usable_components = min(n_components, sample_count, feature_count)

    projector = PCA(n_components=usable_components)
    projected = projector.fit_transform(standardized)

    # Uniform noise drawn directly in the projected coordinates.
    noise = np.random.uniform(-perturbation_strength, perturbation_strength, projected.shape)

    # Undo the projection first, then the standardization, to return to sample values.
    back_in_scaled_space = projector.inverse_transform(projected + noise)
    restored = standardizer.inverse_transform(back_in_scaled_space).flatten()

    return np.clip(restored, -1, 1)

# Main function to run the attack
def main():
    """Run the PCA-noise demo end to end: synthesize, perturb, play, transcribe.

    The attack is untargeted: `pca_adversarial_attack` only adds noise, so no
    target transcription is involved.
    """
    text = "Hello, this is a test sentence for PCA-based adversarial attack experiments."
    generate_spoken_text_audio(text, 'test_audio.wav')

    # The returned sample rate is not used by the rest of the demo.
    audio, _ = load_audio('test_audio.wav')

    save_audio(audio, 'original_audio.wav')

    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    adversarial_audio = pca_adversarial_attack(audio, perturbation_strength=0.1, n_components=10)

    save_audio(adversarial_audio, 'adversarial_audio.wav')

    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

# Entry point: run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:


Original Audio Transcription: hello this is a test sentence for PCA based adversarial attack experiments
Adversarial Audio Transcription: stop playing this is a text message for 2:30 a.m. on Saturday


In [None]:
# !pip install gtts librosa soundfile numpy scipy SpeechRecognition

import numpy as np
import librosa
import soundfile as sf
from scipy.optimize import minimize
from gtts import gTTS
import speech_recognition as sr
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path):
    """Synthesize `text` as English speech with gTTS and store it at `file_path`.

    NOTE(review): gTTS is understood to emit MP3-encoded data regardless of
    the extension of `file_path` -- confirm if a true WAV file is required.
    """
    gTTS(text, lang='en').save(file_path)

# Load the audio file
def load_audio(file_path):
    """Decode `file_path` with librosa, resampling to 16 kHz.

    Returns:
        The `(samples, sample_rate)` pair produced by `librosa.load`.
    """
    target_rate = 16000  # fixed rate used for this demonstration
    samples, rate = librosa.load(file_path, sr=target_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write a waveform to `file_path` as float32 samples.

    Args:
        audio: Array-like of samples, nominally in [-1, 1]. Values outside
            that range are clipped (with a printed warning).
        file_path: Destination path handed to `sf.write`.
        sample_rate: Sample rate in Hz recorded in the file. Defaults to
            16000 so that it matches the rate `load_audio` resamples to;
            writing 16 kHz samples with a different rate in the header
            changes playback speed and pitch.
    """
    # np.asarray also accepts plain sequences and does not copy float32 arrays.
    audio = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise ValueError on an empty array.
    if audio.size and (audio.min() < -1 or audio.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        audio = np.clip(audio, -1, 1)

    sf.write(file_path, audio, sample_rate)

# Transcribe the audio using Google Speech Recognition
def transcribe_audio_google(file_path):
    """Transcribe the audio file at `file_path` with Google Speech Recognition.

    The Google backend is used so that the behavior matches the function
    name and the error messages below.

    Returns:
        The recognized text, or a fixed explanatory message when the audio
        cannot be understood or the service request fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"

# Play the audio
def play_audio(file_path):
    """Build an IPython audio player object for the file at `file_path`."""
    player = ipd.Audio(file_path)
    return player

# Generate psychoacoustic masking noise
def generate_masking_noise(audio, sr, noise_level=0.02):
    """Draw uniform random noise with the same shape as `audio`.

    No psychoacoustic model is applied here: each value is sampled
    independently from [-noise_level, noise_level). `sr` is accepted but
    not used.
    """
    low, high = -noise_level, noise_level
    return np.random.uniform(low, high, size=audio.shape)

# Objective function for optimization with psychoacoustic hiding
def objective_function(perturbation, audio, target_text, threshold):
    """Score a candidate perturbation by transcribing the perturbed audio.

    The candidate is cast to float32, added to `audio`, clipped to [-1, 1],
    written to 'temp_adversarial.wav' and transcribed. The returned loss is
    only a placeholder: the absolute difference between the lengths of
    `target_text` and the transcription. `threshold` is accepted but unused.
    """
    delta = np.asarray(perturbation, dtype=np.float32)
    candidate = np.clip(audio + delta, -1, 1)

    # Round-trip through a scratch file because the transcriber reads from disk.
    scratch_path = 'temp_adversarial.wav'
    save_audio(candidate, scratch_path)
    heard = transcribe_audio_google(scratch_path)

    return abs(len(target_text) - len(heard))

# Generate perturbations using optimization with psychoacoustic hiding
def perturb_audio(audio, target_text, threshold, noise_level=0.02):
    """Search for a bounded perturbation on top of fixed masking noise.

    One masking-noise vector is drawn up front and folded into the base
    signal, so the objective scores exactly the signal that is returned.

    Args:
        audio: NumPy array of samples.
        target_text: Forwarded to `objective_function`.
        threshold: Forwarded to `objective_function`.
        noise_level: Amplitude passed to `generate_masking_noise`.

    Returns:
        `audio` + masking noise + optimized perturbation, clipped to [-1, 1].
    """
    initial_perturbation = np.zeros_like(audio, dtype=np.float32)
    masking_noise = generate_masking_noise(audio, 16000, noise_level)

    # Base signal seen by the objective; the perturbation is added on top of it.
    masked_audio = audio + masking_noise

    def masked_objective_function(perturbation):
        return objective_function(perturbation, masked_audio, target_text, threshold)

    # Each sample of the perturbation is confined to [-0.02, 0.02].
    result = minimize(masked_objective_function, initial_perturbation, method='L-BFGS-B', bounds=[(-0.02, 0.02)] * len(audio))

    return np.clip(masked_audio + result.x, -1, 1)

# Main function to run the attack
def main():
    """Run the masking-noise attack demo: synthesize, perturb, play, transcribe."""
    text = "Hello, this is a test sentence for psychoacoustic hiding adversarial attack experiments."
    generate_spoken_text_audio(text, 'test_audio.wav')

    # The returned sample rate is not used by the rest of the demo.
    audio, _ = load_audio('test_audio.wav')

    save_audio(audio, 'original_audio.wav')

    print("Original Audio:")
    display(play_audio('original_audio.wav'))

    target_text = "This text should be different."  # Desired incorrect transcription

    # Hardcoded psychoacoustic masking threshold
    threshold = 0.02

    # Generate adversarial examples
    adversarial_audio = perturb_audio(audio, target_text, threshold, noise_level=0.02)

    save_audio(adversarial_audio, 'adversarial_audio.wav')

    print("Adversarial Audio:")
    display(play_audio('adversarial_audio.wav'))

    # Transcribe both files; the adversarial result must be computed here
    # because it is printed below.
    original_transcription = transcribe_audio_google('original_audio.wav')
    adversarial_transcription = transcribe_audio_google('adversarial_audio.wav')

    print(f'Original Audio Transcription: {original_transcription}')
    print(f'Adversarial Audio Transcription: {adversarial_transcription}')

# Entry point: run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Original Audio:


Adversarial Audio:
Original Audio Transcription: Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder.


NameError: name 'adversarial_transcription' is not defined

In [None]:
!pip install vosk

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting websockets (from vosk)
  Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22428 sha256=7dc0847ed2edf2df451362717ae386e9e07a4e162809696895c719ec8b38aad3
  Stored in directory: /root/.cache/pip/wheels/d7/31/a1/18e1e7e8bfdafd19e6803d7eb919b563dd11de380e4304e332
Successfu

In [None]:
from gtts import gTTS
import numpy as np
import librosa
import soundfile as sf
from scipy.optimize import minimize
from pydub import AudioSegment
from io import BytesIO
import speech_recognition as sr
import IPython.display as ipd

# Generate spoken text audio
def generate_spoken_text_audio(text, file_path):
    """Synthesize `text` as English speech with gTTS and store it at `file_path`.

    NOTE(review): gTTS is understood to emit MP3-encoded data regardless of
    the extension of `file_path` -- confirm if a true WAV file is required.
    """
    gTTS(text, lang='en').save(file_path)

# Load the audio file
def load_audio(file_path):
    """Decode `file_path` with librosa, resampling to 16 kHz.

    Returns:
        The `(samples, sample_rate)` pair produced by `librosa.load`.
    """
    target_rate = 16000  # fixed rate used for this demonstration
    samples, rate = librosa.load(file_path, sr=target_rate)
    return samples, rate

# Save the audio
def save_audio(audio, file_path, sample_rate=16000):
    """Write a waveform to `file_path` as float32 samples.

    Args:
        audio: Array-like of samples, nominally in [-1, 1]. Values outside
            that range are clipped (with a printed warning).
        file_path: Destination path handed to `sf.write`.
        sample_rate: Sample rate in Hz recorded in the file. Defaults to
            16000, the rate `load_audio` resamples to.
    """
    # np.asarray also accepts plain sequences and does not copy float32 arrays.
    audio = np.asarray(audio, dtype=np.float32)

    # Guard on size: min()/max() raise ValueError on an empty array.
    if audio.size and (audio.min() < -1 or audio.max() > 1):
        print("Warning: Audio data is out of range. Clipping values to [-1, 1].")
        audio = np.clip(audio, -1, 1)

    sf.write(file_path, audio, sample_rate)

# Transcribe the audio using speech_recognition with BytesIO
def transcribe_audio_speech_recognition(audio, srr):
    """Transcribe an in-memory waveform with Google Speech Recognition.

    The samples are converted to 16-bit PCM, wrapped in an in-memory WAV
    via pydub, and handed to the `speech_recognition` recognizer.

    Args:
        audio: Array-like of float samples, nominally in [-1, 1].
        srr: Sample rate of `audio` in Hz.

    Returns:
        The recognized text, or a fixed explanatory message when the audio
        cannot be understood or the service request fails.
    """
    # sample_width=2 means 16-bit integer PCM, so the float samples must be
    # scaled and cast; passing raw float bytes would be decoded as garbage.
    samples = np.clip(np.asarray(audio, dtype=np.float64), -1.0, 1.0)
    pcm = np.rint(samples * 32767).astype('<i2')

    audio_segment = AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,
        frame_rate=srr,
        channels=1
    )
    buffer = BytesIO()
    audio_segment.export(buffer, format="wav")
    buffer.seek(0)

    recognizer = sr.Recognizer()
    with sr.AudioFile(buffer) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError:
        return "Could not request results from Google Speech Recognition service"

# Generate psychoacoustic masking noise
def generate_masking_noise(audio, noise_level=0.02):
    """Draw uniform random noise with the same shape as `audio`.

    No psychoacoustic model is applied here: each value is sampled
    independently from [-noise_level, noise_level).
    """
    low, high = -noise_level, noise_level
    return np.random.uniform(low, high, size=audio.shape)

# Combined objective function for optimization
def combined_loss_function(perturbation, audio, target_text, target_transcription, threshold, sr, noise_level=0.02, masking_noise=None):
    """Score a candidate perturbation with a transcription term plus an MSE term.

    Args:
        perturbation: Candidate perturbation; cast to float32 before use.
        audio: NumPy array holding the clean waveform.
        target_text: Accepted but not used by the current placeholder loss.
        target_transcription: Text whose length the transcription length is
            compared against.
        threshold: Accepted but not used.
        sr: Sample rate in Hz forwarded to the transcriber. This parameter
            shadows the `speech_recognition` module alias inside this
            function only.
        noise_level: Amplitude used when masking noise has to be drawn here.
        masking_noise: Optional pre-drawn noise array. When given it is reused
            as-is, which keeps the objective deterministic across calls; when
            None, fresh noise is drawn on every call.

    Returns:
        `misclassification_loss + 0.1 * perceptual_loss`.
    """
    perturbation = np.asarray(perturbation, dtype=np.float32)  # Ensure correct type
    adversarial_audio = np.clip(audio + perturbation, -1, 1)  # Ensure valid audio range

    if masking_noise is None:
        masking_noise = generate_masking_noise(audio, noise_level)
    adversarial_audio_with_mask = np.clip(adversarial_audio + masking_noise, -1, 1)

    # Transcribe adversarial audio
    adversarial_transcription = transcribe_audio_speech_recognition(adversarial_audio_with_mask, sr)

    # NOTE(review): placeholder term. perturb_audio passes the transcription of
    # the clean audio as target_transcription, so this term is smallest when
    # the transcription length is unchanged, and target_text is never
    # consulted -- confirm the intended loss.
    misclassification_loss = abs(len(target_transcription) - len(adversarial_transcription))
    perceptual_loss = np.mean((adversarial_audio_with_mask - audio) ** 2)  # Mean Squared Error

    return misclassification_loss + 0.1 * perceptual_loss  # Adjust weight as needed

# Generate perturbations using optimization with combined loss function
def perturb_audio(audio, target_text, threshold, sr, noise_level=0.02):
    """Optimize a bounded perturbation on top of one fixed masking-noise draw.

    The masking noise is drawn once and shared between the objective and the
    returned signal, so the optimizer evaluates the same noise that ends up
    in the output.

    Args:
        audio: NumPy array holding the clean waveform.
        target_text: Forwarded to `combined_loss_function`.
        threshold: Forwarded to `combined_loss_function`.
        sr: Sample rate in Hz forwarded to the transcriber.
        noise_level: Amplitude passed to `generate_masking_noise`.

    Returns:
        `audio` + optimized perturbation + masking noise, clipped to [-1, 1].
    """
    target_transcription = transcribe_audio_speech_recognition(audio, sr)

    initial_perturbation = np.zeros_like(audio, dtype=np.float32)
    masking_noise = generate_masking_noise(audio, noise_level)

    # Each sample of the perturbation is confined to [-0.02, 0.02].
    result = minimize(
        combined_loss_function,
        initial_perturbation,
        args=(audio, target_text, target_transcription, threshold, sr, noise_level, masking_noise),
        method='L-BFGS-B',
        bounds=[(-0.02, 0.02)] * len(audio)
    )

    adversarial_audio = audio + result.x + masking_noise
    return np.clip(adversarial_audio, -1, 1)  # Ensure valid audio range

# Main function to run the attack
def main():
    """Run the combined-loss attack demo: synthesize, perturb, play, transcribe."""
    tts_path = 'test_audio.wav'
    clean_path = 'original_audio.wav'
    attacked_path = 'adversarial_audio.wav'

    sentence = "Hello, this is a test sentence for psychoacoustic hiding adversarial attack experiments."
    generate_spoken_text_audio(sentence, tts_path)

    waveform, sample_rate = load_audio(tts_path)

    # Persist and audition the unmodified signal.
    save_audio(waveform, clean_path)
    print("Original Audio:")
    ipd.display(ipd.Audio(clean_path))

    # Attack settings: the desired (incorrect) transcription and the
    # hard-coded masking threshold.
    desired_text = "This text should be different."
    masking_threshold = 0.02

    attacked = perturb_audio(waveform, desired_text, masking_threshold, sample_rate, noise_level=0.02)

    # Persist and audition the perturbed signal.
    save_audio(attacked, attacked_path)
    print("Adversarial Audio:")
    ipd.display(ipd.Audio(attacked_path))

    # Transcribe the in-memory arrays first so both results are printed together.
    clean_text = transcribe_audio_speech_recognition(waveform, sample_rate)
    attacked_text = transcribe_audio_speech_recognition(attacked, sample_rate)
    print(f'Original Audio Transcription: {clean_text}')
    print(f'Adversarial Audio Transcription: {attacked_text}')

# Entry point: run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Original Audio:


In [None]:
!pip install pydub


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
