<a href="https://colab.research.google.com/github/Harshit-jain-1/data-science-projects-with-unstructured-data/blob/main/Data_processing_and_analysis_with_audio_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install tensorflow-datasets



In [7]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [8]:
# Load the NSynth training split together with its DatasetInfo metadata.
# The first call downloads and prepares the data; subsequent calls reuse it.
dataset, info = tfds.load(name='nsynth', split='train', with_info=True)
print(info)



Downloading and preparing dataset 73.07 GiB (download: 73.07 GiB, generated: 73.09 GiB, total: 146.16 GiB) to /root/tensorflow_datasets/nsynth/full/2.3.3...


Dl Completed...:   0%|          | 0/1069 [00:00<?, ? file/s]

Dataset nsynth downloaded and prepared to /root/tensorflow_datasets/nsynth/full/2.3.3. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='nsynth',
    full_name='nsynth/full/2.3.3',
    description="""
    The NSynth Dataset is an audio dataset containing ~300k musical notes, each with
    a unique pitch, timbre, and envelope. Each note is annotated with three
    additional pieces of information based on a combination of human evaluation and
    heuristic algorithms: Source, Family, and Qualities.
    """,
    config_description="""
    Full NSynth Dataset is split into train, valid, and test sets, with no
    instruments overlapping between the train set and the valid/test sets.
    
    """,
    homepage='https://g.co/magenta/nsynth-dataset',
    data_dir='/root/tensorflow_datasets/nsynth/full/incomplete.6DPN45_2.3.3/',
    file_format=tfrecord,
    download_size=73.07 GiB,
    dataset_size=73.09 GiB,
    features=FeaturesDict({
        'audio': Audio(shape=(640

In [9]:
# Inspect the keys of one sample
for sample in dataset.take(1):
    print("Available keys:")
    # Iterating the sample mapping directly yields its keys.
    for key in sample:
        print(key)

Available keys:
audio
id
instrument
pitch
qualities
velocity


In [10]:
# Pull out the waveform and use pitch as an alternate label
def preprocess_nsynth(sample):
  """Return the ``(audio, pitch)`` pair for one dataset example.

  Args:
    sample: Mapping that contains at least the ``'audio'`` and ``'pitch'`` keys.

  Returns:
    Tuple ``(audio, label)`` where ``label`` is the value stored under ``'pitch'``.
  """
  return sample['audio'], sample['pitch']

# Apply preprocessing
processed_dataset = dataset.map(preprocess_nsynth)

# Take a single sample. `audio` and `label` are bound explicitly instead of
# leaking out of a for-loop, because later cells reuse `audio`; an empty
# dataset now fails right here rather than with a NameError further down.
audio, label = next(iter(processed_dataset.take(1)))
print(f"Audio Shape: {audio.shape}")
print(f"Label (Pitch): {label.numpy()}")

Audio Shape: (64000,)
Label (Pitch): 106


In [11]:
from IPython.display import Audio

In [14]:
# Convert the tensor to a NumPy array, then render an inline audio player.
audio_np = audio.numpy()
Audio(data=audio_np, rate=16000)  # assuming a sample rate of 16 kHz

In [15]:
import plotly.graph_objects as go


In [16]:
# Plot the raw waveform against its sample index.
waveform_trace = go.Scatter(
    y=audio_np,
    mode='lines',
    line={'color': 'black'},
    name="Waveform",
)
fig = go.Figure()
fig.add_trace(waveform_trace)

layout_options = {
    'title': "Waveform",
    'xaxis_title': "Time (samples)",
    'yaxis_title': "Amplitude",
    'template': "plotly_white",
    'width': 800,
    'height': 400,
}
fig.update_layout(**layout_options)

fig.show()

In [17]:
# Analyzing the spectrogram
import librosa
import numpy as np

In [24]:
# Short-time Fourier transform, converted from magnitude to decibels
spectrogram = librosa.stft(audio_np, n_fft=512, hop_length=256)
spectrogram_db = librosa.amplitude_to_db(np.abs(spectrogram))

# Axis coordinates: rows of the spectrogram are frequency bins, columns are frames.
n_freq_bins, n_frames = spectrogram_db.shape
time = np.linspace(0, len(audio_np) / 16000, n_frames)
frequencies = np.linspace(0, 16000 / 2, n_freq_bins)

stft_heatmap = go.Heatmap(
    z=spectrogram_db,
    x=time,
    y=frequencies,
    colorscale='Viridis',
    colorbar={'title': 'Amplitude (db)'},
)
fig = go.Figure(data=stft_heatmap)

fig.update_layout(
    title="Spectrogram",
    xaxis_title="Time (seconds)",
    yaxis_title="Frequency (Hz)",
    yaxis={'type': "log"},  # logarithmic frequency axis
    template="plotly",
)

fig.show()

In [25]:
# analyzing instrument distribution
from collections import Counter

In [28]:
# Count instrument-family occurrences in the first 1000 training samples.
# int() turns the NumPy scalar into a plain Python int for use as a dict key / list index.
instrument_counts = Counter(
  int(sample['instrument']['family'].numpy()) for sample in dataset.take(1000)
)

# Map numeric IDs to instrument family names.
# NSynth defines 11 families (IDs 0-10); there is no "Synth Pad" family, so "Vocal" is ID 10.
instrument_families = ["Bass", "Brass", "Flute", "Guitar", "Keyboard", "Mallet", "Organ", "Reed", "String", "Synth Lead", "Vocal"]
# Sort by family ID so the bar order is stable instead of following first-seen order.
mapped_family_counts = {
  instrument_families[family_id]: count
  for family_id, count in sorted(instrument_counts.items())
}

import plotly.express as px
fig = px.bar(
    x=list(mapped_family_counts.keys()),
    y=list(mapped_family_counts.values()),
    labels={'x': 'Instrument Family', 'y': 'Count'},
    title="Distribution of Instrument Families",
    template="plotly"
)
fig.show()

In [29]:
# Mel Spectrogram Analysis
# hop_length is spelled out (512 is librosa's default) so the time axis below
# is derived from the same value that was used to frame the signal.
mel_spectrogram = librosa.feature.melspectrogram(y=audio_np, sr=16000, n_mels=128, hop_length=512)
mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

# Axes that belong to the mel spectrogram itself: one time stamp per frame and
# one centre frequency per mel band (0 Hz up to the Nyquist frequency).
mel_times = librosa.frames_to_time(np.arange(mel_spectrogram_db.shape[1]), sr=16000, hop_length=512)
mel_band_frequencies = librosa.mel_frequencies(n_mels=mel_spectrogram_db.shape[0], fmin=0.0, fmax=16000 / 2)

fig = go.Figure(data=go.Heatmap(
    z=mel_spectrogram_db,  # plot the mel data, not the STFT `spectrogram_db`
    x=mel_times,
    y=mel_band_frequencies,
    colorscale='Viridis',
    colorbar=dict(title='Power (dB)')  # power_to_db with ref=np.max: dB relative to the peak
))
fig.update_layout(
    title="Mel Spectrogram",
    xaxis_title="Time (seconds)",
    yaxis_title="Frequency (Hz)"
)
fig.show()

In [32]:
# MFCC analysis
# hop_length is spelled out (512 is librosa's default) so the time axis below matches the framing.
mfccs = librosa.feature.mfcc(y=audio_np, sr=16000, n_mfcc=13, hop_length=512)

# One time stamp per MFCC frame. The STFT cell's `time` axis was built for
# hop_length=256 and has a different number of frames, so it cannot be reused here.
mfcc_times = librosa.frames_to_time(np.arange(mfccs.shape[1]), sr=16000, hop_length=512)

fig = go.Figure(data=go.Heatmap(
    z=mfccs,
    x=mfcc_times,
    y=np.arange(1, mfccs.shape[0] + 1),  # coefficient index, 1-based
    colorscale='Viridis',
    colorbar=dict(title="MFCC Value")
))
fig.update_layout(
    title="MFCCs",
    xaxis_title="Time (seconds)",
    yaxis_title="MFCC Coefficient"
)
fig.show()

In [33]:
# Transforming audio data
# Pitch shift: raise the signal by 2 semitones
audio_pitch_shifted = librosa.effects.pitch_shift(y=audio_np, sr=16000, n_steps=2)

# Time stretch: speed the signal up by a factor of 1.5
audio_time_stretched = librosa.effects.time_stretch(y=audio_np, rate=1.5)

# Overlay the original and both transformed waveforms in one figure
fig = go.Figure()
waveforms = [
    ('Original', audio_np),
    ('Pitch Shifted', audio_pitch_shifted),
    ('Time Stretched', audio_time_stretched),
]
for trace_name, waveform in waveforms:
    fig.add_trace(go.Scatter(y=waveform, mode='lines', name=trace_name))
fig.show()