In [1]:
import ffmpeg
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
import requests
import json

In [2]:
# Loading dataset

In [3]:
from pathlib import Path
import pandas as pd
positive_dir = Path(r'C:\Users\harsh\Downloads\archive\music_wav')
negative_dir = Path(r'C:\Users\harsh\Downloads\archive\speech_wav')

In [4]:
# Generatig Dataframe for training

In [5]:
def generate_df(audio_dir, label):
    file_paths = pd.Series(list(audio_dir.glob('*.wav')), name='Filepath').astype(str)
    print(f"Found {len(file_paths)} files in {audio_dir}")  # Debugging line
    
    if isinstance(label, str):
        labels = pd.Series([label] * len(file_paths), name='Label')
    elif isinstance(label, list) and len(label) == len(file_paths):
        labels = pd.Series(label, name='Label')
    else:
        raise ValueError("Length of 'label' list must match the number of audios in 'audio_dir'.")
    
    df = pd.concat([file_paths, labels], axis=1)
    return df

In [6]:
positive_df = generate_df(positive_dir, 'POSITIVE')
negative_df = generate_df(negative_dir, 'NEGATIVE')

all_df = pd.concat([positive_df, negative_df], axis=0).sample(frac=1, random_state=1).reset_index(drop=True)
all_df

Found 213 files in C:\Users\harsh\Downloads\archive\music_wav
Found 131 files in C:\Users\harsh\Downloads\archive\speech_wav


Unnamed: 0,Filepath,Label
0,C:\Users\harsh\Downloads\archive\speech_wav\go...,NEGATIVE
1,C:\Users\harsh\Downloads\archive\speech_wav\ve...,NEGATIVE
2,C:\Users\harsh\Downloads\archive\speech_wav\ex...,NEGATIVE
3,C:\Users\harsh\Downloads\archive\music_wav\ext...,POSITIVE
4,C:\Users\harsh\Downloads\archive\speech_wav\ex...,NEGATIVE
...,...,...
339,C:\Users\harsh\Downloads\archive\music_wav\pro...,POSITIVE
340,C:\Users\harsh\Downloads\archive\speech_wav\ex...,NEGATIVE
341,C:\Users\harsh\Downloads\archive\music_wav\ext...,POSITIVE
342,C:\Users\harsh\Downloads\archive\speech_wav\ex...,NEGATIVE


In [7]:
all_df.to_csv('./Downloads/full_csv')

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Splitting audio into chunks 

In [10]:
def split_audio(audio_file, chunk_duration=20):

    y, sr = librosa.load(audio_file)
    total_duration = librosa.get_duration(y=y, sr=sr)
    chunks = []
    for i in range(0, int(total_duration), chunk_duration):
        start = i
        end = min(i + chunk_duration, total_duration)
        chunk = y[int(start * sr): int(end * sr)]
        chunks.append((chunk, start))
    return chunks, sr

In [11]:
# Creating spectrogram for each chunk

In [12]:
import pandas as pd

def create_spectrogram_dataframe_from_csv(csv_path, chunk_duration=20, target_shape=(128, 128)):

    data = pd.read_csv(csv_path)
    data.head()
    
    spectrogram_data = []

    for index, row in data.iterrows():
        file_path = row['Filepath']
        label = row['Label']
        
        chunks, sr = split_audio(file_path, chunk_duration)
        
        for chunk, start in chunks:

            S = librosa.stft(chunk)
            S_db = librosa.amplitude_to_db(np.abs(S))

            if S_db.shape[1] < target_shape[1]:
                S_db = np.pad(S_db, ((0, 0), (0, target_shape[1] - S_db.shape[1])), mode='constant')
            elif S_db.shape[1] > target_shape[1]:
                S_db = S_db[:, :target_shape[1]]
            
            S_db = S_db[:target_shape[0], :target_shape[1]]

            spectrogram_data.append({
                'Filepath': file_path,
                'start_time': start,
                'spectrogram': S_db,
                'Label': label
            })

    df = pd.DataFrame(spectrogram_data)
    return df


In [13]:
data = create_spectrogram_dataframe_from_csv('Downloads/full_csv')

In [14]:
data.head(10)

Unnamed: 0,Filepath,start_time,spectrogram,Label
0,C:\Users\harsh\Downloads\archive\speech_wav\go...,0,"[[14.888786, 20.423565, 21.639648, 21.552158, ...",NEGATIVE
1,C:\Users\harsh\Downloads\archive\speech_wav\go...,20,"[[14.952335, 20.226557, 21.623596, 21.982986, ...",NEGATIVE
2,C:\Users\harsh\Downloads\archive\speech_wav\ve...,0,"[[15.224457, 20.290585, 21.127655, 21.331951, ...",NEGATIVE
3,C:\Users\harsh\Downloads\archive\speech_wav\ve...,20,"[[15.330545, 20.6032, 21.743843, 22.086868, 22...",NEGATIVE
4,C:\Users\harsh\Downloads\archive\speech_wav\ex...,0,"[[16.752903, 10.850167, -24.529001, -8.490599,...",NEGATIVE
5,C:\Users\harsh\Downloads\archive\music_wav\ext...,0,"[[18.251917, 9.991868, -0.32600123, -6.097932,...",POSITIVE
6,C:\Users\harsh\Downloads\archive\speech_wav\ex...,0,"[[2.3438122, -5.551339, -14.956955, -21.029282...",NEGATIVE
7,C:\Users\harsh\Downloads\archive\speech_wav\ex...,0,"[[-28.524506, -30.02266, -30.02266, -18.743893...",NEGATIVE
8,C:\Users\harsh\Downloads\archive\music_wav\ext...,0,"[[19.792286, 15.094501, 8.901659, 11.15531, 0....",POSITIVE
9,C:\Users\harsh\Downloads\archive\music_wav\car...,0,"[[14.499021, 19.182356, 20.00507, 19.527771, 1...",POSITIVE


In [15]:
data.isnull().sum()

Filepath       0
start_time     0
spectrogram    0
Label          0
dtype: int64

In [16]:
# Splitting in Features and Labels

In [17]:
Features = np.array([np.array(spectrogram) for spectrogram in data['spectrogram']])
Label = data['Label']

In [18]:
Features.shape , Label.shape

((472, 128, 128), (472,))

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Label_encoded = le.fit_transform(Label)

In [20]:
Label_encoded

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [21]:
Features = Features.reshape(-1 , 128 , 128 , 1)
Features.shape

(472, 128, 128, 1)

In [22]:
# Splitting Dataset

In [23]:
Feature_train , Feature_test , Label_train , Label_test = train_test_split(Features , Label_encoded , test_size = 0.25)

In [24]:
print("X_train shape:", Feature_train.shape)
print("y_train shape:", Label_train.shape)
print("X_train type:", type(Feature_train))
print("y_train type:", type(Label_train))

X_train shape: (354, 128, 128, 1)
y_train shape: (354,)
X_train type: <class 'numpy.ndarray'>
y_train type: <class 'numpy.ndarray'>


In [25]:
# Creating Model

In [26]:
def create_audio_cnn_model():
    model = models.Sequential()
    model.add(layers.Conv2D(16, (3, 3), activation='relu', input_shape=(128, 128, 1)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [27]:
model = create_audio_cnn_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
model.fit(Feature_train, Label_train, epochs=10, validation_data=(Feature_test, Label_test))

Epoch 1/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 103ms/step - accuracy: 0.4705 - loss: 129.1596 - val_accuracy: 0.4237 - val_loss: 5.8341
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - accuracy: 0.6446 - loss: 2.8484 - val_accuracy: 0.8136 - val_loss: 0.5859
Epoch 3/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 75ms/step - accuracy: 0.8024 - loss: 0.5835 - val_accuracy: 0.8729 - val_loss: 0.2911
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step - accuracy: 0.8989 - loss: 0.2762 - val_accuracy: 0.8729 - val_loss: 0.3299
Epoch 5/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 76ms/step - accuracy: 0.9453 - loss: 0.1425 - val_accuracy: 0.9407 - val_loss: 0.1828
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - accuracy: 0.9819 - loss: 0.0603 - val_accuracy: 0.9322 - val_loss: 0.1634
Epoch 7/10
[1m12/12[0m [32m━

<keras.src.callbacks.history.History at 0x1a432e5a480>

In [29]:
# Model testing

In [30]:
test_loss, test_accuracy = model.evaluate(Feature_test, Label_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9359 - loss: 0.2253
Test Loss: 0.1805035024881363
Test Accuracy: 0.9491525292396545


In [32]:
model.save("music_detection_model.keras")

In [33]:
# Extract audio from video

In [34]:
def extract_audio(video_file, audio_file):
    ffmpeg.input(video_file).output(audio_file).run()

extract_audio('Downloads/Fred again.. _ Boiler Room_ London.mp4', 'Downloads/extracted_audio.wav')

In [35]:
test_dir = Path('Downloads/archive/test')
test_df = generate_df(test_dir , 'testing')

Found 1 files in Downloads\archive\test


In [36]:
test_df.head()

Unnamed: 0,Filepath,Label
0,Downloads\archive\test\extracted_audio.wav,testing


In [37]:
test_df.to_csv('./Downloads/test_csv')

In [38]:
# Splitting into chunks and creating spectrograms

In [39]:
test_data = create_spectrogram_dataframe_from_csv('./Downloads/test_csv')

In [40]:
test_data.head()

Unnamed: 0,Filepath,start_time,spectrogram,Label
0,Downloads\archive\test\extracted_audio.wav,0,"[[-38.981056, -38.981056, -38.981056, -38.9810...",testing
1,Downloads\archive\test\extracted_audio.wav,20,"[[5.9243593, 1.1852031, -15.110134, -16.676916...",testing
2,Downloads\archive\test\extracted_audio.wav,40,"[[-27.339714, -30.022415, -30.022415, -20.0239...",testing
3,Downloads\archive\test\extracted_audio.wav,60,"[[6.7900896, 0.30850047, -29.216637, -16.07026...",testing
4,Downloads\archive\test\extracted_audio.wav,80,"[[-1.5412357, -10.521716, -14.608147, -17.8644...",testing


In [41]:
test_data.isnull().sum()

Filepath       0
start_time     0
spectrogram    0
Label          0
dtype: int64

In [42]:
# Predicting chunks

In [43]:
def predict_chunks(data, model):
    predictions = []

    for _, row in data.iterrows():
        
        spectrogram = np.array(row['spectrogram'])
        
        spectrogram = np.expand_dims(spectrogram, axis=-1)
        spectrogram = np.expand_dims(spectrogram, axis=0) 

        prediction = model.predict(spectrogram)
        predicted_class = np.argmax(prediction, axis=1)[0]

        predictions.append(predicted_class)

    data['Prediction'] = predictions
    return data

In [44]:
datax = predict_chunks(test_data , model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

In [45]:
datax.head(10)

Unnamed: 0,Filepath,start_time,spectrogram,Label,Prediction
0,Downloads\archive\test\extracted_audio.wav,0,"[[-38.981056, -38.981056, -38.981056, -38.9810...",testing,0
1,Downloads\archive\test\extracted_audio.wav,20,"[[5.9243593, 1.1852031, -15.110134, -16.676916...",testing,0
2,Downloads\archive\test\extracted_audio.wav,40,"[[-27.339714, -30.022415, -30.022415, -20.0239...",testing,0
3,Downloads\archive\test\extracted_audio.wav,60,"[[6.7900896, 0.30850047, -29.216637, -16.07026...",testing,0
4,Downloads\archive\test\extracted_audio.wav,80,"[[-1.5412357, -10.521716, -14.608147, -17.8644...",testing,0
5,Downloads\archive\test\extracted_audio.wav,100,"[[-16.897648, -27.533808, -15.275465, -13.9629...",testing,0
6,Downloads\archive\test\extracted_audio.wav,120,"[[-2.983436, -0.58572596, -7.2836943, -21.6692...",testing,1
7,Downloads\archive\test\extracted_audio.wav,140,"[[25.05285, 21.269934, -14.450684, -6.8510437,...",testing,1
8,Downloads\archive\test\extracted_audio.wav,160,"[[16.298397, 13.5275345, -3.2981787, -1.904537...",testing,1
9,Downloads\archive\test\extracted_audio.wav,180,"[[8.125523, 2.6924057, 7.7259483, -6.52248, -1...",testing,1


In [46]:
datax.to_csv('./Downloads/predicted')

In [47]:
# Save chunks as postive and negative and create positive metadata 

In [48]:
import os
import pandas as pd
import librosa
import soundfile as sf

def save_audio_chunks_by_label(csv_path, chunk_duration=20):
    positive_dir = "./Downloads/wav_data/positive_chunks"
    negative_dir = "./Downloads/wav_data/negative_chunks"
    os.makedirs(positive_dir, exist_ok=True)
    os.makedirs(negative_dir, exist_ok=True)

    positive_chunks_metadata = []

    data = pd.read_csv(csv_path)

    for index, row in data.iterrows():
        file_path = row['Filepath']
        start_time = row['start_time']
        prediction = row['Prediction']

        chunk, sr = librosa.load(file_path, offset=start_time, duration=chunk_duration)

        if prediction == 1:
            save_dir = positive_dir
        else:
            save_dir = negative_dir

        chunk_name = f"{os.path.splitext(os.path.basename(file_path))[0]}_chunk_{index}.wav"
        chunk_path = os.path.join(save_dir, chunk_name)

        sf.write(chunk_path, chunk, sr)

        if prediction == 1:
            positive_chunks_metadata.append({
                "chunk_path": chunk_path,
                "chunk_name": chunk_name,
                "start_time": start_time
            })

    positive_chunks_csv_path = os.path.join(positive_dir, "positive_chunks_metadata.csv")
    positive_chunks_df = pd.DataFrame(positive_chunks_metadata)
    positive_chunks_df.to_csv(positive_chunks_csv_path, index=False)

    print(f"Chunks saved to respective directories based on predictions.")
    print(f"Metadata for positive chunks saved to {positive_chunks_csv_path}")

save_audio_chunks_by_label('./Downloads/predicted')


Chunks saved to respective directories based on predictions.
Metadata for positive chunks saved to ./Downloads/wav_data/positive_chunks\positive_chunks_metadata.csv


In [49]:
import os
import base64
import http.client
import json
from pydub import AudioSegment

In [50]:
def convert_shortened_chunk_to_raw(audio_path, duration_ms=3000):
    
    audio = AudioSegment.from_wav(audio_path)
    
    short_audio = audio[:duration_ms]
    
    short_audio = short_audio.set_channels(1).set_frame_rate(44100).set_sample_width(2)

    raw_audio_path = "temp.raw"
    short_audio.export(raw_audio_path, format="raw")

    with open(raw_audio_path, "rb") as raw_file:
        raw_data = raw_file.read()
    encoded_data = base64.b64encode(raw_data).decode("utf-8")

    os.remove(raw_audio_path)
    
    return encoded_data

In [51]:
encoded_data = convert_shortened_chunk_to_raw('Downloads/wav_data/positive_chunks/extracted_audio_chunk_100.wav')
encoded_data

'pAq+DtkSzgvDBBH9YPX29oz4hvuB/nL1ZOxj62PqpfTn/iP7YPdw9IDxHfe6/Jr8evzD+gz5Zv7AA8kB0//a+uL1hP4mB48K+Q2lCVEFXw5uF6Ea1B1jFfMMvxCMFLQU3BTTEMsMUA3VDdUN1Q1NCcYE/wY5CUgJVwlOAkb70P9aBMMMLBWnEyISOg5TCt0NaBFRDzoN9g2zDlsUBBpzGOIW6BXvFDETcxHLDyQOPQ5WDswMQguSCuMJcQkACagHUAbdAGv7sv75AZIELAcoAiX9zP10/j0ABgKQABr/ywB8AssBGgEKAfoAewT9B7MEaQEiA9sEnARdBIH/pvrV/wUFxgyIFI8OlwhMCQEK9A/oFSISXQ7zCYoFHwu0ELERrxLnDSAJqRMzHkMbUxjREFAJRxY+I/ojtiRtFSUGMAo8Dj8PQxA8BDb4v/ZI9dTvYert4XnZMuPr7Gbv4vFE4qfSLNix3XDgL+Pk25nUhN9w6rfs/u4h50TfQN483RXi7+by5/Xog+gS6C3pSOpn5ofig+SA5rnt8vQa9UP1o/ID8O7y2fVn9fX0svlv/qkE4wowA377lPer8y75sv4y+7L3W/MF70Lxf/Mk7sroD+VU4WrqgfOn+c3/Bv0/+kIBRgg+DjYUyQ5cCR0L3gwkEWsViw6rB5YGggVHDAwTVQ+fC7EJxAemDYgT7xJXEjwZIiBSKoI0OCzvIxIiNiCBIcwiJBJ9AfADZAYZEc8b3xHwB4sJJgsZFA0d+BbkEBwSVBPSHVAolCHYGgcaNhnYGXsahwuU/Dr24O+78ZbzEeqN4J/Xss4U1HfZONL6ypLGKsL0zL/X4tEGzG3I1cRoz/zZ1tiw13XZOtuq4xrsfOXe3oTZK9Rl2J/cu9zY3Cvjf+ng9kEEBwXNBQ0CTv6RB9QQdRMXFtUWlBeHGnsdbRdfEcAMIQjQBX8Dk/+n+yv6sPhv9C/w3OqK5eLpOu4B8cnz9fsiBC0HOAr/A8b

In [52]:
def send_to_shazam(encoded_data):

    conn = http.client.HTTPSConnection("shazam.p.rapidapi.com")
    headers = {
        'x-rapidapi-key': "772354c8f6mshff6ce85382008e5p1be235jsn5cf1a16d5917",
        'x-rapidapi-host': "shazam.p.rapidapi.com",
        'Content-Type': "text/plain"
    }

    # Send request
    conn.request("POST", "/songs/v2/detect?timezone=America%2FChicago&locale=en-US", body=encoded_data, headers=headers)
    res = conn.getresponse()
    data = res.read()
    return data.decode("utf-8")

In [53]:
res = send_to_shazam(encoded_data)
res

'{"matches":[{"id":"638038258","offset":106.472421875,"timeskew":-0.0002565384,"frequencyskew":-0.0003618002},{"id":"638038258","offset":106.457578125,"timeskew":-0.002638638,"frequencyskew":0.0},{"id":"638038258","offset":106.459796875,"timeskew":-0.0026385188,"frequencyskew":-0.00015509129},{"id":"638038258","offset":107.359265625,"timeskew":0.0030779839,"frequencyskew":-0.0014166236}],"timestamp":1731737441065,"timezone":"America/Chicago","tagid":"5b930990-0b08-4d3c-b843-52a0a9113e73","track":{"layout":"5","type":"MUSIC","key":"638038258","title":"Strong","subtitle":"Romy & Fred again..","images":{"background":"https://is1-ssl.mzstatic.com/image/thumb/AMCArtistImages116/v4/98/51/58/985158f8-fcce-d2e4-a661-42a701163197/0d7f331e-2f83-49da-a84d-2ce411f29aca_ami-identity-3f73b52c9078a3c4580916a24f4ba24e-2023-06-07T15-59-32.702Z_cropped.png/800x800cc.jpg","coverart":"https://is1-ssl.mzstatic.com/image/thumb/Music116/v4/57/b4/ea/57b4ea8d-85da-0a6a-49f5-a7d932ab6e9f/889030032072.png/400x40

In [54]:
positive = pd.read_csv('Downloads/wav_data/positive_chunks/positive_chunks_metadata.csv')
positive.head()

Unnamed: 0,chunk_path,chunk_name,start_time
0,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_6.wav,120
1,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_7.wav,140
2,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_8.wav,160
3,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_9.wav,180
4,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_10.wav,200


In [55]:
# Convert chunk to raw data , send to shazam api , perform these two steps for all positive chunks

In [61]:
from io import StringIO

def update_csv_with_detections(csv_path, limit=30):

    # data = pd.read_csv(csv_path)

    detected_songs = []

    rows_to_process = csv_path.head(limit)

    for index, row in rows_to_process.iterrows():
        chunk_path = row['chunk_path']

        try:

            encoded_data = convert_shortened_chunk_to_raw(chunk_path)
            
            response = send_to_shazam(encoded_data)
            response_data = json.loads(response)
            
            detected_song = "Unknown"
            if "track" in response_data:
                detected_song = response_data["track"].get("title", "Unknown")
            
            print(f"Detected: {detected_song} for {chunk_path}")
            detected_songs.append(detected_song)

        except Exception as e:
            print(f"Error processing {chunk_path}: {e}")
            detected_songs.append("Error")

    rows_to_process['Detected_Song'] = detected_songs

    output = StringIO()
    rows_to_process.to_csv(output, index=False)
    csv_data = output.getvalue()
    output.close()

    return csv_data

In [62]:
detected_csv = update_csv_with_detections(positive)

Detected: Unknown for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_6.wav
Detected: Berlin for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_7.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_8.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_9.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_10.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_11.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_12.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_13.wav
Detected: Unknown for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_14.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/positive_chunks\extracted_audio_chunk_15.wav
Detected: Kammy (like i do) for ./Downloads/wav_data/po

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_to_process['Detected_Song'] = detected_songs


In [63]:
dataframe = pd.read_csv(StringIO(detected_csv))
dataframe.head(10)

Unnamed: 0,chunk_path,chunk_name,start_time,Detected_Song
0,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_6.wav,120,Unknown
1,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_7.wav,140,Berlin
2,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_8.wav,160,Kammy (like i do)
3,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_9.wav,180,Kammy (like i do)
4,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_10.wav,200,Kammy (like i do)
5,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_11.wav,220,Kammy (like i do)
6,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_12.wav,240,Kammy (like i do)
7,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_13.wav,260,Kammy (like i do)
8,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_14.wav,280,Unknown
9,./Downloads/wav_data/positive_chunks\extracted...,extracted_audio_chunk_15.wav,300,Kammy (like i do)


In [64]:
def combine_transitions(df):
    combined_results = []
    current_song = None
    current_start = None

    for i in range(len(df)):
        detected_song = df.loc[i, "Detected_Song"]
        start_time = df.loc[i, "start_time"]

        if current_song is None or (detected_song != current_song and detected_song != "Unknown"):
            if current_song is not None:  
                end_time = df.loc[i, "start_time"]
                combined_results.append({"Detected_Song": current_song, "start_time": current_start, "end_time": end_time})

            current_song = detected_song if detected_song != "Unknown" else current_song
            current_start = start_time

    if current_song is not None:
        combined_results.append({
            "Detected_Song": current_song,
            "start_time": current_start,
            "end_time": df.iloc[-1]["start_time"] + 20
        })

    return pd.DataFrame(combined_results)


result_df = combine_transitions(dataframe)
result_df

Unnamed: 0,Detected_Song,start_time,end_time
0,Berlin,140,160
1,Kammy (like i do),160,380
2,Danielle (smile on my face),380,600
3,Tanya (Maybe Life),600,760
