# Music Plagiarism Detector

Neste notebook, vamos criar um modelo básico para detectar plágio musical usando STFT, MFCC e comparação de distâncias Euclidianas.


In [59]:
import numpy as np
import librosa
import soundfile as sf
import os 
import pandas as pd

In [60]:
def loadAndSplit(fileName,filePath,destinationPath):
    y,sr=librosa.load(filePath+fileName)
    segment_length=sr*5
    num_segments = int(np.ceil(len(y) / segment_length))
    split=[]
    for i in range(num_segments):
        t = y[i * segment_length: (i + 1) * segment_length]
        split.append(t)
    for i in range(num_segments):
        fileNoMP3=fileName[:-4]
        sf.write(destinationPath+fileNoMP3+".mp3",split[i],sr)
    return "Done!"

In [61]:
def readSplitFiles(filePath):
    file_names=os.listdir(filePath)
    samples=[]
    for file in file_names:
        y,sr=sf.read(filePath+file,always_2d=True)
        y=y.flatten('F')[:y.shape[0]]
        mfcc_mean=[np.mean(feature) for feature in librosa.feature.mfcc(y=y)]
        samples.append(mfcc_mean)
    return samples

In [62]:
# Definindo os diretórios para as classes
Class_1_Original_Files = './Class_1_Original_Files/'
Class_1_Split_Files = './Class_1_Split_Files/'
Class_2_Original_Files = './Class_2_Original_Files/'
Class_2_Split_Files = './Class_2_Split_Files/'

# creating Class 1 frame
Class_1_File_Names=os.listdir(Class_1_Original_Files)
for file in Class_1_File_Names:
  loadAndSplit(file,Class_1_Original_Files,Class_1_Split_Files)
Class_1_Samples=readSplitFiles(Class_1_Split_Files)
Class_1_DF=pd.DataFrame(Class_1_Samples)
Class_1_DF['Class']=1
# creating Class 2 frame
Class_2_File_Names=os.listdir(Class_2_Original_Files)
for file in Class_2_File_Names:
  loadAndSplit(file,Class_2_Original_Files,Class_2_Split_Files)
Class_2_Samples=readSplitFiles(Class_2_Split_Files)
Class_2_DF=pd.DataFrame(Class_2_Samples)
Class_2_DF['Class']=2
# merging all samples to one DataFrame
df=pd.concat([Class_1_DF,Class_2_DF],axis=0)

LibsndfileError: Error opening './Class_1_Split_Files/Under Pressure (Remastered 2011).mp3': System error.

In [29]:
# importing required libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# creating test and train data
X=df.drop('Class',axis=1)
X=sm.tools.add_constant(X)
y=df['Class']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
# setting up pipeline
numeric_columns=X.columns.tolist()
numeric_pipeline=Pipeline([('numscaler',MinMaxScaler())])
ct=ColumnTransformer([('numpipe',numeric_pipeline,numeric_columns)])
knn_pipe=Pipeline([('preprocess',ct),
                   ('model',KNeighborsClassifier())])
# setting up gridsearch 
grid_params = {'model__n_neighbors': [3, 5, 7, 9, 11, 15],
               'model__weights': ['uniform', 'distance'],
               'model__metric': ['euclidean', 'manhattan']}
grid = GridSearchCV(estimator=knn_pipe, param_grid=grid_params, 
                    cv=5, n_jobs=-1)
# fitting and predicting
grid.fit(X_train, y_train)
y_pred=grid.predict(X_test)
y_pred_prob=grid.predict_proba(X_test)
# creating and displaying classification report
clf_report=classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(clf_report)