# Exploring Audio Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import sklearn.preprocessing

#We'll first upload the audio file to the notebook. For this, we use 'librosa'.
#Librosa is a python package for music and audio analysis. The package has been already installed. 
#Let us first import the library.

# Librosa (the mother of audio files)
import librosa, IPython
import librosa.display
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')

The first step in any data science project is data analysis. 
We are working with audio data here. Let's first try listening to one of these audio files.

In [2]:
#To listen to the audio file, we simply use display' command and specify the path to the audio.
IPython.display.Audio('Data/genres_original/disco/disco.00006.wav')

ValueError: rate must be specified when data is a numpy array or list of audio samples.

In [None]:
#Let's try to load the audio file 'disco.00005.wav' using librosa
#Notice that when we load the file using librosa, we get two outputs:
#Sound: sequence of vibrations in varying pressure strengths (y)
#The sample rate (sr) is the number of samples of audio carried per second, measured in Hz or kHz

y, sr = librosa.load('Data/genres_original/disco/disco.00005.wav')
y_1, sr_1 = librosa.load('Data/genres_original/metal/metal.00005.wav')
y_2, sr_2 = librosa.load('Data/genres_original/blues/blues.00005.wav')
y_3, sr_3 = librosa.load('Data/genres_original/jazz/jazz.00005.wav')
y_4, sr_4 = librosa.load('Data/genres_original/classical/classical.00005.wav')

#Let's see what y and sr look like

print('y:', y, '\n')
print('Sample Rate (KHz):', sr, '\n')

In [None]:
#Just to verify, we can ensure that the total length of y, i.e, the total number of times sampling has been done, is equal to number of samplings per second times the length of the audio in seconds.

print('y shape:', np.shape(y), '\n')  #gives the total number of times sampling has been done

# Verify length of the audio
print('Check Len of Audio:', np.shape(y)[0]/sr)

In [None]:
#As expected, the length of the audio is around 30 sec.

# Data Analysis and Visualization

Waveform

In [None]:
#To visualize the amplitude of the pressure waves vs time

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y = y , sr = sr, color='grey');
plt.title("Waveform of Disco 05", fontsize = 20);

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y = y_1 , sr = sr_1, color='grey');
plt.title("Waveform of Metal 05", fontsize = 20);

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y = y_2 , sr = sr_2, color='grey');
plt.title("Waveform of Blues 05", fontsize = 20);

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y = y_3 , sr = sr_3, color='grey');
plt.title("Waveform of Jazz 05", fontsize = 20);

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y = y_4 , sr = sr_4, color='grey');
plt.title("Waveform of Classical 05", fontsize = 20);


Waveform in frequency domain

In [None]:
#To visualize in frequency domain we fourier transform the amplitude vs time data into Magnitude vs frequency.
FT_y = np.abs(librosa.stft(y))   

plt.figure(figsize = (20, 10))
plt.plot(FT_y);                  
plt.title("Frequency Graph of Disco 05", fontsize = 20);

Spectrogram 

In [None]:
#Spectrogram : It it is visual representation of spectrum of frequencies of sound at different points of time
DB = librosa.amplitude_to_db(FT_y)                   #Convert the amplitude in units of decibels
plt.figure(figsize = (20, 5))
librosa.display.specshow(DB, sr = sr, x_axis = 'time', y_axis = 'hz', cmap = 'plasma')
plt.title("Spectrogram of Disco 05", fontsize = 20);
plt.colorbar();

In [None]:
#Since most of the lower frequencies have higher amplitude, we need to to use log scale on y axis to look at this plot properly
                 
plt.figure(figsize = (20, 5))
librosa.display.specshow(DB, sr = sr, x_axis = 'time', y_axis = 'log', cmap = 'plasma')
plt.title("Spectrogram of Disco 05", fontsize = 20);
plt.colorbar();

Spectral Centroids

In [None]:
#Spectral centroids depict where the centre of mass of a sound is located
#first we form a list of spectral centroids
centroids = librosa.feature.spectral_centroid(y=y_1, sr=sr_1)[0]
centroids_nor = sklearn.preprocessing.minmax_scale(centroids, axis=0)       #normalize these centroids
           
t = librosa.frames_to_time(range(len(centroids)))                          #We form the variable representing time

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y=y_1, sr=sr_1, alpha=0.7, color='grey');
plt.plot(t, centroids_nor, color = 'red');                                #spectral centroid graph for metal song
plt.title("Spectral Centroids of Metal 05", fontsize = 20);

Spectral Rolloff

In [None]:
# Spectral Rolloff represents the frequency below which a fraction of total energy lies
rolloff = librosa.feature.spectral_rolloff(y=y_1, sr=sr_1)[0]
rolloff_nor = sklearn.preprocessing.minmax_scale(rolloff, axis=0) 

t = librosa.frames_to_time(range(len(rolloff)))    

plt.figure(figsize = (20, 10))
librosa.display.waveshow(y, sr=sr, alpha=0.7, color='grey');
plt.plot(t, rolloff_nor, color='red');
plt.title("Spectral Rolloff of Metal 05", fontsize = 20);

Chromagram

In [None]:
#Chromagram is a plot where spectrum of frequencies are classified into 12 chroma of an octave.
chromagram = librosa.feature.chroma_stft(y=y, sr=sr)
plt.figure(figsize=(20, 10))
librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma', cmap='plasma')
plt.colorbar();
plt.title("Chromagram of Disco 05", fontsize = 20);

In [None]:
df = pd.read_csv('Data/features_30_sec.csv')              #Load the 30 sec features file
df

In [None]:
df.info()

In [None]:
df.isna().sum().sum()                    #Checking for null entries

In [None]:
#To visualize no of times the waveform crosses 0, we use the zero_crossing_rate_mean cloumn for different genres

genres = df['label'].unique()
zeroes =[]
for i in genres:
    zeroes.append(df[df['label']== i]['zero_crossing_rate_mean'].mean())
d1 = df[['label','zero_crossing_rate_mean']]

plt.figure(figsize = (15, 5))
plt.bar(genres, zeroes, color='green')
plt.xlabel("Genre", fontsize = 10)
plt.ylabel("Average no. of zero crossings")
plt.title("No. of zero crossings by Bargraph", fontsize = 20);

In [None]:
#This can be better represented by a violinplot
plt.figure(figsize = (15, 5))
sns.violinplot(x = 'label', y = 'zero_crossing_rate_mean', data = df)
plt.xlabel("Genre")
plt.ylabel("Average no. of zero crossings")
plt.title("No. of zero crossings by Violin Plot", fontsize = 20);

In [None]:
#Similarly we can measure the Beats Per Minute for different genres using tempo column
plt.figure(figsize = (15, 5))
sns.violinplot(x = 'label', y = 'tempo', data = df)
plt.xlabel("Genre")
plt.ylabel("BPM")
plt.title("Beats Per Minute", fontsize = 20);

In [None]:
#To get a correlation heatmap, we choose the mean variables and plot a heatmap of it
#df_mean is list of columns that chas mean values of some parameter

df_mean = []
for i in df.columns:
    if 'mean' in i:
        df_mean.append(i)

correlation = df[df_mean].corr()
plt.figure(figsize = (30,25))
sns.heatmap(correlation, cmap='plasma')
sns.set(font_scale=1.75)
plt.title("Correlation heatmap for mean variables", fontsize = 20);

# Classification using Machine Learning Algorithms

For learning from the data, we will use the file 'features_3_sec.csv' as it has the maximum amount of information about the data

In [None]:
#Let us first load the csv file

data = pd.read_csv('Data/features_3_sec.csv')
data

In [None]:
data.isna().sum().sum()               #check for null entries

In [None]:
#AS there are 9990 rows, lets check which label has how many of them
for i in data['label'].unique():
    print( str(i) + '     ' + str(data[data['label']==i].shape[0]))
    

In [None]:
#Since all the missing files are not in one genre, we can ignore 1-2 files missing in each genre.

In [None]:
#Since we do not intend to use the file name for genre prediction, we will drop that column
#Also, since the duration of each entry in the dataset is 3 secs, we might as well that drop that column 
data = data.drop(['filename','length'], axis=1)
data

In [None]:
#Clearly, our target variable is the column 'label' whereas columns from 'chroma_stft_mean' to mfcc20_var' are the features
#let's extract the features in another dataframe by dropping the labels column

features = data.drop('label', axis=1)
features

In [None]:
#We can also store the target variable 'label' as a separate dataframe

labels = data['label']
labels

In [None]:
#Also, we know that before applying any machine learning algorithm, it is better to normalize the data.
#So let us first normalize the data

from sklearn import preprocessing

#### NORMALIZE X ####

# Normalize so everything is on the same scale. 

cols = features.columns
min_max_scaler = preprocessing.MinMaxScaler()
new_features = min_max_scaler.fit_transform(features)

# new data frame with the new scaled data. 
features_scaled = pd.DataFrame(new_features, columns = cols)

In [None]:
#append the 'labels' column again to the normalized features dataframe
scaled_data = features_scaled
scaled_data['label'] = labels
scaled_data
#This is the scaled dataset with labels

# MACHINE LEARNING

### Applying different machine learning models

We will try the following classification models and choose the one which gives the best accuracy using K-fold cross validation for the test set:  
• Logistic regression  
• Decision Tree  
• Random Forest  
• Support Vector Classifier  
• K nearest neighbours  
• Naive bayes  


In [None]:
#importing the relevant libraries for all the models listed above

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

#Also, we will be using K-fold cross validation to determine the accuracy of a given model
#So let us import the required libraries

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
#Kfold CV using shuffle=False
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
SVM = SVC()
KNN = KNeighborsClassifier()
NB = GaussianNB()

models = [LR,DT,RF,SVM,KNN,NB]
          
for model in models:
    accuracy = np.zeros(5)
    i = 0
    kf = KFold(n_splits=5, shuffle=False, random_state=None)
    
    for train_index, test_index in kf.split(features):
        print("TRAIN:", train_index, "TEST:", test_index)
        features_train, features_test = features.iloc[train_index], features.iloc[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]
        model.fit(features_train, labels_train)
        labels_pred = model.predict(features_test)
        #print('Accuracy:' round(accuracy_score(labels_test, labels_pred), 5))
        accuracy[i] = round(accuracy_score(labels_test, labels_pred), 5)
        i = i + 1
    print(accuracy)
    print(str(model) + 'Accuracy: ' + str(accuracy.mean()))        

In [None]:
#Kfold CV using shuffle=True
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
SVM = SVC()
KNN = KNeighborsClassifier()
NB = GaussianNB()

models = [LR,DT,RF,SVM,KNN,NB]
          
for model in models:
    accuracy = np.zeros(5)
    i = 0
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    
    for train_index, test_index in kf.split(features):
        print("TRAIN:", train_index, "TEST:", test_index)
        features_train, features_test = features.iloc[train_index], features.iloc[test_index]
        labels_train, labels_test = labels[train_index], labels[test_index]
        model.fit(features_train, labels_train)
        labels_pred = model.predict(features_test)
        #print('Accuracy:' round(accuracy_score(labels_test, labels_pred), 5))
        accuracy[i] = round(accuracy_score(labels_test, labels_pred), 5)
        i = i + 1
    print(accuracy)
    print(str(model) + 'Accuracy: ' + str(accuracy.mean()))        

Clearly, Random Forest Classifier has the highest accuracy.  
So let us go ahead with the Random Forest Classifier.

In [None]:
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(features,labels, train_size=0.85)

RF.fit(features_train, labels_train)

In [None]:
labels_pred = RF.predict(features_test)

print('Accuracy', ':', round(accuracy_score(labels_test, labels_pred), 5))

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(labels_test, labels_pred)

As we can see, most of the labels are correctly predicted by the model!

Let's see if the model gives the correct output for a particular input.

In [None]:
input_features = np.array(features_test.iloc[46])
print('actual label=' + labels_test.iloc[46])

In [None]:
input_features = [input_features]
predicted_output = RF.predict(input_features)
print("X=%s, Predicted=%s" % (input_features[0], predicted_output[0]))

We can see that the predicted output is correct!

# References

https://librosa.org/doc/latest/index.html  
https://towardsdatascience.com/how-to-split-data-into-three-sets-train-validation-and-test-and-why-e50d22d3e54c  
https://towardsdatascience.com/top-machine-learning-algorithms-for-classification-2197870ff501  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
