In [None]:
!pip install tensorflow



In [None]:
# Mount Google Drive at /content/drive so the dataset folders referenced by
# later cells (under /content/drive/MyDrive) can be read.
# This cell imports google.colab, i.e. it assumes a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Feature extraction

In [None]:
import glob
import os
import matplotlib
import librosa
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score

def featureExtraction(fileName):
    """Summarise one audio file as five frame-averaged librosa descriptors.

    Parameters
    ----------
    fileName : str
        Path passed unchanged to ``librosa.load`` (library defaults).

    Returns
    -------
    tuple
        ``(mfcc, chroma, mel, contrast, tonnetz)``. Each entry is the
        descriptor matrix averaged over its second axis, i.e. one value per
        descriptor row.
    """
    def frameMean(descriptor):
        # Transpose, then average over axis 0: collapses the second axis of
        # the matrix that librosa returned.
        return np.mean(descriptor.T, axis=0)

    signal, sampleRate = librosa.load(fileName)
    # Magnitude spectrogram, shared by the chroma and contrast descriptors.
    magnitude = np.abs(librosa.stft(signal))

    mfcc = frameMean(librosa.feature.mfcc(y=signal, sr=sampleRate, n_mfcc=40))
    chroma = frameMean(librosa.feature.chroma_stft(S=magnitude, sr=sampleRate))
    mel = frameMean(librosa.feature.melspectrogram(y=signal, sr=sampleRate))
    contrast = frameMean(librosa.feature.spectral_contrast(S=magnitude, sr=sampleRate))
    # Tonnetz is computed on the harmonic component only.
    harmonicPart = librosa.effects.harmonic(signal)
    tonnetz = frameMean(librosa.feature.tonnetz(y=harmonicPart, sr=sampleRate))
    return mfcc, chroma, mel, contrast, tonnetz

def parseAudio(parentDirectory, subDirectories, fileExtension="*.wav"):
    """Build a feature matrix and a label vector from class-named audio folders.

    Every file matching ``parentDirectory/<subDir>/<fileExtension>`` is passed
    to ``featureExtraction``; its five descriptor vectors are concatenated
    into one row.

    Parameters
    ----------
    parentDirectory : str
        Folder that contains one sub-folder per class.
    subDirectories : iterable of str
        Sub-folder names to scan, in the order their rows should appear.
    fileExtension : str, optional
        Glob pattern applied inside each sub-folder (default ``"*.wav"``).

    Returns
    -------
    features : numpy.ndarray, float64, shape (n_files, n_features)
        One row per file. When no file matches, the shape is ``(0, 193)``.
    labels : numpy.ndarray, int32, shape (n_files,)
        Devotional = 1, Happy = 2, Party = 3, Romantic = 4, Sad = 0. Any other
        folder name is also labelled 0.
    """
    # Devotional = 1, Happy = 2, Party = 3, Romantic = 4, Sad = 0
    labelOfFolder = {"Devotional": 1, "Happy": 2, "Party": 3, "Romantic": 4}
    defaultLabel = 0  # "Sad", and any folder name missing from the table

    # Collect rows in Python lists and convert once at the end; growing a
    # NumPy array with vstack/append inside the loop copies it every time.
    rows, labels = [], []
    for subDir in subDirectories:
        pattern = os.path.join(parentDirectory, subDir, fileExtension)
        # glob returns files in arbitrary order; sort so row order is stable.
        for fn in sorted(glob.glob(pattern)):
            rows.append(np.hstack(featureExtraction(fn)))
            labels.append(labelOfFolder.get(subDir, defaultLabel))

    if not rows:
        # No row exists to infer the width from, so keep the historical shape.
        return np.empty((0, 193)), np.empty(0, dtype=np.int32)
    return np.array(rows, dtype=np.float64), np.array(labels, dtype=np.int32)

In [None]:
# Roots of the two splits on the mounted Drive. parseAudio looks for
# <root>/<sub-directory>/*.wav underneath each of them.
training = "/content/drive/MyDrive/training"
test = "/content/drive/MyDrive/test"
# Folder names double as class names; parseAudio turns them into integer labels.
subDirectories = ["Devotional", "Happy", "Party", "Romantic", "Sad"]
# Feature matrix and label vector for each split (one row per audio file).
trainingFeatures, trainingLabels = parseAudio(training, subDirectories)
testFeatures, testLabels = parseAudio(test, subDirectories)

  raw, rate = librosa.load(fileName)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  raw, rate = librosa.load(fileName)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


## K-Means clustering

In [None]:
# Pin the seed and the number of restarts so the clustering, and every
# accuracy derived from it, is identical on each Restart & Run All.
model = KMeans(n_clusters=5, n_init=10, random_state=42)
model.fit(trainingFeatures)

# Cluster index assigned to each training sample by the fitted model.
predicted_labels = model.labels_

# Aligning KMeans clustering results with trainingLabels
from scipy.stats import mode

# Find the best mapping of clustered labels to true labels
def map_labels(true_labels, predicted_labels):
    """Relabel every cluster with the most frequent true label of its members.

    Parameters
    ----------
    true_labels : array-like of int, shape (n_samples,)
        Ground-truth class of each sample.
    predicted_labels : array-like of int, shape (n_samples,)
        Cluster id assigned to each sample.

    Returns
    -------
    numpy.ndarray
        Same shape and dtype as ``predicted_labels``; each entry is the
        majority true label of the cluster the sample belongs to. Ties go to
        the smallest label value.
    """
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    label_map = np.zeros_like(predicted_labels)
    # Visit only the cluster ids that actually occur. A fixed range(5) would
    # compute the mode of an empty selection for unused clusters and would
    # silently ignore any id >= 5.
    for cluster in np.unique(predicted_labels):
        mask = (predicted_labels == cluster)
        # np.unique returns sorted values, so argmax picks the smallest of
        # the most frequent labels.
        values, counts = np.unique(true_labels[mask], return_counts=True)
        label_map[mask] = values[np.argmax(counts)]
    return label_map

def print_annotated_confusion_matrix(matrix, class_names):
    """Print each confusion-matrix row followed by the name of its true class."""
    print("Confusion Matrix with Annotations:")
    for i, row in enumerate(matrix):
        print(f"{row} {class_names[i]}")


# Class names indexed by the integer label used in trainingLabels / testLabels.
labels = ["Sad", "Devotional", "Happy", "Party", "Romantic"]
label_ids = list(range(len(labels)))

mapped_labels = map_labels(trainingLabels, predicted_labels)

# Calculate the Accuracy and the Confusion Matrix
accuracy = accuracy_score(trainingLabels, mapped_labels)
print(f"Accuracy on training data: {accuracy:.4f}")

# Passing labels= keeps the matrix len(labels) x len(labels) even when a class
# never occurs, so row i always corresponds to labels[i].
cm = confusion_matrix(trainingLabels, mapped_labels, labels=label_ids)
print_annotated_confusion_matrix(cm, labels)

# Test the model
test_predicted_labels = model.predict(testFeatures)

# Reuse the cluster -> class assignment learned on the TRAINING split.
# Deriving it again from testLabels would let the test labels decide the
# predictions and inflate the reported test accuracy.
cluster_to_label = {
    int(cluster): mapped_labels[predicted_labels == cluster][0]
    for cluster in np.unique(predicted_labels)
}
mapped_test_labels = np.array(
    [cluster_to_label[int(cluster)] for cluster in test_predicted_labels],
    dtype=mapped_labels.dtype,
)

# Calculate the Accuracy and the Confusion Matrix
test_accuracy = accuracy_score(testLabels, mapped_test_labels)
print(f"Accuracy on test data: {test_accuracy:.4f}")

test_cm = confusion_matrix(testLabels, mapped_test_labels, labels=label_ids)
print_annotated_confusion_matrix(test_cm, labels)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
# NOTE(review): the feature columns are passed in unscaled; a distance-based
# model may be dominated by the largest-magnitude columns - confirm whether
# standardisation was intended.
knn.fit(trainingFeatures, trainingLabels)

# Predict the test split from the same hand-crafted feature vectors.
y_pred = knn.predict(testFeatures)

# `accuracy` and `y_pred` are notebook-wide names that other cells reassign.
accuracy = accuracy_score(testLabels, y_pred)
print(f"Accuracy on test data: {accuracy:.4f}")

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Linear-kernel support vector classifier trained on the hand-crafted
# feature vectors produced by parseAudio.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(trainingFeatures, trainingLabels)

# Predict the test split; this rebinds the notebook-wide name `y_pred`.
y_pred = svm_classifier.predict(testFeatures)

# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(testLabels, y_pred)
print(f'Accuracy on test data: {accuracy:.4f}')

Accuracy on test data: 0.4490


## CNN

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Feature extraction
def extract_mel_spectrogram(file_path, n_mels=128, n_timesteps=400):
    """Load an audio file and return a fixed-size, 3-channel log-mel spectrogram.

    Parameters
    ----------
    file_path : str
        Audio file to load. ``sr=None`` is passed to ``librosa.load``, so the
        file's own sampling rate is kept.
    n_mels : int, optional
        Number of mel bands (rows of the spectrogram).
    n_timesteps : int, optional
        Number of frames in the output. Longer clips are cropped from the
        start; shorter clips are padded at the end.

    Returns
    -------
    numpy.ndarray, shape (n_mels, n_timesteps, 3)
        Spectrogram in dB relative to the clip's maximum, copied into three
        identical channels so an RGB image model accepts it.
    """
    audio, sr = librosa.load(file_path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    n_frames = mel_spectrogram_db.shape[1]
    if n_frames < n_timesteps:
        # With ref=np.max the loudest bin is 0 dB and everything else is
        # negative, so zero padding would look like full-scale energy.
        # Pad with the quietest level in the clip instead. -80 dB is the
        # floor of power_to_db's default top_db, used only for empty input.
        floor_db = mel_spectrogram_db.min() if mel_spectrogram_db.size else -80.0
        mel_spectrogram_db = np.pad(
            mel_spectrogram_db,
            ((0, 0), (0, n_timesteps - n_frames)),
            mode='constant',
            constant_values=floor_db,
        )
    else:
        mel_spectrogram_db = mel_spectrogram_db[:, :n_timesteps]

    # Duplicate the single channel three times: (n_mels, n_timesteps, 3).
    mel_spectrogram_rgb = np.repeat(mel_spectrogram_db[..., np.newaxis], 3, axis=-1)
    return mel_spectrogram_rgb

def load_data(data_dir, n_timesteps=400):
    """Load every audio file under ``data_dir/<class>/`` as a mel-spectrogram.

    Parameters
    ----------
    data_dir : str
        Folder containing one sub-folder per class; the sub-folder name is
        used as the class label.
    n_timesteps : int, optional
        Forwarded to ``extract_mel_spectrogram`` as the fixed frame count.

    Returns
    -------
    X : numpy.ndarray
        Stacked spectrogram tensors, one per file.
    y : numpy.ndarray of str
        Class (folder) name of each file, aligned with ``X``.

    Notes
    -----
    Entries are visited in sorted order so the result does not depend on the
    filesystem's listing order. Hidden entries (names starting with ".") and
    top-level entries that are not directories are skipped instead of being
    treated as classes.
    """
    X, y = [], []

    for class_name in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, class_name)
        if class_name.startswith('.') or not os.path.isdir(class_dir):
            continue
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            if file_name.startswith('.') or not os.path.isfile(file_path):
                continue
            X.append(extract_mel_spectrogram(file_path, n_timesteps=n_timesteps))
            y.append(class_name)

    return np.array(X), np.array(y)

In [None]:
# Data preparation
# Spectrogram tensors and folder-name labels for both splits.
train_dir = '/content/drive/MyDrive/training'
test_dir = '/content/drive/MyDrive/test'
X_train, y_train = load_data(train_dir)
X_test, y_test = load_data(test_dir)

# Map class names to integer ids using the classes seen in the training split.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Fix the one-hot width to the number of known classes. Without num_classes,
# to_categorical sizes the output from the largest id present, so a split
# that lacks the last class would get narrower vectors than the model emits.
n_label_classes = len(le.classes_)
y_train_onehot = to_categorical(y_train_encoded, num_classes=n_label_classes)
y_test_onehot = to_categorical(y_test_encoded, num_classes=n_label_classes)

  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def build_vgg16_model(input_shape, num_classes):
    """Assemble a frozen ImageNet VGG16 backbone with a small dense classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to ``VGG16`` as ``input_shape``.
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    Model
        Uncompiled model mapping the backbone's input to class probabilities.
    """
    backbone = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

    # Only the new head is trained; every backbone layer keeps its weights.
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    # Classification head, applied in order on top of the backbone output.
    head = (
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    )
    tensor = backbone.output
    for head_layer in head:
        tensor = head_layer(tensor)

    return Model(inputs=backbone.input, outputs=tensor)

# Per-sample shape (batch axis dropped). This must run while X_train still
# holds the 4-D spectrogram tensors, i.e. before the Random Forest cell
# flattens it.
input_shape = X_train.shape[1:]  # (n_mels, n_timesteps, 3), i.e. (128, 400, 3) with load_data's defaults
num_classes = len(np.unique(y_train_encoded))
# Rebinds the notebook-wide name `model` (previously the KMeans estimator).
model = build_vgg16_model(input_shape, num_classes)

# categorical_crossentropy matches the one-hot targets built with to_categorical.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training
# NOTE(review): the test split is passed as validation_data, so the val_*
# metrics in the log and the evaluate() call below use the same samples.
history = model.fit(X_train, y_train_onehot, batch_size=32, epochs=50, validation_data=(X_test, y_test_onehot), verbose=1)

# Testing
# evaluate() returns the loss followed by the metrics given to compile().
test_loss, test_acc = model.evaluate(X_test, y_test_onehot)
print(f'Test accuracy: {test_acc:.4f}')

Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 1s/step - accuracy: 0.2941 - loss: 15.8293 - val_accuracy: 0.5306 - val_loss: 3.4787
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 152ms/step - accuracy: 0.5010 - loss: 2.8109 - val_accuracy: 0.4898 - val_loss: 1.2255
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 172ms/step - accuracy: 0.5985 - loss: 0.9684 - val_accuracy: 0.4694 - val_loss: 1.1871
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 155ms/step - accuracy: 0.6665 - loss: 0.8627 - val_accuracy: 0.5918 - val_loss: 1.0476
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 153ms/step - accuracy: 0.7349 - loss: 0.7685 - val_accuracy: 0.5306 - val_loss: 1.0912
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 175ms/step - accuracy: 0.7290 - loss: 0.6182 - val_accuracy: 0.5000 - val_loss: 1.0947
Epoch 7/50
[1m13/13[0m [3

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, LSTM, TimeDistributed, Reshape, Input, Permute
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np

def build_vgg16_lstm_model(input_shape, num_classes):
    """Assemble a frozen VGG16 backbone followed by an LSTM over time frames.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to ``VGG16`` as ``input_shape``.
        In this notebook it is ``(n_mels, n_timesteps, 3)``.
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    Model
        Uncompiled model mapping the backbone's input to class probabilities.
    """
    vgg = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

    # Freeze the pre-trained VGG16 layers; only the recurrent head is trained.
    for layer in vgg.layers:
        layer.trainable = False

    # The backbone output is (batch, rows, cols, channels). Rows come from the
    # mel axis of the input and cols from its time axis. TimeDistributed and
    # LSTM treat axis 1 as the sequence axis, so swap rows and cols first;
    # otherwise the LSTM would step through frequency bands instead of time.
    x = Permute((2, 1, 3))(vgg.output)
    # One flat feature vector per time step: (batch, cols, rows * channels).
    x = TimeDistributed(Flatten())(x)
    x = LSTM(128, return_sequences=False)(x)  # keep only the last state
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=vgg.input, outputs=x)
    return model


# Per-sample shape (batch axis dropped) and number of distinct encoded classes.
input_shape = X_train.shape[1:]
num_classes = len(np.unique(y_train_encoded))

# Build VGG16 + LSTM model
# Rebinds the notebook-wide name `model`, replacing the plain VGG16 classifier.
model = build_vgg16_lstm_model(input_shape, num_classes)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Training
# Trains whichever object `model` currently refers to (the VGG16 + LSTM model
# when the cells are run top to bottom).
# NOTE(review): the test split is passed as validation_data, so the val_*
# metrics in the log and the evaluate() call below use the same samples.
history = model.fit(X_train, y_train_onehot, batch_size=32, epochs=50, validation_data=(X_test, y_test_onehot), verbose=1)

# Testing
# evaluate() returns the loss followed by the metrics given to compile().
test_loss, test_acc = model.evaluate(X_test, y_test_onehot)
print(f'Test accuracy: {test_acc:.4f}')

Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 243ms/step - accuracy: 0.2306 - loss: 1.6250 - val_accuracy: 0.4490 - val_loss: 1.3915
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 209ms/step - accuracy: 0.5506 - loss: 1.2412 - val_accuracy: 0.5408 - val_loss: 1.2427
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 203ms/step - accuracy: 0.6535 - loss: 1.0408 - val_accuracy: 0.5204 - val_loss: 1.1048
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 205ms/step - accuracy: 0.8133 - loss: 0.6653 - val_accuracy: 0.5408 - val_loss: 1.0424
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.8844 - loss: 0.4664 - val_accuracy: 0.5714 - val_loss: 1.0629
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 202ms/step - accuracy: 0.9193 - loss: 0.3160 - val_accuracy: 0.5918 - val_loss: 0.9924
Epoch 7/50
[1m13/13[0m [3

## Random Forest

In [None]:
import os
import librosa
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Build model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Flatten each spectrogram tensor into a single feature row for the forest.
# The flattened copies get their own names on purpose: rebinding X_train /
# X_test would leave the 4-D tensors unavailable, and re-running any CNN cell
# afterwards would then fail on the wrong input shape.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Training (y_train holds the folder-name labels returned by load_data)
rf_model.fit(X_train_flat, y_train)

# Testing
y_pred = rf_model.predict(X_test_flat)

# Calculate the Accuracy and the Confusion Matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)