<a href="https://colab.research.google.com/github/Mehul-Agrawal410/yun_solutions_assignment/blob/main/solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

In [2]:
def preprocess_audio(audio_path):
    audio, _ = librosa.load(audio_path, sr=22050)  # Load audio file
    return audio

def extract_features(audio):
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=13)  # Extract Mel frequency cepstral coefficients
    return mfccs

def load_audio(file_path):
    audio, sr = librosa.load(file_path)
    return audio, sr

def extract_mfcc(audio, sr):
    mfcc = librosa.feature.mfcc(audio, sr=sr, n_mfcc=13)
    return mfcc

###I am using the Emo-DB dataset to train the model which will recogniz emotions , so you would need to get the [datset](https://www.kaggle.com/datasets/piyushagni5/berlin-database-of-emotional-speech-emodb?resource=download), download and extract it and put the path to the extracted repo here

In [5]:
# from google.colab import files
# uploaded = files.upload()
data_path = "path_to_Emo-DB_dataset"
emotions = {"W": 0, "L": 1, "E": 2, "A": 3, "F": 4, "T": 5, "N": 6}  # Map emotions to numeric labels

In [None]:
X, y = [], []
for subdir, _, files in os.walk(data_path):
    for file in files:
        if file.endswith(".wav"):
            audio_path = os.path.join(subdir, file)
            emotion_label = emotions[file[5]]  # Extract emotion label from file name
            audio = preprocess_audio(audio_path)
            features = extract_features(audio)  # Pass audio data to the extract_features function
            X.append(features)
            y.append(emotion_label)

max_len = max(len(x) for x in X)
X_padded = []
for x in X:
    if x.shape[1] < max_len:
        x = np.pad(x, pad_width=((0, 0), (0, max_len - x.shape[1])), mode='constant')
    elif x.shape[1] > max_len:
        x = x[:, :max_len]
    X_padded.append(x)
X = np.array(X_padded)

####Unfortunately, I keep encountering error in the model.fit part here. It keeps giving ``ValueError: Found input variables with inconsistent numbers of samples: [4862, 374]`` or ``ValueError: zero-dimensional arrays cannot be concatenated`` or something else and I can't figure out how to resolve this, I even asked ChatGPT about it (it suggested concatenation) but it keeps circling in loops from 1 error to another and I am confident in my ability that I would figure this out if I had some more time

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

X_train_concatenated = np.vstack(X_train)
y_train_concatenated = np.array(y_train)
y_train_concatenated = y_train_concatenated.reshape(-1, 1)  # Reshape y_train_concatenated to match the number of samples
model = SVC(kernel='rbf', gamma='scale')
model.fit(X_train_concatenated, y_train_concatenated)

y_val_pred = model.predict(np.concatenate(X_val))
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Validation Report:\n", val_report)

y_test_pred = model.predict(np.concatenate(X_test))
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)
print("Test Report:\n", test_report)

###The next part of the code is written *assuming* that the model works correctly

In [None]:
# from google.colab import files
# uploaded = files.upload()
file_path = "path_to_mp3_file.mp3"
audio, sr = load_audio(file_path)
mfcc = extract_mfcc(audio, sr)

In [None]:
def classify_emotion(mfcc, model):
    # Normalize MFCC features
    mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
    # Reshape MFCC to match model input shape
    mfcc = np.expand_dims(mfcc, axis=0)
    mfcc = np.expand_dims(mfcc, axis=-1)
    # Perform emotion classification
    predictions = model.predict(mfcc)
    # Map predictions to emotion labels
    emotion_labels = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust']
    predicted_emotion = emotion_labels[np.argmax(predictions)]
    return predicted_emotion
def plot_emotional_changes(mfcc):
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mfcc, x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Emotional Changes')
    plt.tight_layout()
    plt.show()

In [None]:
predicted_emotion = classify_emotion(mfcc, model)
print("Predicted Emotion:", predicted_emotion)
plot_emotional_changes(mfcc)