In [1]:
!pip install librosa soundfile numpy sklearn pyaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyaudio
  Downloading PyAudio-0.2.13.tar.gz (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 KB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: sklearn, pyaudio
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=448fdf646a1247667b3f303ffd63d4e7a197f0f139c49824c5773d8d4dae917b
  Stored in directory: /root/.cache/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
  [1;31merror[0m: [1msubprocess-exited

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [127]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [128]:
#DataFlair - Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
        return result

In [129]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised',
  'neutral': 'Neutral',
  'calm': 'Calm',
  'happy': 'Happy',
  'sad': 'Sad',
  'angry': 'Angry',
  'fearful': 'Fearful',
  'disgust': 'Disgust',
  'surprised': 'Surprised'
}
#DataFlair - Emotions to observe
observed_emotions=['happy', 'sad','calm','angry']

In [130]:
#DataFlair - Load the data and extract features for each sound file
def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("/content/drive/MyDrive/DataScienceAdvanced/Phase3:SER/TrainData/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [131]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

In [132]:
#DataFlair - Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

(576, 192)


In [133]:
#DataFlair - Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=512, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=600)

In [134]:
#DataFlair - Train the model


model.fit(x_train,y_train)

MLPClassifier(alpha=0.01, batch_size=512, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=600)

In [135]:
#DataFlair - Predict for the test set
y_pred=model.predict(x_test)
y_pred

array(['calm', 'sad', 'angry', 'angry', 'angry', 'happy', 'calm', 'calm',
       'calm', 'calm', 'calm', 'calm', 'happy', 'happy', 'calm', 'calm',
       'angry', 'calm', 'calm', 'calm', 'sad', 'happy', 'calm', 'happy',
       'happy', 'calm', 'calm', 'calm', 'happy', 'angry', 'angry', 'sad',
       'angry', 'calm', 'calm', 'angry', 'calm', 'calm', 'sad', 'calm',
       'sad', 'happy', 'calm', 'happy', 'happy', 'happy', 'angry',
       'happy', 'calm', 'calm', 'happy', 'calm', 'happy', 'calm', 'happy',
       'calm', 'calm', 'calm', 'angry', 'calm', 'happy', 'angry', 'calm',
       'calm', 'calm', 'sad', 'angry', 'angry', 'calm', 'calm', 'happy',
       'angry', 'calm', 'calm', 'calm', 'angry', 'calm', 'happy', 'angry',
       'happy', 'happy', 'sad', 'happy', 'angry', 'angry', 'happy',
       'calm', 'happy', 'angry', 'sad', 'happy', 'calm', 'happy', 'calm',
       'angry', 'happy', 'happy', 'calm', 'calm', 'calm', 'happy', 'calm',
       'sad', 'calm', 'happy', 'sad', 'angry', 'angry

In [136]:
#DataFlair - Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

#DataFlair - Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 70.31%


In [139]:
# Define function to extract audio features
def extract_feature(audio_data, sample_rate, mfcc=True, chroma=True, mel=True):
    feature_list = []
    if chroma:
        stft = np.abs(librosa.stft(audio_data))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        feature_list.append(chroma)
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40).T,axis=0)
        feature_list.append(mfccs)
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(audio_data, sr=sample_rate).T,axis=0)
        feature_list.append(mel)
    return np.concatenate(feature_list)

# Load audio file
audio_file = '/content/30 Second Coca Cola Commercial.wav'
y, sr = librosa.load(audio_file, mono=False)

# Convert stereo audio to monophonic audio
y_mono = librosa.to_mono(y)

# Split audio into 3-second chunks
chunk_length = sr * 3# 3 seconds
chunks = librosa.util.frame(y_mono, frame_length=chunk_length, hop_length=chunk_length)

# Extract features from each chunk
features = []
for chunk in chunks.T:
    feature = extract_feature(chunk, sample_rate=sr, mfcc=True, chroma=True, mel=True)
    features.append(feature)

# Reshape audio features
features = np.array(features)
n_samples = features.shape[0]
features = features.reshape(n_samples, -1) # reshape to have 180 features

# Predict emotion label
predicted_emotions = model.predict(features)

# Map predicted emotion codes to emotion labels
predicted_emotions = [emotions[prediction] for prediction in predicted_emotions]

# Print predicted emotion labels for each chunk
for i, emotion in enumerate(predicted_emotions):
    print(f"Chunk {i+1}: {emotion}")


Chunk 1: Angry
Chunk 2: Angry
Chunk 3: Angry
Chunk 4: Angry
Chunk 5: Angry
Chunk 6: Angry
Chunk 7: Angry
Chunk 8: Angry
Chunk 9: Angry


In [126]:
from collections import Counter

# Predict emotion labels for each chunk
# Map predicted emotion codes to emotion labels
predicted_emotions = [emotions[prediction.lower()] for prediction in predicted_emotions]

# Count occurrences of each emotion
emotion_counts = Counter(predicted_emotions)

# Get emotion with the highest count
majority_emotion = emotion_counts.most_common(1)[0][0]

print('Majority emotion:', majority_emotion)


Majority emotion: Angry
