-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
146 lines (120 loc) · 4.79 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import streamlit as st
import librosa
import numpy as np
from keras.models import load_model
import soundfile as sf
import tempfile
import os
import base64
import pyaudio
import wave
import tempfile
import pickle
# System packages needed for PyAudio (Debian/Ubuntu apt names):
#   portaudio19-dev, python3-pyaudio
# Load the pickled speaker model used for voice authentication.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# 'speaker_model.pkl' must come from a trusted source.
with open('speaker_model.pkl', 'rb') as speaker_model_file:
    speaker_model = pickle.load(speaker_model_file)

# Load the trained LSTM model used for emotion prediction.
model = load_model('./model.h5')
# Function to start audio recording
def record_audio(seconds=5, sample_rate=44100, channels=2, chunk_size=1024):
    """Record audio from a PyAudio input stream and return the raw bytes.

    Args:
        seconds: Recording length in seconds.
        sample_rate: Sampling rate passed to the stream, in Hz.
        channels: Number of input channels.
        chunk_size: Frames requested per read (also the stream buffer size).

    Returns:
        The recorded 16-bit (paInt16) sample data as a single ``bytes``
        object, made by concatenating every chunk read from the stream.
    """
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=pyaudio.paInt16,
                            channels=channels,
                            rate=sample_rate,
                            input=True,
                            frames_per_buffer=chunk_size)
        try:
            # Whole chunks only: a trailing partial chunk is dropped.
            chunk_count = int(sample_rate / chunk_size * seconds)
            frames = [stream.read(chunk_size) for _ in range(chunk_count)]
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release PyAudio, including when opening the stream fails.
        audio.terminate()
    return b''.join(frames)
# Function to save a temp .wav file
def save_wav_file(frames, file_path, channels=2, sample_rate=44100,
                  sample_width=2):
    """Write raw PCM bytes to ``file_path`` as a WAV file.

    The defaults (2 channels, 44100 Hz, 2-byte samples) reproduce the
    previously hard-coded format, which matches the defaults of
    ``record_audio`` (16-bit paInt16 capture).

    Args:
        frames: Raw sample data as a ``bytes`` object.
        file_path: Destination path of the WAV file.
        channels: Number of audio channels to declare in the header.
        sample_rate: Sampling rate in Hz to declare in the header.
        sample_width: Bytes per sample; 2 corresponds to 16-bit audio.
    """
    # The sample width is a constant of the 16-bit format, so no PyAudio
    # instance is created (and left unterminated) just to look it up.
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(frames)
# Function to extract MFCC features from audio file
def extract_mfcc(filename):
    """Return the time-averaged MFCC vector of an audio file.

    Up to 3 seconds of audio are loaded, starting 0.5 seconds into the
    file. 40 MFCCs are computed and then averaged over axis 0 of the
    transposed MFCC matrix.

    Args:
        filename: Audio source accepted by ``librosa.load``.

    Returns:
        The averaged MFCC values as a NumPy array.
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    averaged = np.mean(coefficients.T, axis=0)
    return averaged
# Function to predict emotion from audio file
def predict_emotion(filename, model):
    """Predict the emotion label for an audio file.

    Args:
        filename: Audio source forwarded to ``extract_mfcc``.
        model: Object exposing ``predict``; the index of its highest
            output selects the emotion label.

    Returns:
        One of 'Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad',
        'Surprise'.
    """
    emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
    features = extract_mfcc(filename)
    # Add a leading and a trailing axis before handing the features to
    # the model (presumably batch and channel axes - confirm against the
    # model's input shape).
    model_input = features[np.newaxis, ..., np.newaxis]
    scores = model.predict(model_input)
    return emotions[np.argmax(scores)]
# Authentication Functions
def extract_mfcc_padded(audio_file, sr=22050, n_mfcc=13, max_len=200):
    """Compute MFCCs with a fixed number of columns.

    Args:
        audio_file: Audio source accepted by ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load``.
        n_mfcc: Number of MFCCs to compute.
        max_len: Exact size of the second axis of the result.

    Returns:
        The MFCC matrix, truncated to ``max_len`` columns or zero-padded
        on the right up to ``max_len`` columns.
    """
    samples, sr = librosa.load(audio_file, sr=sr)
    mfccs = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=n_mfcc)
    column_count = mfccs.shape[1]
    if column_count >= max_len:
        # Long enough already: keep only the first max_len columns.
        return mfccs[:, :max_len]
    # Too short: append constant (zero) columns on the right.
    missing = max_len - column_count
    return np.pad(mfccs, pad_width=((0, 0), (0, missing)), mode='constant')
def train_speaker_model(speaker_data, reg_covar=1e-6):
    """Fit a 6-component diagonal-covariance Gaussian mixture.

    NOTE(review): ``GaussianMixture`` is not imported in the visible part
    of this file, so calling this function would raise ``NameError``
    unless the name is provided elsewhere - confirm and add the import.

    Args:
        speaker_data: Sequence of arrays that are vertically stacked to
            form the training matrix.
        reg_covar: Regularisation value forwarded to ``GaussianMixture``.

    Returns:
        The fitted mixture model.
    """
    mixture = GaussianMixture(
        n_components=6,
        covariance_type='diag',
        reg_covar=reg_covar,
    )
    training_matrix = np.vstack(speaker_data)
    mixture.fit(training_matrix)
    return mixture
# Main function
def main():
    """Render the "record now" flow of the page.

    Shows a duration slider and a Record button. When the button is
    pressed, audio is recorded, written to a temporary WAV file, played
    back, and scored against ``speaker_model``. If the score exceeds the
    threshold, the predicted emotion is displayed; otherwise an
    authentication-failure message is shown. The temporary file is
    removed before returning.
    """
    # threshold for authentication (compared with the model's score)
    threshold = -7300
    st.title("SER by MAZER")
    st.title("Record now")
    seconds_to_record = st.slider("Record time (seconds)", 0, 10, 5)
    record_button = st.button("Record")
    if not record_button:
        return

    st.write("Recording...")
    frames = record_audio(seconds=seconds_to_record + 1)
    st.write("Recording complete!")

    # Reserve a temporary path, then close the handle before writing to it:
    # re-opening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        audio_path = tmp_file.name
    try:
        # Save the recorded audio to the temporary WAV file
        save_wav_file(frames, audio_path)
        # Hand the bytes to the player so it does not depend on the file
        # still existing after this function cleans up.
        with open(audio_path, 'rb') as wav_file:
            st.audio(wav_file.read(), format='audio/wav')
        test_mfcc = extract_mfcc_padded(audio_path)
        log_likelihood = speaker_model.score(test_mfcc)
        st.write(f"Log Likelihood: {log_likelihood}")
        if log_likelihood > threshold:
            # Speaker verified. Proceed with speech emotion recognition.
            predicted_emotion = predict_emotion(audio_path, model)
            st.write(f"Predicted Emotion: {predicted_emotion}")
        else:
            # Authentication failed. Access denied.
            st.write("Authentication failed. Access denied.")
        # TODO: provide download link for the WAV file (not implemented)
    finally:
        # delete=False means nothing else removes the file.
        os.remove(audio_path)
if __name__ == "__main__":
    main()

# Alternative flow: authenticate and analyse an uploaded WAV file.
st.title("OR")
uploaded_file = st.file_uploader("Upload an audio file", type=["wav"])
if uploaded_file is not None:
    # NOTE(review): this threshold (-7200) differs from the -7300 used in
    # main() - confirm whether that is intentional.
    threshold = -7200
    st.audio(uploaded_file, format='audio/wav')
    # The same file object is read several times below; rewind it before
    # each read so every consumer starts at the beginning of the data.
    uploaded_file.seek(0)
    test_mfcc = extract_mfcc_padded(uploaded_file)
    log_likelihood = speaker_model.score(test_mfcc)
    st.write(f"Log Likelihood: {log_likelihood}")
    if log_likelihood > threshold:
        # Speaker verified. Proceed with speech emotion recognition.
        uploaded_file.seek(0)
        predicted_emotion = predict_emotion(uploaded_file, model)
        st.write(f"Predicted Emotion: {predicted_emotion}")
    else:
        # Authentication failed. Access denied.
        st.write("Authentication failed. Access denied.")
# should not recognise the voice of a particular person
# real time audio input