In [None]:
!nvidia-smi

In [1]:
! pip install --upgrade tensorflow keras librosa==0.9.1

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting keras
  Downloading keras-3.1.1-py3-none-any.whl.metadata (5.6 kB)
Collecting librosa==0.9.1
  Downloading librosa-0.9.1-py3-none-any.whl.metadata (6.9 kB)
Collecting resampy>=0.2.2 (from librosa==0.9.1)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting tensorboard<2.17,>=2.16 (from tensorflow)
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting optree (from keras)
  Downloading optree-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading librosa-0.9.1-py3-none-any.whl (213 kB)
[2K   [90

In [34]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import GRU, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense

#### Data Preprocessing

In [10]:
# Load train filenames and transcriptions from CSV
train_df = pd.read_csv('/kaggle/input/train-metadata-final/metadata_train_final.csv')
train_df['intent'] = train_df['intent'].str.lower()

# Load validation filenames and transcriptions from CSV
# validate_df = pd.read_csv('metadata_validate_final.csv')
# validate_df['intent'] = validate_df['intent'].str.lower()

In [11]:
train_df

Unnamed: 0,file_name,label,intent
0,1249120_43453425_58166571.wav,10,mental health
1,1249120_43719934_43347848.wav,6,hair and skin issues
2,1249120_43719934_53187202.wav,0,chest pain
3,1249120_31349958_55816195.wav,8,injury
4,1249120_43719934_82524191.wav,8,injury
...,...,...,...
5890,1249120_14353703_45949288.wav,1,digestive issues
5891,1249120_15004831_26452554.wav,8,injury
5892,1249120_15004831_64958100.wav,11,muscle pain
5893,1249120_15830408_92962528.wav,0,chest pain


In [12]:
train_files = []
train_intents = []

for file_name, intent in zip(train_df['file_name'], train_df['intent']):
    train_files.append(f"/kaggle/input/train-audio-classification/train/{file_name}")
    train_intents.append(intent)

In [13]:
train_df = pd.DataFrame({
    'filename': train_files,
    'intent': train_intents
})

In [14]:
train_df

Unnamed: 0,filename,intent
0,/kaggle/input/train-audio-classification/train...,mental health
1,/kaggle/input/train-audio-classification/train...,hair and skin issues
2,/kaggle/input/train-audio-classification/train...,chest pain
3,/kaggle/input/train-audio-classification/train...,injury
4,/kaggle/input/train-audio-classification/train...,injury
...,...,...
5890,/kaggle/input/train-audio-classification/train...,digestive issues
5891,/kaggle/input/train-audio-classification/train...,injury
5892,/kaggle/input/train-audio-classification/train...,muscle pain
5893,/kaggle/input/train-audio-classification/train...,chest pain


In [16]:
train_df['intent'].value_counts()

intent
muscle pain             1255
hair and skin issues     764
injury                   664
foot pain                472
respiratory issue        470
sensory issue            458
dizziness                256
chest pain               231
head ache                231
feeling cold/hot         230
digestive issues         230
general weakness         215
internal pain            215
mental health            204
Name: count, dtype: int64

In [17]:
min_samples_per_intent = train_df.groupby('intent').size().min()

# Sample the DataFrame for each intent class to get the desired number of samples
train_df = train_df.groupby('intent', group_keys=False).apply(lambda x: x.sample(min_samples_per_intent))

# Reset index of the sampled DataFrame
train_df.reset_index(drop=True, inplace=True)

In [18]:
train_df['intent'].value_counts()

intent
chest pain              204
digestive issues        204
dizziness               204
feeling cold/hot        204
foot pain               204
general weakness        204
hair and skin issues    204
head ache               204
injury                  204
internal pain           204
mental health           204
muscle pain             204
respiratory issue       204
sensory issue           204
Name: count, dtype: int64

#### Feature Extraction

In [19]:
def extract_mfcc(filename):
    y, sr = librosa.load(filename, duration=8, offset=0.5)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    return mfcc

In [20]:
extract_mfcc(train_df['filename'][484])

array([-3.2453796e+02,  1.5181155e+02, -8.3627424e+00,  1.8127216e+01,
        3.1863151e+00,  5.0795736e+00,  6.8731689e+00,  1.6032633e+00,
       -1.0545517e+00,  6.7843323e+00, -9.2558460e+00,  8.7427855e+00,
       -1.4072841e+00,  5.8018255e+00,  8.2681856e+00,  2.7569058e+00,
        8.4719521e-01,  7.9460282e+00, -4.3127213e+00,  5.1337667e+00,
       -1.7130418e-01,  7.5766139e+00, -3.5493451e-01,  7.7652445e+00,
        1.2889872e+00,  6.3668771e+00, -6.8194902e-01,  5.6029043e+00,
       -1.7059224e+00,  2.7381787e+00,  2.3384645e+00,  2.0479071e+00,
       -6.3767356e-01,  1.7218281e+00, -2.0435250e+00,  1.1490377e+00,
       -1.4560053e+00,  2.2223242e-01, -1.5129106e+00,  1.5885150e+00],
      dtype=float32)

In [21]:
X_mfcc = train_df['filename'].apply(lambda x: extract_mfcc(x))
X_mfcc

0       [-297.50266, 51.646576, 1.9694555, 12.192121, ...
1       [-522.0002, 106.7206, -14.634256, 34.310047, -...
2       [-264.29233, 131.25223, -3.0252275, 12.848872,...
3       [-306.3366, 123.03135, -12.331156, 21.2125, -1...
4       [-461.34543, 195.18999, 27.279205, -0.02898660...
                              ...                        
2851    [-336.18207, 101.09681, -51.51952, 29.08254, -...
2852    [-307.5957, 61.80122, 12.749242, 14.058215, 3....
2853    [-353.21185, 130.57845, -8.485679, 13.764009, ...
2854    [-479.44928, 124.76472, 28.798885, 3.1377325, ...
2855    [-483.18253, 191.72531, 32.272778, 1.9935544, ...
Name: filename, Length: 2856, dtype: object

In [22]:
X = [x for x in X_mfcc]
X = np.array(X)
X.shape

(2856, 40)

In [23]:
## input split
X = np.expand_dims(X, -1)
X.shape

(2856, 40, 1)

In [24]:
enc = OneHotEncoder()
y = enc.fit_transform(train_df[['intent']])

In [25]:
y = y.toarray()
y.shape

(2856, 14)

#### Model

In [40]:
model = Sequential([
    Bidirectional(GRU(256, return_sequences=True), input_shape=(40,1)),  # Bidirectional GRU layer
    Conv1D(128, kernel_size=3, activation='relu'),  # 1D Convolutional layer
    MaxPooling1D(pool_size=2),  # Max pooling layer
    Dropout(0.2),
    GlobalMaxPooling1D(),  # Global max pooling layer
    Dense(128, activation='relu'),  
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(14, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [41]:
# Train the model
history = model.fit(X, y, validation_split=0.2, epochs=50, batch_size=64)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.0963 - loss: 2.6696 - val_accuracy: 0.0000e+00 - val_loss: 3.7159
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0963 - loss: 2.5239 - val_accuracy: 0.0000e+00 - val_loss: 3.8473
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1030 - loss: 2.4985 - val_accuracy: 0.0000e+00 - val_loss: 3.9536
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0891 - loss: 2.4998 - val_accuracy: 0.0000e+00 - val_loss: 4.1383
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0920 - loss: 2.4952 - val_accuracy: 0.0000e+00 - val_loss: 4.6168
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0887 - loss: 2.4801 - val_accuracy: 0.0000e+00 - val_loss: 4.2808
Epoch 7/50

In [None]:
# Save the model and history
model.save('model_audio_classification_1.h5')

# Save history to a file
import json

with open('history_audio_classification_1.json', 'w') as f:
    json.dump(history.history, f)