In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install librosa soundfile matplotlib




# Audio Feature Extraction

In [7]:
# coding= UTF-8
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

# Return audio features 
def feature_extraction(file_name):
    # Load audio file
    X, sample_rate = librosa.load(file_name, sr=None)  # Original sampling rate
    if X.ndim > 1:
        X = X[:, 0]  # Use only the first channel if stereo
    X = X.T
    
    # Short-Time Fourier Transform (not directly used, but needed for some features)
    stft = np.abs(librosa.stft(X))
    
    # Extract features
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=20).T, axis=0)
    rmse = np.mean(librosa.feature.rms(y=X).T, axis=0)
    spectral_flux = np.mean(librosa.onset.onset_strength(y=X, sr=sample_rate).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=X).T, axis=0)
    
    return mfccs, rmse, spectral_flux, zcr

# Audio parsing: Function makes call for feature extraction and returns array with features and labels 
def parse_audio_files(parent_dir, sub_dirs):
    n_mfccs = 20
    number_of_features = 3 + n_mfccs
    features, labels = np.empty((0, number_of_features)), np.empty(0)

    for label, sub_dir in enumerate(sub_dirs):
        folder_path = os.path.join(parent_dir, sub_dir)
        print(f"\nProcessing folder: {sub_dir} (label: {label})")
        print("Looking in:", folder_path)

        file_list = glob.glob(os.path.join(folder_path, "*"))
        print(f"Found {len(file_list)} files (all types)")

        for file_name in file_list:
            print("Trying file:", file_name)
            try:
                mfccs, rmse, spectral_flux, zcr = feature_extraction(file_name)
                extracted_features = np.hstack([mfccs, rmse, spectral_flux, zcr])
                features = np.vstack([features, extracted_features])
                labels = np.append(labels, label)
            except Exception as e:
                print("[Error] Skipping file —", file_name)
                print("       →", e)
                continue

    return features, labels



# Example usage (edit the paths as needed)
if __name__ == "__main__":
    parent_directory = "/kaggle/input/avalinguo-dataset/audio files"
    sub_directories = ["001 - Low", "002 - Intermediate", "003 - High"]
    features, labels = parse_audio_files(parent_directory, sub_directories)

    print("Feature extraction completed.")
    print("Features shape:", features.shape)
    print("Labels shape:", labels.shape)




Processing folder: 001 - Low (label: 0)
Looking in: /kaggle/input/avalinguo-dataset/audio files/001 - Low
Found 438 files (all types)
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Dana and Konay segment 115 - D
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Victor and Abraham segment 28
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Xoca and Josué segment 58 - J
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Elderly Chinese street cleaner speaks fluent English segment 69 - E
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Dana and Konay segment 9 - D
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Interview with a Filippines Woman segment 32 - W
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Victor and Abraham segment 129
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/A

In [8]:
import os

# Path to the audio dataset
base_path = "/kaggle/input/avalinguo-dataset/audio files/"

# Read audio class directories
audio_subdirectories = [
    name for name in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, name))
]

# Sort directories
audio_subdirectories.sort()
print('Audio Subdirs:', audio_subdirectories)


Audio Subdirs: ['001 - Low', '002 - Intermediate', '003 - High']


In [10]:
##Get features and labels
#This generates two numpy files. One npy file with feature vectors corresponding each audio file. The other with labels.

# Parse Audio Files Function Call
features, labels = parse_audio_files('/kaggle/input/avalinguo-dataset/audio files/', audio_subdirectories) #(parent dir,sub dirs)
np.save('feat.npy', features) 
np.save('label.npy', labels)


Processing folder: 001 - Low (label: 0)
Looking in: /kaggle/input/avalinguo-dataset/audio files/001 - Low
Found 438 files (all types)
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Dana and Konay segment 115 - D
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Victor and Abraham segment 28
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Xoca and Josué segment 58 - J
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Elderly Chinese street cleaner speaks fluent English segment 69 - E
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Dana and Konay segment 9 - D
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Interview with a Filippines Woman segment 32 - W
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/Avalinguo - Victor and Abraham segment 129
Trying file: /kaggle/input/avalinguo-dataset/audio files/001 - Low/A

# Using RNN


In [12]:
# coding= UTF-8
import os
import numpy as np
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense

from keras.layers import Dense
from keras.optimizers import Adam

# Load data 
X = np.load("feat.npy")
y = np.load('label.npy').ravel()

# Fix random seed number
np.random.seed(7)

number_of_features = len(X[1]) 
number_of_classes = 3

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 230)


# Reshape data for LSTM (Samples, Timesteps, Features)
X_train = np.expand_dims(X_train, axis=2) #(280,193,1)
X_test = np.expand_dims(X_test, axis=2)

y_train = keras.utils.to_categorical(y_train - 1, num_classes= number_of_classes) # Converts a class vector (integers) to binary class matrix
y_test = keras.utils.to_categorical(y_test - 1, num_classes= number_of_classes)

# Build RNN Neural Network
print('Build LSTM RNN model ...')
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=X_train.shape[1:]))
model.add(LSTM(32, return_sequences=False))

#model.add(LSTM(16, return_sequences=False))
model.add(Dense(number_of_classes, activation='softmax'))
          
print("Compiling ...")
model.compile(loss='categorical_crossentropy', # for multiple classes
              optimizer='rmsprop', 
              metrics=['accuracy'])

print(model.summary())

print("Training ...")
model.fit(X_train, y_train, batch_size=64, epochs=60)

print("\nValidating ...")
score, accuracy = model.evaluate(X_test, y_test, batch_size=32, verbose=1)
print("Loss:  ", score)
print("Accuracy:  ", accuracy)

Build LSTM RNN model ...


I0000 00:00:1745911674.192797      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745911674.193480      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5
  super().__init__(**kwargs)


Compiling ...


None
Training ...
Epoch 1/60


I0000 00:00:1745911678.012430     117 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.4332 - loss: 1.0477
Epoch 2/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6116 - loss: 0.8568
Epoch 3/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6355 - loss: 0.7425
Epoch 4/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6949 - loss: 0.6709
Epoch 5/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7192 - loss: 0.6193
Epoch 6/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7148 - loss: 0.6056
Epoch 7/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7492 - loss: 0.5615
Epoch 8/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7518 - loss: 0.5682
Epoch 9/60
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [14]:
# Save the model
model.save('/kaggle/working/fluency_model.h5')  # This saves the model in the working directory


# Prediction

In [18]:
import os
import librosa
import numpy as np
from keras.models import load_model
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# Load your trained fluency model
model = load_model('/kaggle/working/fluency_model.h5')

# 1. Feature Extraction Function
def feature_extraction(file_name):
    X, sample_rate = librosa.load(file_name, sr=None)
    
    # Apply high-pass filter to remove low-frequency noise
    X = librosa.effects.preemphasis(X)
    
    if X.ndim > 1:
        X = X[:, 0]  # Use the first channel if stereo
    X = X.T
    
    # Extract features
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=20).T, axis=0)
    rmse = np.mean(librosa.feature.rms(y=X).T, axis=0)
    spectral_flux = np.mean(librosa.onset.onset_strength(y=X, sr=sample_rate).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=X).T, axis=0)

    return mfccs, rmse, spectral_flux, zcr

# 2. Function to extract audio features
def extract_audio_features(uploaded_file):
    try:
        mfccs, rmse, spectral_flux, zcr = feature_extraction(uploaded_file)
        extracted_features = np.hstack([mfccs, rmse, spectral_flux, zcr])
        return extracted_features
    except Exception as e:
        print("[Error] There was an error in feature extraction:", e)
        return None

# 3. Function to make a prediction using the model
def predict_audio(uploaded_file):
    features = extract_audio_features(uploaded_file)
    
    if features is None:
        return None
    
    features = np.expand_dims(features, axis=0)  # Add batch dimension
    features = np.expand_dims(features, axis=2)  # Add time dimension
    
    prediction = model.predict(features)
    return prediction

# 4. Load Pre-trained ASR Model (Wav2Vec2)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# 5. Function to get predicted transcription from Wav2Vec2
def transcribe_audio(file_path):
    audio_input, _ = librosa.load(file_path, sr=16000)  # Ensure sample rate is 16000
    input_values = processor(audio_input, return_tensors="pt").input_values
    logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription.lower()  # Convert transcription to lowercase for comparison

# 6. Path to your uploaded audio file (adjust the path as needed)
audio_path = '/kaggle/input/testfile/'  # Path where the audio file is uploaded
uploaded_file = os.path.join(audio_path, 'Heart.m4a')  # Replace with your actual audio file name

# 7. Predict fluency
prediction = predict_audio(uploaded_file)

if prediction is not None:
    # Get the class label with the highest probability
    predicted_class = np.argmax(prediction)
    fluency_labels = ['Low', 'Intermediate', 'High']
    print(f"Predicted fluency level: {fluency_labels[predicted_class]}")
    
    # If fluency is not high, check the transcription and provide feedback
    if fluency_labels[predicted_class] != "High":
        print("Pronunciation might not be clear. Let's check the transcription...")
        
        # 8. Transcribe the audio and compare with expected word
        predicted_transcription = transcribe_audio(uploaded_file)
        print(f"Predicted transcription from ASR model: {predicted_transcription}")
        
        expected_transcription = "this"  # Example: Replace with the expected word
        if predicted_transcription != expected_transcription:
            print(f"Pronunciation correction suggestion: The correct pronunciation should be '{expected_transcription}'")
        else:
            print("Pronunciation seems correct!")
else:
    print("Feature extraction failed.")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  X, sample_rate = librosa.load(file_name, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 229ms/step
Predicted fluency level: High
