In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU, Dense, Input, Bidirectional
from tensorflow.keras.models import Model
import tensorflow as tf

# Load the dataset
df = pd.read_csv('/kaggle/input/gru-dataset-text/capstone_dataset_hugging_face.csv')

# Prepare the data
X = pd.concat([df['ipa_phoneme'], df['incorrect_ipa_phoneme']], axis=0)
y = pd.concat([pd.Series([0] * len(df)), pd.Series([1] * len(df))], axis=0)

# Tokenize the phoneme sequences
tokenizer = Tokenizer(char_level=True)  # Character-level tokenization
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_tokenized, padding='post', maxlen=128)  # Padding the sequences

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define the model parameters
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size based on tokenized phonemes
embedding_dim = 64  # Dimension of the embedding layer
gru_units = 64  # Number of GRU units
sequence_length = X_padded.shape[1]  # Length of input sequences

# Build the GRU-based model
inputs = Input(shape=(sequence_length,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
gru = Bidirectional(GRU(gru_units, return_sequences=False))(embedding)  # Bidirectional GRU
dense = Dense(64, activation='relu')(gru)  # Dense layer
output = Dense(1, activation='sigmoid')(dense)  # Output layer for binary classification

# Define the model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Save the model
model.save('/kaggle/working/gru_phoneme_classification_model.h5')




Epoch 1/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 15ms/step - accuracy: 0.7475 - loss: 0.4819 - val_accuracy: 0.8659 - val_loss: 0.3205
Epoch 2/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 15ms/step - accuracy: 0.8714 - loss: 0.3085 - val_accuracy: 0.8809 - val_loss: 0.2912
Epoch 3/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 15ms/step - accuracy: 0.8830 - loss: 0.2864 - val_accuracy: 0.8850 - val_loss: 0.2826
Epoch 4/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 15ms/step - accuracy: 0.8892 - loss: 0.2742 - val_accuracy: 0.8883 - val_loss: 0.2760
Epoch 5/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 15ms/step - accuracy: 0.8928 - loss: 0.2668 - val_accuracy: 0.8934 - val_loss: 0.2677
Epoch 6/10
[1m13747/13747[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 15ms/step - accuracy: 0.8942 - loss: 0.2619 - val_accuracy: 0.8916 - val

In [13]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the saved model
model = load_model('/kaggle/working/gru_phoneme_classification_model.h5')

# Load the same tokenizer used during training
# Assuming the tokenizer is saved or can be recreated using the training data
# You may need to fit the tokenizer on the same vocabulary as the training data
tokenizer = Tokenizer(char_level=True)  # Character-level tokenization as before
X = pd.concat([df['ipa_phoneme'], df['incorrect_ipa_phoneme']], axis=0)  # Use the dataset's phonemes
tokenizer.fit_on_texts(X)  # Fit on the training data

# Define a function to predict if a list of phonemes is correct or mispronounced
def predict_phoneme_sequence(phoneme_list):
    # Tokenize and pad the input phoneme list
    phoneme_tokenized = tokenizer.texts_to_sequences(phoneme_list)
    phoneme_padded = pad_sequences(phoneme_tokenized, padding='post', maxlen=128)
    
    # Predict using the loaded model
    predictions = model.predict(phoneme_padded)
    
    # Interpret the predictions: if prediction >= 0.5, it's considered mispronounced
    results = ['Correct' if pred < 0.5 else 'Mispronounced' for pred in predictions]
    
    return results

# Example usage with a list of phoneme sequences (in IPA format)
phoneme_list = ['fˈɔːɹɪsʌt', 'fˈɔːɹɪst', 'kˈælkjʊlˌeɪɾɚ','dʒɛnər','sˈæŋkːtjuːˌɛɹi','ɛkˈsprɛsoʊ','ɛk ˈsɛtərə','mɛˈænɪkwˌɪn','kˈæʃ','bʊˈkɛt']  # Example phoneme sequences
results = predict_phoneme_sequence(phoneme_list)

# Print the results
for i, phoneme in enumerate(phoneme_list):
    print(f'Phoneme sequence: {phoneme} - Prediction: {results[i]}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
Phoneme sequence: fˈɔːɹɪsʌt - Prediction: Mispronounced
Phoneme sequence: fˈɔːɹɪst - Prediction: Correct
Phoneme sequence: kˈælkjʊlˌeɪɾɚ - Prediction: Correct
Phoneme sequence: dʒɛnər - Prediction: Mispronounced
Phoneme sequence: sˈæŋkːtjuːˌɛɹi - Prediction: Mispronounced
Phoneme sequence: ɛkˈsprɛsoʊ - Prediction: Mispronounced
Phoneme sequence: ɛk ˈsɛtərə - Prediction: Mispronounced
Phoneme sequence: mɛˈænɪkwˌɪn - Prediction: Mispronounced
Phoneme sequence: kˈæʃ - Prediction: Correct
Phoneme sequence: bʊˈkɛt - Prediction: Mispronounced
