In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cnn-model-correction/phoneme_correction_model.h5
/kaggle/input/gru-dataset/capstone_dataset_hugging_face.csv


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Dense, Embedding, Add, Multiply, Activation, Lambda, TimeDistributed
import tensorflow as tf
import numpy as np

2024-09-26 10:17:56.751874: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-26 10:17:56.751987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-26 10:17:56.887599: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Read the phoneme dataset from the attached Kaggle input.
file_path = '/kaggle/input/gru-dataset/capstone_dataset_hugging_face.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# The tokenizer vocabulary is built from both columns so that every character
# seen in either the correct or the incorrect transcription gets an index.
all_phonemes = df['ipa_phoneme'].tolist() + df['incorrect_ipa_phoneme'].tolist()

# Character-level tokenization: each character of a transcription is a token.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(all_phonemes)

# Turn each transcription into a list of integer token ids.
correct_sequences = tokenizer.texts_to_sequences(df['ipa_phoneme'])
incorrect_sequences = tokenizer.texts_to_sequences(df['incorrect_ipa_phoneme'])

# Longest tokenized transcription across both columns; everything is padded
# to this common length below.
max_sequence_length = max(map(len, correct_sequences + incorrect_sequences))

# Pad at the end ('post') so real tokens stay at the start of each row.
correct_sequences = pad_sequences(
    correct_sequences,
    maxlen=max_sequence_length,
    padding='post',
)
incorrect_sequences = pad_sequences(
    incorrect_sequences,
    maxlen=max_sequence_length,
    padding='post',
)

In [2]:


# Hold out 20% of the (incorrect, correct) pairs for validation; the fixed
# random_state keeps the split reproducible.
train_incorrect, val_incorrect, train_correct, val_correct = train_test_split(
    incorrect_sequences,
    correct_sequences,
    test_size=0.2,
    random_state=42,
)

# Decoder inputs are the incorrect sequences shifted right by one position:
# column 0 becomes 0 and the last token of each row is dropped.
train_decoder_input_sequences = np.pad(train_incorrect[:, :-1], ((0, 0), (1, 0)))
val_decoder_input_sequences = np.pad(val_incorrect[:, :-1], ((0, 0), (1, 0)))

# The targets the model is trained to emit are the correct sequences.
train_decoder_target_sequences = train_correct
val_decoder_target_sequences = val_correct

# Define model parameters
embedding_dim = 64
num_tokens = len(tokenizer.word_index) + 1  # +1 for padding token

# Encoder: embed the input token ids and apply a width-3 convolution.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_tokens, embedding_dim)(encoder_inputs)
# NOTE: the layer must be *called* on its input in the same statement.
# Putting `(encoder_embedding)` on its own line leaves `encoder_conv` bound to
# the Conv1D layer object instead of its output tensor, which breaks the
# attention block below.
encoder_conv = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(encoder_embedding)

# Attention mechanism: one tanh score per position, normalised with a softmax
# over axis 1, then used to rescale the encoder features.
attention = Dense(1, activation='tanh')(encoder_conv)
attention = Lambda(lambda x: tf.nn.softmax(x, axis=1))(attention)  # Wrap softmax in a Lambda layer
attention = Multiply()([encoder_conv, attention])

# Decoder: same embedding + convolution stack, applied to the decoder inputs.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_tokens, embedding_dim)(decoder_inputs)
# Same rule as for the encoder: call the Conv1D layer in the same statement.
decoder_conv = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(decoder_embedding)

# Combine encoder output with attention and decoder output at each time step
combined = Add()([attention, decoder_conv])

# TimeDistributed Dense layer to predict a token distribution for each timestep
decoder_dense = TimeDistributed(Dense(num_tokens, activation='softmax'))(combined)

# Seq2Seq Model: two inputs (encoder tokens, decoder tokens), one output.
model = Model([encoder_inputs, decoder_inputs], decoder_dense)

# Configure training: Adam optimizer, integer-label cross-entropy, and
# per-token accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The model takes two inputs: the incorrect sequence and its shifted copy.
train_inputs = [train_incorrect, train_decoder_input_sequences]
val_inputs = [val_incorrect, val_decoder_input_sequences]

# Fit for 50 epochs in batches of 64, reporting validation metrics each epoch.
history = model.fit(
    train_inputs,
    train_decoder_target_sequences,
    validation_data=(val_inputs, val_decoder_target_sequences),
    batch_size=64,
    epochs=50,
)

# Final pass over the validation split; evaluate returns loss then accuracy.
val_loss, val_accuracy = model.evaluate(val_inputs, val_decoder_target_sequences)
print(f"Validation Accuracy: {val_accuracy:.4f}")



Epoch 1/50
[1m  42/3437[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 4ms/step - accuracy: 0.6352 - loss: 2.4883 

I0000 00:00:1723448137.515820     111 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step - accuracy: 0.8563 - loss: 0.5684 - val_accuracy: 0.8826 - val_loss: 0.3434
Epoch 2/50
[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8843 - loss: 0.3354 - val_accuracy: 0.8858 - val_loss: 0.3189
Epoch 3/50
[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8867 - loss: 0.3138 - val_accuracy: 0.8881 - val_loss: 0.3056
Epoch 4/50
[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8886 - loss: 0.3022 - val_accuracy: 0.8884 - val_loss: 0.2993
Epoch 5/50
[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8887 - loss: 0.2977 - val_accuracy: 0.8898 - val_loss: 0.2957
Epoch 6/50
[1m3437/3437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8900 - loss: 0.2932 - val_accuracy: 0.8902 - val_loss: 0.2921
Epoch 7/50
[1m3437/3

In [6]:

# Write the whole trained model to an HDF5 file in the working directory.
model_output_path = 'phoneme_correction_model.h5'
model.save(model_output_path)




In [5]:
# Recreate the network layer by layer, then restore the trained weights into
# it. The hyper-parameters below are the same values the training cell uses.
embedding_dim = 64
num_tokens = len(tokenizer.word_index) + 1

# --- Encoder: token ids -> embeddings -> width-3 convolution ---------------
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(num_tokens, embedding_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_conv_layer = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
encoder_conv = encoder_conv_layer(encoder_embedding)

# --- Attention: tanh score per position, softmax over axis 1, rescale ------
attention_scores = Dense(1, activation='tanh')(encoder_conv)
attention_weights = Lambda(lambda scores: tf.nn.softmax(scores, axis=1))(attention_scores)
attention = Multiply()([encoder_conv, attention_weights])

# --- Decoder: same embedding + convolution stack on the decoder inputs -----
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_tokens, embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_conv_layer = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
decoder_conv = decoder_conv_layer(decoder_embedding)

# Element-wise sum of the attended encoder features and the decoder features.
combined = Add()([attention, decoder_conv])

# Per-timestep softmax over the token vocabulary.
output_projection = TimeDistributed(Dense(num_tokens, activation='softmax'))
decoder_dense = output_projection(combined)

# Two-input, one-output model.
model = Model([encoder_inputs, decoder_inputs], decoder_dense)

# Restore the trained parameters from the uploaded Kaggle dataset.
weights_path = '/kaggle/input/cnn-model-correction/phoneme_correction_model.h5'
model.load_weights(weights_path)


In [6]:


def correct_phoneme_sequences(mispronounced_sequences, model, tokenizer, maxlen):
    """Run the correction model on a batch of phoneme strings.

    The pipeline is: tokenize -> pad to ``maxlen`` -> build the shifted
    decoder input -> predict -> arg-max per timestep -> map ids back to
    characters.

    Args:
        mispronounced_sequences: iterable of phoneme strings to correct.
        model: the two-input Keras model (encoder tokens, decoder tokens).
        tokenizer: the fitted character-level tokenizer used for training.
        maxlen: padded sequence length the model inputs are built with
            (``max_sequence_length`` in this notebook).

    Returns:
        A list with one predicted phoneme string per input, in input order.

    Raises:
        ValueError: if a tokenized input is longer than ``maxlen``. Without
            this check ``pad_sequences`` would silently cut the sequence
            down to ``maxlen`` and the model would see only part of it.
    """
    texts = list(mispronounced_sequences)
    if not texts:
        # Nothing to predict; avoid calling the model on an empty batch.
        return []

    tokenized = tokenizer.texts_to_sequences(texts)
    longest = max(len(token_ids) for token_ids in tokenized)
    if longest > maxlen:
        raise ValueError(
            f"Input of {longest} tokens exceeds the maximum supported length of {maxlen}"
        )
    padded = pad_sequences(tokenized, maxlen=maxlen, padding='post')

    # Decoder input is the padded sequence shifted right by one position,
    # with 0 in the first column (same construction as at training time).
    decoder_input = np.zeros_like(padded)
    decoder_input[:, 1:] = padded[:, :-1]

    # Per-timestep distribution over tokens; keep the most likely id.
    predicted_probabilities = model.predict([padded, decoder_input])
    predicted_token_indices = np.argmax(predicted_probabilities, axis=-1)

    # Reverse mapping from token id to character; id 0 is padding and
    # decodes to the empty string so it vanishes from the output.
    index_to_phoneme = {index: phoneme for phoneme, index in tokenizer.word_index.items()}
    index_to_phoneme[0] = ''

    return [
        ''.join(index_to_phoneme[token] for token in token_sequence)
        for token_sequence in predicted_token_indices
    ]


# Inference example
new_mispronounced_sequence = "fˈɔːɹɪsʌt"  # Replace with actual sequence
predicted_phoneme_sequences = correct_phoneme_sequences(
    [new_mispronounced_sequence], model, tokenizer, max_sequence_length
)

# Print the decoded phoneme sequence
for seq in predicted_phoneme_sequences:
    print(seq)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
fˈɔːɹɪst


I0000 00:00:1724581477.480503     111 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [9]:


# Inference example
new_mispronounced_sequence = "hˈɪpzəpˌɑːɾæməs"  # Replace with actual sequence

# Tokenize the single input and pad it at the end to the training length.
new_mispronounced_sequence_tokenized = tokenizer.texts_to_sequences([new_mispronounced_sequence])
new_mispronounced_sequence_padded = pad_sequences(
    new_mispronounced_sequence_tokenized,
    maxlen=max_sequence_length,
    padding='post',
)

# Decoder input: the padded sequence shifted right by one, 0 in column 0.
decoder_input = np.pad(new_mispronounced_sequence_padded[:, :-1], ((0, 0), (1, 0)))

# Run the model and keep the most probable token id at every timestep.
predicted_phonemes = model.predict([new_mispronounced_sequence_padded, decoder_input])
predicted_token_indices = predicted_phonemes.argmax(axis=-1)

# Token id -> character lookup; id 0 (padding) decodes to the empty string.
index_to_phoneme = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
index_to_phoneme[0] = ''

# Join the decoded characters of each predicted row into one string.
predicted_phoneme_sequences = [
    ''.join(index_to_phoneme[token] for token in token_sequence)
    for token_sequence in predicted_token_indices
]

# Show each decoded sequence on its own line.
for decoded in predicted_phoneme_sequences:
    print(decoded)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
hˈɪpəppˌɑːɾmməs


In [28]:


# Inference example
new_mispronounced_sequence = "ɹˈɪvɪtɛ"  # Replace with actual sequence

# Tokenize the single input and pad it at the end to the training length.
new_mispronounced_sequence_tokenized = tokenizer.texts_to_sequences([new_mispronounced_sequence])
new_mispronounced_sequence_padded = pad_sequences(
    new_mispronounced_sequence_tokenized,
    maxlen=max_sequence_length,
    padding='post',
)

# Decoder input: the padded sequence shifted right by one, 0 in column 0.
decoder_input = np.pad(new_mispronounced_sequence_padded[:, :-1], ((0, 0), (1, 0)))

# Run the model and keep the most probable token id at every timestep.
predicted_phonemes = model.predict([new_mispronounced_sequence_padded, decoder_input])
predicted_token_indices = predicted_phonemes.argmax(axis=-1)

# Token id -> character lookup; id 0 (padding) decodes to the empty string.
index_to_phoneme = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
index_to_phoneme[0] = ''

# Join the decoded characters of each predicted row into one string.
predicted_phoneme_sequences = [
    ''.join(index_to_phoneme[token] for token in token_sequence)
    for token_sequence in predicted_token_indices
]

# Show each decoded sequence on its own line.
for decoded in predicted_phoneme_sequences:
    print(decoded)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
ɹˈɪvɪt


In [16]:


# Inference example
new_mispronounced_sequence = "ˈæpθt"  # Replace with actual sequence

# Tokenize the single input and pad it at the end to the training length.
new_mispronounced_sequence_tokenized = tokenizer.texts_to_sequences([new_mispronounced_sequence])
new_mispronounced_sequence_padded = pad_sequences(
    new_mispronounced_sequence_tokenized,
    maxlen=max_sequence_length,
    padding='post',
)

# Decoder input: the padded sequence shifted right by one, 0 in column 0.
decoder_input = np.pad(new_mispronounced_sequence_padded[:, :-1], ((0, 0), (1, 0)))

# Run the model and keep the most probable token id at every timestep.
predicted_phonemes = model.predict([new_mispronounced_sequence_padded, decoder_input])
predicted_token_indices = predicted_phonemes.argmax(axis=-1)

# Token id -> character lookup; id 0 (padding) decodes to the empty string.
index_to_phoneme = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
index_to_phoneme[0] = ''

# Join the decoded characters of each predicted row into one string.
predicted_phoneme_sequences = [
    ''.join(index_to_phoneme[token] for token in token_sequence)
    for token_sequence in predicted_token_indices
]

# Show each decoded sequence on its own line.
for decoded in predicted_phoneme_sequences:
    print(decoded)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
ˈæpt
