In [1]:
#Import libraries
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import metrics, svm, naive_bayes
from PIL import Image

In [2]:
#Function to extract labels from files
def extract_labels(file_path):
    """Read the four integer label columns from a whitespace-separated label file.

    Every non-blank line is split on whitespace and fields 1 to 4 (0-based)
    are parsed as integers. Field 0 is not read by this function. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising an IndexError.

    Args:
        file_path: Path of the label file to read.

    Returns:
        A tuple of four 1-D NumPy arrays with one entry per non-blank line,
        in file order: (shape labels, position labels, phoneme labels,
        second phoneme labels).

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
        ValueError: If one of fields 1 to 4 is not an integer literal.
    """
    shape_labels = []
    position_labels = []
    phoneme_labels = []
    second_phoneme_labels = []

    with open(file_path) as file_labels:
        for line in file_labels:
            fields = line.split()
            if not fields:
                #Nothing to parse on a blank line
                continue

            shape_labels.append(int(fields[1]))
            position_labels.append(int(fields[2]))
            phoneme_labels.append(int(fields[3]))
            second_phoneme_labels.append(int(fields[4]))

    return (
        np.array(shape_labels),
        np.array(position_labels),
        np.array(phoneme_labels),
        np.array(second_phoneme_labels),
    )

In [3]:
#Function to randomly shuffle data
def shuffle_data(images, shape_labels, position_labels, phoneme_labels, second_phoneme_labels, seed=None):
    """Shuffle the images and all four label arrays with one shared permutation.

    A single random permutation of range(len(phoneme_labels)) is drawn and
    used to index every input, so entry i of each returned array still
    belongs to the same sample.

    Args:
        images: Array of images, indexable with an integer index array.
        shape_labels: Array of shape labels.
        position_labels: Array of position labels.
        phoneme_labels: Array of phoneme labels; its length sets the
            permutation size.
        second_phoneme_labels: Array of second phoneme labels.
        seed: Optional seed for a reproducible shuffle. When None (the
            default) the global NumPy random state is used.

    Returns:
        A tuple (images, shape labels, position labels, phoneme labels,
        second phoneme labels) in shuffled order.
    """
    sample_count = len(phoneme_labels)

    #A private generator keeps a seeded shuffle from touching the global random state
    rng = np.random if seed is None else np.random.RandomState(seed)
    index = rng.choice(np.arange(sample_count), replace = False, size = sample_count)

    return (
        images[index],
        shape_labels[index],
        position_labels[index],
        phoneme_labels[index],
        second_phoneme_labels[index],
    )

In [4]:
#Function to split data
def split_data(data, split_ratio=0.9):
    """Cut a sequence into a leading training part and a trailing test part.

    The cut index is round(len(data) * split_ratio), using Python's
    built-in round.

    Args:
        data: Sliceable sequence (for example a list or NumPy array).
        split_ratio: Fraction of the entries placed in the training part.

    Returns:
        A (train, test) pair: data[:cut] and data[cut:].
    """
    cut = round(split_ratio * len(data))
    return data[:cut], data[cut:]

In [5]:
#Function to prepare dataset to input network
def create_dataset(image_dir):
    """Load every file in a directory as an image and stack them into one array.

    Files are read in sorted file-name order. os.listdir returns entries in
    arbitrary order, so sorting makes the result deterministic across runs
    and platforms.
    NOTE(review): this assumes the rows of the matching label file follow the
    same sorted file-name order - confirm against the label files.

    Args:
        image_dir: Directory containing the image files. A trailing path
            separator is accepted but no longer required.

    Returns:
        NumPy array built from the list of per-image arrays, in sorted
        file-name order.
    """
    dataset = []
    for image_name in sorted(os.listdir(image_dir)):
        #The context manager closes the underlying file once the pixels are copied out
        with Image.open(os.path.join(image_dir, image_name)) as image:
            dataset.append(np.asarray(image))
    return np.array(dataset)

In [6]:
#Function to concatenate outputs of two networks into single vector
def join_outputs(out1, out2):
    """Concatenate two sequences of vectors row by row.

    For every index i in range(len(out1)), out1[i] and out2[i] are joined
    end to end with np.concatenate.

    Args:
        out1: Sequence of 1-D vectors; its length sets the number of rows.
        out2: Sequence of 1-D vectors, indexed with the same row indices.

    Returns:
        NumPy array whose row i is out1[i] followed by out2[i].
    """
    return np.array([
        np.concatenate((out1[row], out2[row]))
        for row in range(len(out1))
    ])

In [7]:
#Function to plot training and validation loss and accuracy
#Obtained from https://www.tensorflow.org/tutorials/images/classification
def plot_training_results(history, epochs = None):
    """Plot training and validation loss and accuracy side by side.

    Args:
        history: Object whose `history` attribute is a dict holding the
            per-epoch lists 'loss', 'val_loss', 'accuracy' and
            'val_accuracy'. All four keys must be present.
        epochs: Number of points on the x axis. When None (the default) it
            is taken from the number of recorded loss values, so the x axis
            always matches the history that was passed in.

    Returns:
        None. The figure is shown with plt.show().
    """
    recorded = history.history

    loss = recorded['loss']
    val_loss = recorded['val_loss']

    acc = recorded['accuracy']
    val_acc = recorded['val_accuracy']

    #A fixed default would make plt.plot fail for any run of a different length
    if epochs is None:
        epochs = len(loss)
    epochs_range = range(epochs)

    plt.figure(figsize=(8, 8))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.show()

In [8]:
#Defining the data augmentation layer that will go in at the start of the networks
#Obtained from https://www.tensorflow.org/tutorials/images/classification
#Augmentation pipeline: random rotation followed by random zoom, both with factor 0.1
data_augmentation = tf.keras.Sequential(
    layers=[
        #tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal", input_shape=(256, 256, 3)),
        tf.keras.layers.experimental.preprocessing.RandomRotation(
            factor=0.1,
            input_shape=(256, 256, 3),
        ),
        tf.keras.layers.experimental.preprocessing.RandomZoom(height_factor=0.1),
    ]
)

In [9]:
#Root folder of the dataset - the only path to edit when the data lives elsewhere
DATA_DIR = '/Users/User/Desktop/MSc Project/English Cued Speech/'

#Image folders (each path keeps its trailing slash)
consonants_dir = DATA_DIR + 'ConsonantImages/'
vowels_dir = DATA_DIR + 'VowelImages/'
combined_dir = DATA_DIR + 'PhonemeImages/'

#Label files
consonant_labels = DATA_DIR + 'Labels_Consonant.txt'
vowel_labels = DATA_DIR + 'Labels_Vowel.txt'
combined_labels = DATA_DIR + 'Labels_All.txt'

In [10]:
#Extract labels
#Vowel label file: the fourth array is bound to not_used
vowel_shape_labels, vowel_position_labels, vowel_phoneme_labels, not_used = extract_labels(
    vowel_labels
)

#Consonant label file: the fourth array is bound to or_this
consonant_shape_labels, consonant_position_labels, consonant_phoneme_labels, or_this = extract_labels(
    consonant_labels
)

#Combined label file: the last two arrays are the consonant and vowel labels
combined_shape_labels, combined_position_labels, combined_consonant_labels, combined_vowel_labels = extract_labels(
    combined_labels
)

In [11]:
#Vowels Model

In [12]:
#Prepare images of vowels to input into network
vowel_images = create_dataset(vowels_dir)
#Force the (count, 256, 256, 3) layout that the networks declare as input_shape
vowel_images = np.reshape(vowel_images, (len(vowel_images), 256, 256, 3))

In [13]:
#Randomly shuffle vowel images and labels
(shuffled_vowels,
 shuffled_vowel_shape_labels,
 shuffled_vowel_position_labels,
 shuffled_vowel_phoneme_labels,
 non_used) = shuffle_data(
    vowel_images,
    vowel_shape_labels,
    vowel_position_labels,
    vowel_phoneme_labels,
    not_used,
)

#Split images and each label array into training and test parts (split_data's default ratio)
vowel_train, vowel_test = split_data(shuffled_vowels)
vowel_shape_labels_train, vowel_shape_labels_test = split_data(
    shuffled_vowel_shape_labels)
vowel_position_labels_train, vowel_position_labels_test = split_data(
    shuffled_vowel_position_labels)
vowel_phoneme_labels_train, vowel_phoneme_labels_test = split_data(
    shuffled_vowel_phoneme_labels)

In [14]:
#Build the network to recognise hand position
hand_position_model = tf.keras.models.Sequential([
    #Scale pixel values by 1/255 inside the model
    tf.keras.layers.experimental.preprocessing.Rescaling(scale=1./255, input_shape=(256, 256, 3)),
    #Two small convolution + max-pooling stages
    tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),
    tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),
    #Classifier head with 5 softmax outputs
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=5, activation='softmax'),
])

#Integer class labels are used, hence the sparse categorical cross-entropy loss
hand_position_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#Train on the vowel training images with the position labels as targets
hand_position_model.fit(x=vowel_train, y=vowel_position_labels_train, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x2107a692108>

In [15]:
#Evaluate the hand position network on the held-out vowel images and print its performance
hand_position_test_loss, hand_position_test_accuracy = hand_position_model.evaluate(
    x=vowel_test,
    y=vowel_position_labels_test,
)
print('Test loss: {}, Test accuracy: {}'.format(
    hand_position_test_loss,
    hand_position_test_accuracy * 100,
))

Test loss: 0.16300436854362488, Test accuracy: 97.39130139350891


In [16]:
#Build network to recognise vowel phoneme
vowel_phoneme_model = tf.keras.models.Sequential([
    #Scale pixel values by 1/255 inside the model
    tf.keras.layers.experimental.preprocessing.Rescaling(scale=1./255, input_shape=(256, 256, 3)),
    #Three convolution + max-pooling stages with 16, 32 and 64 filters
    tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),
    tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),
    tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=2, strides=2),
    tf.keras.layers.Dropout(rate=0.2),
    #Classifier head with 13 softmax outputs
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=13, activation='softmax'),
])

#Compile the network
vowel_phoneme_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#Train on the vowel training images with the phoneme labels as targets
vowel_phoneme_model.fit(x=vowel_train, y=vowel_phoneme_labels_train, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x2101adef688>

In [17]:
#plot_training_results(vowel_phoneme_history)

In [18]:
#Evaluate the vowel phoneme network on the held-out vowel images and print its performance
vowel_phoneme_test_loss, vowel_phoneme_test_accuracy = vowel_phoneme_model.evaluate(
    x=vowel_test,
    y=vowel_phoneme_labels_test,
)
print('Test loss: {}, Test accuracy: {}'.format(
    vowel_phoneme_test_loss,
    vowel_phoneme_test_accuracy * 100,
))

Test loss: 1.1623965501785278, Test accuracy: 53.913044929504395


In [19]:
#Obtain decisions from both networks and concatenate into a single decision vector
#Outputs of both networks on the training images (position network first, then phoneme network)
hand_position_decision, vowel_phoneme_decision = (
    hand_position_model(vowel_train),
    vowel_phoneme_model(vowel_train),
)

#One joint vector per image: position outputs followed by phoneme outputs
vowel_decision = join_outputs(hand_position_decision, vowel_phoneme_decision)

In [20]:
#Obtain test vectors to test network on
#Outputs of both networks on the test images (position network first, then phoneme network)
hand_position_decision_test, vowel_phoneme_decision_test = (
    hand_position_model(vowel_test),
    vowel_phoneme_model(vowel_test),
)

#One joint vector per test image: position outputs followed by phoneme outputs
vowel_decision_test = join_outputs(hand_position_decision_test, vowel_phoneme_decision_test)

In [21]:
#Build network to make final decision of phoneme
vowel_decision_model = tf.keras.models.Sequential([
    #Small dense network: 16 hidden units, 13 softmax outputs
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=13, activation='softmax'),
])

#Compile network
vowel_decision_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

#Train on the joint decision vectors with the phoneme labels as targets
vowel_decision_model.fit(x=vowel_decision, y=vowel_phoneme_labels_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2101cc31188>

In [22]:
#Test network and print result
vowel_test_loss, vowel_test_accuracy = vowel_decision_model.evaluate(
    x=vowel_decision_test,
    y=vowel_phoneme_labels_test,
)
print('Test loss: {}, Test accuracy: {}'.format(
    vowel_test_loss,
    vowel_test_accuracy * 100,
))

Test loss: 1.0235064029693604, Test accuracy: 57.3913037776947


In [23]:
#Build SVM classifier for vowel
#Linear-kernel SVM fitted on the joint decision vectors of the training images
vowel_svm = svm.SVC(kernel='linear')

vowel_svm.fit(X=vowel_decision, y=vowel_phoneme_labels_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
#Test SVM classifier and print accuracy
#Predicted phoneme label for every test decision vector
svm_vowel_decision = vowel_svm.predict(X=vowel_decision_test)

#Accuracy is printed as a percentage
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=vowel_phoneme_labels_test, y_pred=svm_vowel_decision)*100,
)

Accuracy: 54.78260869565217


In [25]:
#Build Naive-Bayes classifier for vowel
#Gaussian Naive-Bayes fitted on the joint decision vectors of the training images
vowel_nb = naive_bayes.GaussianNB()

vowel_nb.fit(X=vowel_decision, y=vowel_phoneme_labels_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [26]:
#Test Naive-Bayes classifier and print accuracy
#Predicted phoneme label for every test decision vector
nb_vowel_decision = vowel_nb.predict(X=vowel_decision_test)

#Accuracy is printed as a percentage
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=vowel_phoneme_labels_test, y_pred=nb_vowel_decision)*100,
)

Accuracy: 46.95652173913044
