In [1]:
%load_ext autoreload

In [None]:
"""
Subjects articulate 38 distinct phonemes that span the entire English-language phonetic space, in both 'silent' and 'audible' manners.

The 38 phonemes are listed below (the class label is given in parentheses beside each phoneme):

Bilabial consonants: Baa (0), Paa (1), Maa (2)
Labiodental consonants: Faa (3), Vaa (4)
Dental consonants: Thaa (5), Dhaa (6)
Alveolar consonants: Taa (7), Daa (8), Naa (9), Saa (10), Zaa (11)
Post-alveolar consonants: Chaa (12), Shaa (13), Jhaa (14), Zhaa (15)
Velar consonants: Kaa (16), Gaa (17), NGaa (18)
Approximant consonants: Yaa (19), Raa (20), Laa (21), Waa (22)
Vowels:
OY as in bOY (23), OW as in nOW (24),
AO as in OUght (25), AA as in fAther (26),
AE as in At (27), EH as in mEt (28),
EY as in mAte (29), IY as in mEET (30),
IH as in It (31), AH as in HUt (32),
UW as in fOOD (33), ER as in hER (34),
UH as in hOOD (35)

DATA is given in a numpy array of dimensions (380, 22, 7500) - (38 phonemes each repeated 10 times, 22 channels, 7500 time samples).
The raw data were filtered using a third-order Butterworth bandpass filter between 80 and 1000 Hz.
"""

In [2]:
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [3]:
from manifoldRnn import spdNN
from manifoldRnn import optimizers 
from manifoldRnn import trainTest
from manifoldRnn import spdRnn

In [4]:
class BaseDataset(Dataset):
    """Map-style dataset that pairs each sample with its label.

    `data` and `labels` are stored exactly as given and indexed in parallel.
    `data[index]` must expose `.astype` (e.g. a NumPy array), because every
    sample is cast to float32 when it is fetched.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The length is taken from `data` alone; `labels` is not checked against it.
        return len(self.data)

    def __getitem__(self, index):
        # Cast the sample on access; the label is handed back untouched.
        sample = self.data[index].astype('float32')
        target = self.labels[index]
        return sample, target

In [5]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs
# on machines without a usable GPU (moving the model to a missing CUDA device fails).
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [6]:
# Dataset geometry: 38 phonemes, each repeated 10 times, recorded on 22 channels
# with 7500 time samples per trial.
numberPhonemes, trialsPerPhoneme = 38, 10
numberChannels, windowLength = 22, 7500
# Total number of trials (one per phoneme repetition).
numberTrials = trialsPerPhoneme * numberPhonemes

In [7]:
# Which recording to analyse: subject index and articulation manner.
# Both are used to build the data file name.
subjectNumber, articulationManner = 1, "Voiced"
# Subject tag, e.g. "Subject1".
subject = "Subject" + str(subjectNumber)

In [8]:
# Load the recording for the selected articulation manner and subject.
DATA = np.load("Experiment1/Phoneme/" + articulationManner + subject + ".npy")

# Z-score along the last axis (time, per the data description at the top of the notebook).
# The 1e-5 term keeps the division finite when a signal is flat.
mean = DATA.mean(axis = -1)
std = DATA.std(axis = -1)
DATA = (DATA - mean[..., np.newaxis])/(std[..., np.newaxis] + 1e-5)

phonemeMatrices = DATA
# Trials are assumed to be stored phoneme by phoneme: label p covers trials
# p * trialsPerPhoneme ... (p + 1) * trialsPerPhoneme - 1.
labelsByPhonemes = np.repeat(np.arange(numberPhonemes), trialsPerPhoneme)

# Indices[p] lists the trial numbers whose label is phoneme p.
Indices = {phoneme: [] for phoneme in range(numberPhonemes)}
for trial, label in enumerate(labelsByPhonemes):
    Indices[label].append(trial)

In [9]:
# --- Sliding-window second-moment features ---------------------------------
# Each trial is cut into overlapping windows of `covarianceWindow` samples taken
# every `covarianceStride` samples. One channel-by-channel matrix (1/N) * X @ X.T
# is computed per window.
covarianceWindow = 750
covarianceStride = 150
# Number of full windows that fit in one trial (46 for windowLength = 7500).
numberWindows = (windowLength - covarianceWindow) // covarianceStride + 1

# Slicing past the end of an array silently truncates, so make sure every
# window is really `covarianceWindow` samples long.
assert phonemeMatrices.shape[-1] >= windowLength, "trials are shorter than windowLength"

slicedMatrices = np.zeros((numberTrials, numberWindows, numberChannels, numberChannels))
for j in range(numberTrials):
    for i in range(numberWindows):
        start = i * covarianceStride
        End = start + covarianceWindow
        segment = phonemeMatrices[j, :, start:End]
        slicedMatrices[j, i] = 1 / covarianceWindow * segment @ segment.T

# Regroup by phoneme: covariancesLabels[p, r] holds all windows of the r-th
# repetition of phoneme p.
covariancesLabels = np.zeros((numberPhonemes, trialsPerPhoneme, numberWindows, numberChannels, numberChannels))
for i in range(numberPhonemes):
    for j in range(trialsPerPhoneme):
        covariancesLabels[i, j] = slicedMatrices[Indices[i][j]]

# --- Fixed train/test split -------------------------------------------------
# For every phoneme, repetitions 0-2 and 5-7 are used for training and
# repetitions 3-4 and 8-9 for testing.
trainTrials = [0, 1, 2, 5, 6, 7]
testTrials = [3, 4, 8, 9]
assert sorted(trainTrials + testTrials) == list(range(trialsPerPhoneme)), \
    "train/test repetition indices must cover every repetition exactly once"


def _flattenSplit(trials):
    """Return (features, labels) for the given repetition indices.

    Features have shape (numberPhonemes * len(trials), numberWindows,
    numberChannels, numberChannels), ordered phoneme by phoneme and, within a
    phoneme, in the order given by `trials`. Labels are the phoneme ids stored
    as float64, matching the dtype of the np.zeros buffers used previously.
    NOTE(review): presumably trainTest casts them to integer class ids - confirm.
    """
    features = covariancesLabels[:, trials].reshape(-1, numberWindows, numberChannels, numberChannels)
    labels = np.repeat(np.arange(numberPhonemes, dtype = np.float64), len(trials))
    return features, labels


trainFeatures, trainLabels = _flattenSplit(trainTrials)
testFeatures, testLabels = _flattenSplit(testTrials)

# --- Datasets and loaders ---------------------------------------------------
# Only the training loader is shuffled; the test loader keeps a fixed order.
trainDataset = BaseDataset(trainFeatures, trainLabels)
testDataset = BaseDataset(testFeatures, testLabels)
trainDataloader = DataLoader(trainDataset, batch_size = 32, shuffle = True)
testDataloader = DataLoader(testDataset, batch_size = 32, shuffle = False)

In [10]:
# Number of passes over the training set.
numberEpochs = 150

# Instantiate the network from the project-local spdRnn module (constructed with
# the number of phoneme classes) and move it to the selected device.
model = spdRnn.spdRnnNet(numberPhonemes).to(device)

# Count and report the trainable scalar parameters.
numParams = 0
for parameter in model.parameters():
    if parameter.requires_grad:
        numParams += parameter.numel()
print(numParams)

lossFunction = nn.CrossEntropyLoss()
# Two optimizers, one per sub-network: model.CNN is updated by the project's
# StiefelOptim, model.RNN by Adam with weight decay.
# NOTE(review): StiefelOptim presumably keeps the weights on the Stiefel manifold -
# confirm in manifoldRnn.optimizers.
cnnOptimizer = optimizers.StiefelOptim(model.CNN.parameters(), lr = 0.05)
rnnOptimizer = optim.Adam(model.RNN.parameters(), lr = 0.001, weight_decay = 1e-3)

146796


In [11]:
# Train for numberEpochs epochs, evaluating on the test loader after every epoch
# and keeping the highest test accuracy seen so far in maxValue.
# NOTE(review): the best epoch is picked on the test set itself, so maxValue is an
# optimistic estimate rather than a held-out score.
maxValue = 0
for epoch in range(numberEpochs):
    trainLoss, trainAccuracy = trainTest.trainOperation(
        model, device, trainDataloader, cnnOptimizer, rnnOptimizer, lossFunction
    )
    testLoss, testAccuracy = trainTest.testOperation(
        model, device, testDataloader, lossFunction
    )
    maxValue = max(maxValue, testAccuracy)
    print(f'Epoch: {epoch + 1}/{numberEpochs}, Training loss: {trainLoss:.4f}, Training accuracy: {trainAccuracy:.2f}%, Test loss: {testLoss:.4f}, Test accuracy: {testAccuracy:.2f}%')
print(maxValue)

Epoch: 1/150, Training loss: 0.1297, Training accuracy: 1.75%, Test loss: 0.1189, Test accuracy: 2.63%
Epoch: 2/150, Training loss: 0.1276, Training accuracy: 2.63%, Test loss: 0.1184, Test accuracy: 2.63%
Epoch: 3/150, Training loss: 0.1264, Training accuracy: 4.82%, Test loss: 0.1176, Test accuracy: 5.92%
Epoch: 4/150, Training loss: 0.1250, Training accuracy: 4.82%, Test loss: 0.1162, Test accuracy: 5.26%
Epoch: 5/150, Training loss: 0.1223, Training accuracy: 7.02%, Test loss: 0.1093, Test accuracy: 5.26%
Epoch: 6/150, Training loss: 0.1151, Training accuracy: 5.26%, Test loss: 0.1034, Test accuracy: 12.50%
Epoch: 7/150, Training loss: 0.1109, Training accuracy: 11.40%, Test loss: 0.1046, Test accuracy: 7.24%
Epoch: 8/150, Training loss: 0.1110, Training accuracy: 5.70%, Test loss: 0.1013, Test accuracy: 15.13%
Epoch: 9/150, Training loss: 0.1104, Training accuracy: 12.28%, Test loss: 0.1001, Test accuracy: 15.79%
Epoch: 10/150, Training loss: 0.1030, Training accuracy: 14.04%, Tes

In [12]:
# Show the best test accuracy (in percent) reached during the training loop above.
print(maxValue)

65.13157894736842


In [13]:
%autoreload