In [1]:
from google.colab import drive  
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Input, Dense, Bidirectional, Conv2D, Flatten, Dropout, MaxPooling2D, Lambda, Reshape, concatenate, TimeDistributed
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

import os
from PIL import Image
from collections import OrderedDict
import numpy as np

In [3]:
def getImagesTensors(imgFolder):
    i=0
    imgDict=dict()
    files=os.listdir(imgFolder)
    for file in files:
        image = tf.io.read_file(imgFolder+"/"+file)
        image = tf.io.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, (50,50))
        image = tf.cast(image, tf.int32)
        imgDict[file]=image
        i+=1
    return imgDict

In [4]:
def getFinalInputs(d1,d2):
  rez=[]
  for key,val in d1.items():
      rez.append([val,d2[key.split(".")[0]+".mp3"]])
  return rez

In [5]:
trainImagesDictionaryTensors=getImagesTensors('gdrive/MyDrive/spectrograms/train')
testImagesDictionaryTensors=getImagesTensors('gdrive/MyDrive/spectrograms/test')

In [6]:
  def getSentences(filePath,imageDictionaryTensors):
    data = dict()
    i=0
    with open(filePath,'r',encoding='utf8') as file:
        for line in file:
            splittedLine = line.split("\t")
            mp3FileName = splittedLine[1]
            if mp3FileName.split(".")[0]+".jpg" in imageDictionaryTensors:
                sentence = splittedLine[2].split(" ")
                data[mp3FileName]=sentence
    return data

In [7]:
trainSentencesDictionary = getSentences('gdrive/MyDrive/cv-corpus-6.1-2020-12-11/ro/train.tsv',trainImagesDictionaryTensors)
testSentencesDictionary = getSentences('gdrive/MyDrive/cv-corpus-6.1-2020-12-11/ro/dev.tsv',testImagesDictionaryTensors)

In [8]:
trainInputList=getFinalInputs(trainImagesDictionaryTensors,trainSentencesDictionary)
testInputList=getFinalInputs(testImagesDictionaryTensors,testSentencesDictionary)

In [9]:
def getTokenizedSentences(trainInputList,tokenizer,dataType):
    if dataType=="train":
        dataString = [value[1] for value in trainInputList]
        tokenizer.fit_on_texts(dataString)
        sequences = tokenizer.texts_to_sequences(dataString)
        vocabularySize = len(tokenizer.word_index) + 1
        sequences=pad_sequences(sequences, maxlen=vocabularySize)
        rez = np.array([np.array([0 for _ in range(vocabularySize)]) for _ in range(len(dataString))])
        for i in range(len(sequences)):
          for j in range(len(sequences[i])):
            if sequences[i][j]!=rez[i][j]:
                rez[i][sequences[i][j]-1]=1
        return rez,vocabularySize
    elif dataType=="test":
        dataString = [value[1] for value in trainInputList]
        sequences = tokenizer.texts_to_sequences(dataString)
        vocabularySize = len(tokenizer.word_index) + 1
        sequences=pad_sequences(sequences, maxlen=vocabularySize)
        rez = np.array([np.array([0 for _ in range(vocabularySize)]) for _ in range(len(dataString))])
        for i in range(len(sequences)):
          for j in range(len(sequences[i])):
            if sequences[i][j]!=rez[i][j]:
                rez[i][sequences[i][j]-1]=1
        return rez

In [10]:
tokenizer = Tokenizer(num_words=6586, lower=True, oov_token="<UGACHAKA>")
tokenizedTrainSentences,vocabularySize = getTokenizedSentences(trainInputList,tokenizer,"train")
tokenizedTestSentences = getTokenizedSentences(testInputList,tokenizer,"test")
print(len(tokenizedTrainSentences))
print(len(tokenizedTestSentences))
imageHeight,imageWidth,channels=50,50,3

3399
858


In [None]:
####################AUTOENCODER####################

autoencoder=tf.keras.Sequential([
                                 tf.keras.layers.InputLayer(input_shape=vocabularySize),
                                 tf.keras.layers.Dense(1024,activation='relu'),
                                 tf.keras.layers.Dense(512,activation='relu'),
                                 tf.keras.layers.Dense(256,activation='relu'),
                                 tf.keras.layers.Dense(128,activation='relu'),
                                 tf.keras.layers.Dense(64,activation='relu'),
                                 tf.keras.layers.Dense(32,activation='relu'),
                                 tf.keras.layers.Dense(16,activation='relu'),
                                 tf.keras.layers.Dense(4,activation='relu'),
                                 tf.keras.layers.Dense(16,activation='relu'),
                                 tf.keras.layers.Dense(32,activation='relu'),
                                 tf.keras.layers.Dense(64,activation='relu'),
                                 tf.keras.layers.Dense(128,activation='relu'),
                                 tf.keras.layers.Dense(256,activation='relu'),
                                 tf.keras.layers.Dense(512,activation='relu'),
                                 tf.keras.layers.Dense(1024,activation='relu'),
                                 tf.keras.layers.Dense(vocabularySize,activation='sigmoid'),
])
autoencoder.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer='adam')

In [11]:
autoencoder=tf.keras.models.load_model('/content/gdrive/MyDrive/autoencoder/second/')
autoencoder.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 1024)              6745088   
_________________________________________________________________
dense_17 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_18 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_19 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_20 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_21 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_22 (Dense)             (None, 16)               

In [None]:
for i in range(10):
    print(i)  
    autoencoder.fit(x=tokenizedTrainSentences,y=tokenizedTrainSentences,validation_data=(tokenizedTestSentences,tokenizedTestSentences),batch_size=34,shuffle=True,epochs=200)
    autoencoder.save('/content/gdrive/MyDrive/autoencoder/second/')

In [12]:
def computeAccuracy(model,testInputs,testOutputs):
    corectSentences = 0
    corectOnes = 0
    corectZeros = 0
    numOfOnes = 0
    numOfZeros = 0
    for testInput,testOutput in zip(testInputs,testOutputs):
        prediction=model.predict(np.expand_dims(testInput,axis=0))
        prediction = convert(prediction)
        if np.array_equal(prediction, np.array(testOutput)):
            corectSentences+=1
        else:
            for realValue,predictedValue in zip(prediction,testOutput):
                if realValue == 1:
                    numOfOnes+=1
                    if predictedValue == 1:
                        corectOnes+=1
                if realValue == 0:
                    numOfZeros+=1
                    if predictedValue == 0:
                        corectZeros+=1
    if corectSentences == len(testInputs):
        return 1,1,1
    return corectSentences / len(testInputs), corectOnes / numOfOnes, corectZeros / numOfZeros

In [15]:
corectSentences,corectOnes,corectZeros = computeAccuracy(model=autoencoder, testInputs=tokenizedTrainSentences, testOutputs=tokenizedTrainSentences)
print("Test accuracy for AutoEncoder (sentences): " + str(corectSentences))
print("Test accuracy for AutoEncoder (ones): " + str(corectOnes))
print("Test accuracy for AutoEncoder (zeros): " + str(corectZeros))

Test accuracy for AutoEncoder (sentences): 0.9355692850838482
Test accuracy for AutoEncoder (ones): 0.8435754189944135
Test accuracy for AutoEncoder (zeros): 0.9996523443165235


In [16]:
inputLayer = tf.keras.Input(shape=vocabularySize)
x = autoencoder.get_layer(index= 0)(inputLayer)
x = autoencoder.get_layer(index= 1)(x)
x = autoencoder.get_layer(index= 2)(x)
x = autoencoder.get_layer(index= 3)(x)
x = autoencoder.get_layer(index= 4)(x)
x = autoencoder.get_layer(index= 5)(x)
x = autoencoder.get_layer(index= 6)(x)
encoderOutputs = autoencoder.get_layer(index= 7)(x)
encoder= Model(inputs=inputLayer, outputs=encoderOutputs)

decoderInput = tf.keras.Input(shape=4)
i = autoencoder.get_layer(index= 8)(decoderInput)
x = autoencoder.get_layer(index= 9)(i)
x = autoencoder.get_layer(index= 10)(x)
x = autoencoder.get_layer(index= 11)(x)
x = autoencoder.get_layer(index= 12)(x)
x = autoencoder.get_layer(index= 13)(x)
x = autoencoder.get_layer(index= 14)(x)
decoderOutputs = autoencoder.get_layer(index= 15)(x)
decoder= Model(inputs=decoderInput, outputs=decoderOutputs)

encoder.summary()
decoder.summary()





Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6586)]            0         
_________________________________________________________________
dense_16 (Dense)             (None, 1024)              6745088   
_________________________________________________________________
dense_17 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_18 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_19 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_20 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_21 (Dense)             (None, 32)                2080  

In [None]:
def res_identity(x, filters): 
  #renet block where dimension doesnot change.
  #The skip connection is just simple identity conncection
  #we will have 3 blocks and then input will be added

  x_skip = x # this will be used for addition with the residual block 
  f1, f2 = filters

  #first block 
  x = tf.keras.layers.Conv2D(f1, kernel_size=(1, 1), strides=(1, 1), padding='valid', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Activation('relu')(x)

  #second block # bottleneck (but size kept same with padding)
  x = tf.keras.layers.Conv2D(f1, kernel_size=(3, 3), strides=(1, 1), padding='same', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Activation('relu')(x)

  # third block activation used after adding the input
  x = tf.keras.layers.Conv2D(f2, kernel_size=(1, 1), strides=(1, 1), padding='valid', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  x = tf.keras.layers.BatchNormalization()(x)
  # x = Activation(activations.relu)(x)

  # add the input 
  x = tf.keras.layers.Add()([x, x_skip])
  x = tf.keras.layers.Activation('relu')(x)

  return x

In [None]:
def res_conv(x, s, filters):
  '''
  here the input size changes''' 
  x_skip = x
  f1, f2 = filters

  # first block
  x = tf.keras.layers.Conv2D(f1, kernel_size=(1, 1), strides=(s, s), padding='valid', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  # when s = 2 then it is like downsizing the feature map
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Activation('relu')(x)

  # second block
  x = tf.keras.layers.Conv2D(f1, kernel_size=(3, 3), strides=(1, 1), padding='same', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  x = tf.keras.layers.BatchNormalization()(x)
  x = tf.keras.layers.Activation('relu')(x)

  #third block
  x = tf.keras.layers.Conv2D(f2, kernel_size=(1, 1), strides=(1, 1), padding='valid', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
  x = tf.keras.layers.BatchNormalization()(x)

  # shortcut 
  x_skip = tf.keras.layers.Conv2D(f2, kernel_size=(1, 1), strides=(s, s), padding='valid', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x_skip)
  x_skip = tf.keras.layers.BatchNormalization()(x_skip)

  # add 
  x = tf.keras.layers.Add()([x, x_skip])
  x = tf.keras.layers.Activation('relu')(x)

  return x

In [None]:
pictureInputs = Input(shape=(imageHeight, imageWidth, channels))
pictureInputsRescaled = Rescaling(1. / 255)(pictureInputs)
x = tf.keras.layers.ZeroPadding2D(padding=(3, 3))(pictureInputs)

# 1st stage
# here we perform maxpooling, see the figure above

x = Conv2D(64, kernel_size=(7, 7), strides=(2, 2))(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(x)

#2nd stage 
# frm here on only conv block and identity block, no pooling

x = res_conv(x, s=1, filters=(64, 256))
x = res_identity(x, filters=(64, 256))
x = res_identity(x, filters=(64, 256))

# 3rd stage

x = res_conv(x, s=2, filters=(128, 512))
x = res_identity(x, filters=(128, 512))
x = res_identity(x, filters=(128, 512))
x = res_identity(x, filters=(128, 512))

# 4th stage

x = res_conv(x, s=2, filters=(256, 1024))
x = res_identity(x, filters=(256, 1024))
x = res_identity(x, filters=(256, 1024))
x = res_identity(x, filters=(256, 1024))
x = res_identity(x, filters=(256, 1024))
x = res_identity(x, filters=(256, 1024))

# 5th stage

x = res_conv(x, s=2, filters=(512, 2048))
x = res_identity(x, filters=(512, 2048))
x = res_identity(x, filters=(512, 2048))

# ends with average pooling and dense connection

x = tf.keras.layers.AveragePooling2D((2, 2), padding='same')(x)

x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(512, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dense(512, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dense(256, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dense(256, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dense(4, activation='relu', kernel_initializer='he_normal')(x)

# define the model 

model = tf.keras.Model(inputs=pictureInputs, outputs=x, name='Resnet50')


model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.MeanSquaredError())

In [17]:
model = tf.keras.models.load_model('gdrive/MyDrive/models/model1')
model.summary()

Model: "Resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 50, 50, 3)]  0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 56, 56, 3)    0           input_3[0][0]                    
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 25, 25, 64)   9472        zero_padding2d[0][0]             
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 25, 25, 64)   256         conv2d[0][0]                     
___________________________________________________________________________________________

In [18]:
trainInputs=np.array([value[0] for value in trainInputList])
encodedTrainOutputs = encoder.predict(tokenizedTrainSentences)

In [None]:
for _ in range(100):
    model.fit(x=trainInputs, y=encodedTrainOutputs,batch_size=34, epochs=10, shuffle=True)
    model.save('gdrive/MyDrive/models/model1') 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: gdrive/MyDrive/models/model1/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: gdrive/MyDrive/models/model1/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: gdrive/MyDrive/models/model1/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: gdrive/MyDrive/models/model1/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
  8/100 [=>............................] - ETA: 7:26 - loss: 63.5054

In [14]:
def convert(prediction):
    l=[]
    for value in prediction[0]:
        if value > 0.7:
            l.append(1)
        else:
            l.append(0)
    return np.array(l)

In [None]:
def predictAudioToText(spectrogramTensor, model, decoder):
    spectrogramTensor = np.expand_dims(spectrogramTensor,axis=0)
    encodedFeatures = model.predict(spectrogramTensor)
    outputWords = decoder.predict(encodedFeatures)
    return convert(outputWords)

In [None]:
def getSentencesFromPrediction(indexesList,tokenizer):
    sentence=""
    for integer in indexesList:
      for word, index in tokenizer.word_index.items():
          if index == integer:
              sentence+= word + " "
              break
    return sentence

In [None]:
prediction=predictAudioToText(trainImagesDictionaryTensors['common_voice_ro_20789097.jpg'],model,decoder)
indexList = [idx+1 for idx,val in enumerate(prediction) if val == 1]
print(getSentencesFromPrediction(indexList,tokenizer))

să mai două dori aş totuşi, fac remarce. 
