# Converting text captions to audio files for cat images

In [None]:
# Downloading zip file for dataset
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 

In [None]:
# Unzipping downloaded file
!unzip annotations_trainval2017.zip -d annotations_trainval2017

In [None]:
# Installing and importing required libraries
!pip install pycocotools
# Installing google text-to-speech
!pip install gtts
%matplotlib inline
import os
import time

from gtts import gTTS  # Google Text-to-Speech client
from IPython.display import Audio, clear_output  # notebook audio player and output clearing
from pycocotools.coco import COCO

In [None]:
# Folder the annotation archive was extracted into, and the dataset split to read
dataDir = 'annotations_trainval2017'
dataType = 'train2017'

# Path of the instance (object category) annotation file for that split
instances_template = '{}/annotations/instances_{}.json'
annFile = instances_template.format(dataDir, dataType)

In [None]:
# Load the instance annotations into a COCO index (used below for category and image lookups)
coco = COCO(annotation_file=annFile)

In [None]:
# List every COCO category name, then the distinct supercategories they belong to.
cats = coco.loadCats(coco.getCatIds())
nms = [cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))

# A set of strings iterates in a different order from one kernel session to the next
# (string hashing is randomized), so the names are sorted for printing to keep the
# displayed output reproducible. `nms` itself is still the set, as before.
nms = set([cat['supercategory'] for cat in cats])
print('COCO supercategories: \n{}'.format(' '.join(sorted(nms))))

In [None]:
# Gather the images that contain any of the categories of interest (only 'cat' here).
our_categories = ['cat']
unique_img_ids = set()
for cat_name in our_categories:
    # category name -> category ids -> ids of the images showing that category
    catIds = coco.getCatIds(catNms=cat_name)
    unique_img_ids.update(coco.getImgIds(catIds=catIds))

# De-duplicated ids, followed by the full image records for them
imgIds = list(unique_img_ids)
imgs = coco.loadImgs(imgIds)

In [None]:
# Build a second COCO index, this time over the caption annotations of the same split.
# annFile is re-pointed at the captions file from here on.
captions_template = '{}/annotations/captions_{}.json'
annFile = captions_template.format(dataDir, dataType)
coco_caps = COCO(annotation_file=annFile)

In [None]:
# Making directory to store audio files
!mkdir DLProjSpeechAnnotationsCats

In [None]:

# Convert up to five captions per cat image to speech with gTTS and save each clip
# as <image file stem>_<caption index>.wav inside DLProjSpeechAnnotationsCats.
#
# The loop makes one network request per caption and pauses 10 s per image, so a
# full pass takes a long time. Clips that already exist on disk are skipped, which
# lets the cell be re-run after an interruption without repeating finished work.
#
# NOTE(review): gTTS documents `save` as writing MP3 data; the '.wav' suffix is kept
# only so existing file names stay stable. Confirm that downstream readers detect
# the format from the content rather than trusting the extension.
output_dir = './DLProjSpeechAnnotationsCats/'
os.makedirs(output_dir, exist_ok=True)  # do not rely on the mkdir cell having been run

count = 0  # stays defined (as 0) even when imgs is empty
for count, img in enumerate(imgs, start=1):
    annIds = coco_caps.getAnnIds(imgIds=img['id'])
    captions = coco_caps.loadAnns(annIds)[0:5]  # at most five captions per image
    clear_output(wait=True)
    print(count, " / ", len(imgs))

    # splitext removes the extension whatever its length, unlike slicing off 4 characters
    stem = os.path.splitext(img['file_name'])[0]

    # Work out which clips are still missing for this image
    pending = []
    for i, ann in enumerate(captions):
        save_file_name = output_dir + stem + '_' + str(i) + '.wav'
        if not os.path.exists(save_file_name):
            pending.append((ann['caption'], save_file_name))

    if not pending:
        continue  # everything for this image is already on disk: no request, no pause

    time.sleep(10)  # pause before each image's requests (presumably to respect rate limits)
    for caption, save_file_name in pending:
        tts = gTTS(caption, slow=True)  # slow=True asks for the slower reading speed
        # Write under a temporary name and rename once complete, so an aborted request
        # never leaves a truncated clip that a later run would mistake for a finished one.
        partial_name = save_file_name + '.part'
        tts.save(partial_name)
        os.replace(partial_name, save_file_name)

In [None]:
# Zipping the folder
!zip -r "DLProjSpeechAnnotationsCats.zip" "DLProjSpeechAnnotationsCats" 

In [None]:
# Show a download link for the archive, to avoid crashing the notebook by browsing
# the output directory.
import os
from IPython.display import FileLink

# /kaggle/working only exists on Kaggle; switch to it when it is there and stay in the
# current directory otherwise, instead of failing with FileNotFoundError.
if os.path.isdir(r'/kaggle/working'):
    os.chdir(r'/kaggle/working')

# Link the archive that was actually built above (the cats archive, not a dogs one)
FileLink(r'DLProjSpeechAnnotationsCats.zip')