In [2]:
import os
import pandas as pd
from pymongo import MongoClient
from bson.objectid import ObjectId
import json
import audio_metadata
from googleapiclient.discovery import build

##### Prerequisite: download all audio clips before running this notebook.

#### Importing dataset

In [None]:
# Load the CSV export of the last updated audio clips collection.
audio_clips_csv_path = './csvs/modified_audio_clips_xx.csv'
audio_clips_df = pd.read_csv(audio_clips_csv_path, low_memory=False)

In [None]:
# Print a summary of the loaded DataFrame to check that it was read as expected.
audio_clips_df.info()

#### Retrieve audio metadata - duration

In [None]:
def clip_duration(audio_file):
    """Return the duration of ``audio_file`` as a string with three decimal places.

    The file is parsed with ``audio_metadata.load`` and the ``duration`` attribute of
    its ``'streaminfo'`` entry is formatted with the ``.3f`` format spec.

    Args:
        audio_file: Path of the audio file to inspect.

    Returns:
        str: The duration formatted with exactly three digits after the decimal point.
    """
    stream_info = audio_metadata.load(audio_file)['streaminfo']
    return format(stream_info.duration, ".3f")

In [None]:
# Measure the duration of every audio clip listed in audio_clips_df.
# The clips must already be present in the directory named by AUDIO_CLIPS_DIRECTORY_PATH.
EXCEPTIONS_LOG_PATH = './exceptions/audio_clips_duration_exception.txt'

# Read the clips directory once, up front: without it every lookup would fail and every
# duration would silently be recorded as None.
directory = os.environ.get('AUDIO_CLIPS_DIRECTORY_PATH')
if not directory:
    raise RuntimeError('AUDIO_CLIPS_DIRECTORY_PATH environment variable is not set')

# Maps each clip's _id to its duration as a float, or to None when it could not be measured.
audio_clips_duration_dict = {}
for index, audio_clip in audio_clips_df.iterrows():
    file_name = audio_clip['file_name']

    # pandas represents a missing file name as NaN, which is truthy, so test for it explicitly.
    if pd.isna(file_name) or not file_name:
        audio_clips_duration_dict[audio_clip['_id']] = None
        continue

    try:
        # The directory value is expected to end with a path separator (plain concatenation).
        audio_file = directory + file_name
        duration = clip_duration(audio_file)
        audio_clips_duration_dict[audio_clip['_id']] = float(duration)
    except Exception as e:
        # Best effort: record the failing clip (one entry per line, with the reason) and move on.
        os.makedirs(os.path.dirname(EXCEPTIONS_LOG_PATH), exist_ok=True)
        with open(EXCEPTIONS_LOG_PATH, 'a') as audio_clips_duration_exception:
            audio_clips_duration_exception.write("{0}\t{1}\n".format(file_name, e))
        audio_clips_duration_dict[audio_clip['_id']] = None

In [None]:
# Append the collected durations to the dump file as one JSON document per line, so that
# repeated runs stay parseable line by line. The context manager closes the file even if
# the write fails, and the handle no longer reuses the `duration` name.
with open('./youtube_api/audio_clips_duration.txt', "a") as duration_file:
    duration_file.write(json.dumps(audio_clips_duration_dict) + "\n")

### Connection to MongoDB

In [None]:
# Read the connection settings from the environment and fail early when one is missing.
# MongoClient(None) would otherwise fall back to the default local server, and the cells
# below (including a delete) would run against the wrong database.
mongodb_uri = os.environ.get('MONGODB_URI')
database_name = os.environ.get('DATABASE_NAME')

missing_settings = [
    name
    for name, value in (('MONGODB_URI', mongodb_uri), ('DATABASE_NAME', database_name))
    if not value
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_settings)
    )

client = MongoClient(mongodb_uri)
database = client[database_name]

#### Connecting to audio clips collection

In [None]:
# Handle to the last updated audio clips collection.
audio_clips_collection_name = 'modified_audio_clips_xx'
audio_clips = database[audio_clips_collection_name]

#### Update audio clips duration

In [None]:
# Store every collected duration on its audio clip document.
# Each update is attempted on its own so that one invalid id or failed write does not
# silently skip all the clips that come after it.
failed_updates = {}   # clip id -> exception raised while updating it
unmatched_ids = []    # clip ids for which no document was found
for key, value in audio_clips_duration_dict.items():
    try:
        result = audio_clips.update_one(
            {"_id": ObjectId(key)},
            {"$set": {"duration": value}},
        )
        if result.matched_count == 0:
            unmatched_ids.append(key)
    except Exception as e:
        failed_updates[key] = e
        print("Exception: ", key, e)

print(
    "Processed:", len(audio_clips_duration_dict),
    "| unmatched:", len(unmatched_ids),
    "| failed:", len(failed_updates),
)

#### Updating duration with filename - extended.mp3

In [None]:
# Audio clip ids mapped to the duration to store for each of them.
# Keeping the values in a mapping lets a single run update every clip, instead of editing
# the id and duration by hand and re-running the cell once per entry.
EXTENDED_CLIP_DURATIONS = {
    '604b805eb6d11f34349a885c': 2.16,
    '604b805eb6d11f34349a885d': 7.368,
    '604b805eb6d11f34349a885e': 18.168,
    '604b805eb6d11f34349a885f': 2.904,
    '604b805eb6d11f34349a8860': 17.064,
    '604b805eb6d11f34349a8861': 7.44,
    '604b805eb6d11f34349a8862': 0.576,
    '604b805eb6d11f34349a8863': 13.152,
    '604b805eb6d11f34349a8864': 2.04,
    '604b805eb6d11f34349a8865': 4.128,
    '604b805eb6d11f34349a8854': 7.2,
    '604b805eb6d11f34349a8855': 3.336,
    '604b805eb6d11f34349a8856': 4.152,
    '604b805eb6d11f34349a8858': 28.32,
    '604b805eb6d11f34349a8859': 8.016,
    '604b805eb6d11f34349a885a': 3.288,
    '604b805eb6d11f34349a885b': 1.92,
}

for extended_clip_id, extended_clip_duration in EXTENDED_CLIP_DURATIONS.items():
    try:
        extended_result = audio_clips.update_one(
            {"_id": ObjectId(extended_clip_id)},
            {"$set": {"duration": extended_clip_duration}},
        )
        if extended_result.matched_count == 0:
            # The $set is idempotent, so re-running is safe; a miss means the id is absent.
            print("No audio clip found for id", extended_clip_id)
    except Exception as e:
        # Report the failing id and keep going with the remaining clips.
        print(extended_clip_id, e)

#### Removing audio clips with <0.2s duration in audio clips collection

In [None]:
# Delete every audio clip whose stored duration is below 0.2; any error is printed, not raised.
too_short_filter = {"duration": {"$lt": 0.2}}
try:
    audio_clips.delete_many(too_short_filter)
except Exception as delete_error:
    print(delete_error)