In [None]:
import os

import pandas as pd
from bson.objectid import ObjectId
from pymongo import MongoClient

#### Assumes the audio clips have already been transcribed with the Google Cloud Speech-to-Text API

##### transcriptionResponse.json format
    [
        {
            "audio_filename": 'abc.wav',
            "transcription_response": {
                // api result
            },
            "transcript": 'transcribed audio clip sentence'
        }
    ]

In [None]:
# Load the saved transcription responses (one record per audio clip).
transcription_path = './transcriptionResponse.json'
tr = pd.read_json(path_or_buf=transcription_path)

In [None]:
# How many rows repeat an audio file name that already appeared earlier?
is_repeated_filename = tr['audio_filename'].duplicated()
is_repeated_filename.value_counts()

In [None]:
# Keep only the first row seen for each audio file name.
tr = tr.drop_duplicates(subset=['audio_filename'], keep='first')

In [None]:
def _to_seconds(offset):
    """Turn a time offset such as '1.500s' into a float by dropping the 's' characters."""
    return float(str(offset).replace('s', ""))


# Flatten every transcription response into one row per recognised sentence,
# timed from the start of its first word to the end of its last word.
output_rows = []

for _, row in tr.iterrows():

    audio_filename = row['audio_filename']
    response = row["transcription_response"]

    # A response may lack "results" altogether (or not be a dict at all when the
    # field was absent from the JSON); such clips simply contribute no rows
    # instead of aborting the whole loop with a KeyError.
    results = response.get("results") if isinstance(response, dict) else None
    if not results:
        continue

    # NOTE(review): the last entry of `results` is left out, exactly as before.
    # Presumably it is an aggregate entry rather than a sentence -- confirm,
    # otherwise the final sentence of every clip is being dropped here.
    for result in results[:-1]:

        # Only the first alternative of each result is used.
        alternatives = result.get('alternatives') or [{}]
        alternative = alternatives[0]
        words = alternative.get("words")

        # Empty alternatives, and alternatives without word timings, cannot be
        # placed on the timeline, so they are skipped.
        if not words:
            continue

        output_rows.append({
            "audio_filename": audio_filename,
            "sentence": alternative["transcript"],
            "start_time": _to_seconds(words[0]["startTime"]),
            "end_time": _to_seconds(words[-1]["endTime"]),
        })

# Explicit columns keep the schema stable even when no rows were produced, so
# the following cells can still select 'audio_filename' and the other fields.
transcript_info = pd.DataFrame(
    output_rows,
    columns=["audio_filename", "sentence", "start_time", "end_time"],
)

In [None]:
# Preview the first five extracted sentences.
transcript_info.head(5)

In [None]:
# Total number of extracted sentences.
len(transcript_info.index)

In [None]:
# Number of distinct audio files that produced at least one sentence
# (dropna=False so a missing name is counted, as `unique()` would).
transcript_info['audio_filename'].nunique(dropna=False)

In [None]:
# Work on an independent copy so `transcript_info` itself stays untouched.
transcript_info_dict = transcript_info.copy(deep=True)

In [None]:
# Pack each row's sentence and timing fields into a single dict, stored in a
# new 'transcript' column.
segment_fields = ['sentence', 'start_time', 'end_time']
segment_frame = transcript_info_dict[segment_fields]
transcript_info_dict['transcript'] = segment_frame.apply(lambda record: record.to_dict(), axis=1)

In [None]:
# Preview the frame with the new 'transcript' column.
transcript_info_dict.head(5)

In [None]:
# Collect every sentence dict of an audio file into one list per file.
transcripts_by_file = transcript_info_dict.groupby('audio_filename')['transcript']
grouped_transcript_info = transcripts_by_file.agg(list).reset_index()

In [None]:
# Preview the per-file transcript lists.
grouped_transcript_info.head(5)

In [None]:
# Number of audio files that will receive a transcript.
grouped_transcript_info.shape[0]

### Connection to MongoDB

In [None]:
# Connection settings come from the environment so that no credentials are
# stored in the notebook.
# Indexing os.environ (instead of .get) raises a KeyError naming the missing
# variable. With .get an unset MONGODB_URI becomes None, and MongoClient(None)
# falls back to its default host rather than failing.
mongodb_uri = os.environ['MONGODB_URI']
database_name = os.environ['DATABASE_NAME']

client = MongoClient(mongodb_uri)
database = client[database_name]

#### Update transcript in audio clips collection

In [None]:
# Name of the last updated audio clips collection.
audio_clips_collection_name = 'modified_audio_clips_xx'
audio_clips = database[audio_clips_collection_name]

In [None]:
# Rows to push to MongoDB; the slice currently spans the whole frame.
row_count = len(grouped_transcript_info)
grouped_transcript_info_df = grouped_transcript_info.iloc[:row_count]

In [None]:
def update_transcript(row):
    """Write one audio file's transcript segments into the audio clips collection.

    `row` must provide 'audio_filename' and 'transcript', the latter being an
    iterable of dicts with 'sentence', 'start_time' and 'end_time'. Each segment
    is stored with a freshly generated ObjectId as its '_id'. Only documents
    whose 'file_name' equals the audio file name and whose 'transcript' field
    exists and equals an empty array are updated. Returns None.
    """
    file_name = row['audio_filename']
    segments = row['transcript']

    # Build the array of sub-documents that will replace the empty transcript.
    transcript_documents = []
    for segment in segments:
        transcript_documents.append({
            '_id': ObjectId(),
            'sentence': segment['sentence'],
            'start_time': segment['start_time'],
            'end_time': segment['end_time'],
        })

    empty_transcript_filter = {
        '$and': [
            {'file_name': file_name},
            {'transcript': {'$exists': True, '$eq': []}},
        ]
    }
    set_transcript = {'$set': {'transcript': transcript_documents}}

    audio_clips.update_many(empty_transcript_filter, set_transcript)

# Run the update once per row (i.e. once per audio file) of the DataFrame.
grouped_transcript_info_df.apply(update_transcript, axis='columns')