In [8]:
#!/usr/bin/env python
# coding: utf-8

# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.

import json
import logging
import os
import sys
import time
from datetime import timedelta
from typing import List

import pandas as pd
import requests
import swagger_client as cris_client

# Send all log records (DEBUG and above) to stdout, showing only the message text.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")

In [9]:
# Subscription key and region for the speech service.
# The key is a secret, so it is read from the SPEECH_SUBSCRIPTION_KEY environment
# variable instead of being stored in the notebook. Export it before starting the kernel.
SUBSCRIPTION_KEY = os.environ.get("SPEECH_SUBSCRIPTION_KEY", "")
SERVICE_REGION = "eastus"

if not SUBSCRIPTION_KEY:
    # Warn instead of raising so the remaining configuration can still be inspected;
    # requests to the speech service will not authenticate without a key.
    logging.warning("SPEECH_SUBSCRIPTION_KEY is not set; calls to the speech service will be rejected.")

# Name and description attached to the transcription that gets created
NAME = "Speech_Transcribe"
DESCRIPTION = "Transcribe and Diarize a telephone convevrsation"

# Locale of the recording and the blob URI of the audio to transcribe
LOCALE = "en-US"
RECORDINGS_BLOB_URI = "https://gtsblob.blob.core.windows.net/transcription/Samoy.wav"

# Set subscription information when doing transcription with custom models
ADAPTED_ACOUSTIC_ID = None  # guid of a custom acoustic model
ADAPTED_LANGUAGE_ID = None  # guid of a custom language model

In [10]:
def transcribe():
    """Create a batch transcription of RECORDINGS_BLOB_URI and wait for its result.

    Steps:
      1. Authenticate against the speech service with SUBSCRIPTION_KEY / SERVICE_REGION.
      2. Request deletion of every transcription currently listed for the subscription.
      3. Create a new transcription with word-level timestamps, diarization and
         sentiment enabled.
      4. Poll the transcription list every 5 seconds until the new transcription
         reports "Succeeded" or "Failed".

    Returns:
        The UTF-8 decoded result document of channel 0 (a JSON string) when the
        transcription succeeds, or None when the service reports a failure.

    Raises:
        requests.HTTPError: if the result document cannot be downloaded.
    """
    logging.info("Starting transcription client...")

    # configure API key authorization: subscription_key
    configuration = cris_client.Configuration()
    configuration.api_key['Ocp-Apim-Subscription-Key'] = SUBSCRIPTION_KEY
    configuration.host = "https://{}.cris.ai".format(SERVICE_REGION)

    # create the client object and authenticate
    client = cris_client.ApiClient(configuration)

    # create an instance of the transcription api class
    transcription_api = cris_client.CustomSpeechTranscriptionsApi(api_client=client)

    # get all transcriptions for the subscription
    transcriptions: List[cris_client.Transcription] = transcription_api.get_transcriptions()

    logging.info("Deleting all existing completed transcriptions.")

    # delete all pre-existing completed transcriptions
    # if transcriptions are still running or not started, they will not be deleted
    for transcription in transcriptions:
        try:
            transcription_api.delete_transcription(transcription.id)
        except ValueError:
            # ignore swagger error on empty response message body: https://github.com/swagger-api/swagger-core/issues/2446
            pass

    # Specify transcription properties by passing a dict to the properties parameter. See
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/batch-transcription#configuration-properties
    # for supported parameters. All values are passed as strings for consistency.
    properties = {
        # 'PunctuationMode': 'DictatedAndAutomatic',
        # 'ProfanityFilterMode': 'Masked',
        'AddWordLevelTimestamps': 'True',
        'AddDiarization': 'True',
        'AddSentiment': 'True'
        # 'TranscriptionResultsContainerUrl': "<results container>"
    }

    # Use base models for transcription. Comment this block if you are using a custom model.
    transcription_definition = cris_client.TranscriptionDefinition(
        name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
        properties=properties
    )

    # Uncomment this block to use custom models for transcription.
    # Model information (ADAPTED_ACOUSTIC_ID and ADAPTED_LANGUAGE_ID) must be set above.
    # if ADAPTED_ACOUSTIC_ID is None or ADAPTED_LANGUAGE_ID is None:
    #     logging.info("Custom model ids must be set to when using custom models")
    # transcription_definition = cris_client.TranscriptionDefinition(
    #     name=NAME, description=DESCRIPTION, locale=LOCALE, recordings_url=RECORDINGS_BLOB_URI,
    #     models=[cris_client.ModelIdentity(ADAPTED_ACOUSTIC_ID), cris_client.ModelIdentity(ADAPTED_LANGUAGE_ID)],
    #     properties=properties
    # )

    data, status, headers = transcription_api.create_transcription_with_http_info(transcription_definition)

    # extract transcription location from the headers
    transcription_location: str = headers["location"]

    # get the transcription Id from the location URI
    created_transcription: str = transcription_location.split('/')[-1]

    logging.info("Created new transcription with id {}".format(created_transcription))

    logging.info("Checking status.")

    # The transcription created above, once it has reached a final state
    finished = None

    while finished is None:
        running, not_started = 0, 0

        # get all transcriptions for the user and check the status of each one
        for transcription in transcription_api.get_transcriptions():
            if transcription.status in ("Failed", "Succeeded"):
                # only the transcription created by this client ends the wait
                if created_transcription == transcription.id:
                    finished = transcription
                    break
            elif transcription.status == "Running":
                running += 1
            elif transcription.status == "NotStarted":
                not_started += 1

        if finished is None:
            logging.info("Transcriptions status: "
                         "completed (this transcription): {}, {} running, {} not started yet".format(
                             False, running, not_started))

            # wait for 5 seconds before polling again
            time.sleep(5)

    if finished.status != "Succeeded":
        logging.error("Transcription failed :{}.".format(finished.status_message))
        return None

    results_uri = finished.results_urls["channel_0"]
    # A timeout keeps a stalled download from hanging the notebook forever, and
    # raise_for_status() stops an HTTP error page from being returned as if it
    # were the transcription result.
    results = requests.get(results_uri, timeout=60)
    results.raise_for_status()
    results_text = results.content.decode("utf-8")

    logging.info("Transcription succeeded. Results: ")
    logging.info(results_text)
    return results_text




In [11]:
if __name__ == "__main__":
    # transcribe() returns None when the service reports the transcription as
    # failed. Stop here with a clear message rather than letting the parsing
    # cells below fail on a None response.
    response1 = transcribe()
    if response1 is None:
        raise RuntimeError("Transcription did not succeed; see the log output above for details.")

Starting transcription client...
Deleting all existing completed transcriptions.
Created new transcription with id d94ccaef-491e-43f8-af39-80b6ab5f072d
Checking status.
Transcriptions status: completed (this transcription): False, 0 running, 1 not started yet
Transcriptions status: completed (this transcription): False, 0 running, 1 not started yet
Transcriptions status: completed (this transcription): False, 0 running, 1 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptions status: completed (this transcription): False, 1 running, 0 not started yet
Transcriptio

In [12]:
# Show the raw result text returned by transcribe(); the whole document is printed, so the output can be long.
print(response1)

{
  "AudioFileResults": [
    {
      "AudioFileName": "Channel.0.wav",
      "AudioFileUrl": null,
      "AudioLengthInSeconds": 290.31,
      "CombinedResults": [
        {
          "ChannelNumber": null,
          "Lexical": "they will call good holder services my name is jimmy only had the card number please yes but first let me ask my daughter receive this gift card for christmas southwest bank gift card and we're trying to get a balance on it keeps asking for a pin with course she doesn't have a pen so can you switch the balances i'm sorry about it but uh if you want to check the balance as soon as we service the the card 'cause we service model credit union can i get the card number please for me to check yes it's a it's a gift card OK OK i'm now loading it here in our system one moment please in your name is you are i'm sorry for the delay just said still getting the information OK uhm here's something came up now this system and showing here that this card is broadway uh noth

In [13]:
# Build, for every speaker found by diarization, the list of start and end
# times (in seconds) of the segments attributed to that speaker.
#
# The result is a nested JSON document rather than a table, so it is parsed
# with the json module.
audioFileResults = json.loads(response1)['AudioFileResults']
speakers_timestamps = {}   # speaker id -> {'speakerId', 'start_timestamps', 'end_timestamps'}
speakers = []              # speaker ids in order of first appearance

for audio_file in audioFileResults:

    seg_results = audio_file['SegmentResults']

    # Single pass over the segments: register unseen speakers and append the
    # segment boundaries to the matching speaker entry. Entries are only ever
    # appended to, so data collected from an earlier audio file is not
    # overwritten when a later file is processed.
    for segmentResult in seg_results:
        speaker = segmentResult['SpeakerId']
        if speaker not in speakers_timestamps:
            speakers.append(speaker)
            speakers_timestamps[speaker] = {
                'speakerId': speaker,
                'start_timestamps': [],
                'end_timestamps': [],
            }

        # compute end time; timedelta works at microsecond resolution, which
        # avoids binary floating point noise in offset + duration
        endTime = (timedelta(seconds=segmentResult['OffsetInSeconds'])
                   + timedelta(seconds=segmentResult['DurationInSeconds'])).total_seconds()

        speakers_timestamps[speaker]['start_timestamps'].append(segmentResult['OffsetInSeconds'])
        speakers_timestamps[speaker]['end_timestamps'].append(endTime)

    print("Unique set of speakers :")
    print(speakers)
    print('\n')

    print(speakers_timestamps)
    print('\n')

# serialize the mapping as a JSON string
json_obj = json.dumps(speakers_timestamps, indent=4)

print("printing json object.....\n")
print(json_obj)

# keep a copy on disk
with open("audiotimestamp.json", "w", encoding="utf-8") as outfile:
    outfile.write(json_obj)

Unique set of speakers :
['1', '2']


{'1': {'speakerId': '1', 'start_timestamps': [3.65, 24.09, 31.71, 38.68, 66.31, 84.51, 89.75, 99.66, 132.21, 150.93, 153.14, 160.23, 176.57, 178.65, 182.16, 208.55, 218.76, 226.84, 231.88, 240.84, 258.5, 264.33, 270.73, 277.75, 286.39], 'end_timestamps': [7.5, 30.06, 34.77, 45.06, 83.73, 85.09, 98.88, 130.82, 134.49, 151.65, 158.68, 171.38, 177.21, 179.51, 183.34, 212.55, 221.32, 228.08, 232.66, 245.0, 259.5, 269.13, 274.45, 280.17, 287.0]}, '2': {'speakerId': '2', 'start_timestamps': [8.37, 21.58, 34.91, 134.49, 142.58, 147.22, 151.65, 172.82, 184.09, 186.91, 189.08, 197.02, 199.38, 214.4, 221.61, 228.36, 235.29, 238.69, 245.63, 259.5, 274.45, 280.96, 287.71], 'end_timestamps': [21.03, 23.41, 37.23, 140.93, 144.37, 148.75, 152.03, 176.21, 186.14, 187.24, 193.35, 197.59, 206.44, 217.73, 225.43, 231.52, 236.11, 239.31, 258.48, 262.73, 277.37, 283.51, 288.07]}}


printing json object.....

{
    "1": {
        "speakerId": "1",
        "start_timesta

In [14]:
import urllib.request

# Download the recording to the working directory. The URL comes from
# RECORDINGS_BLOB_URI so that the local copy is always the same file that was
# submitted for transcription.
urllib.request.urlretrieve(RECORDINGS_BLOB_URI, 'Samoy.wav')

('Samoy.wav', <http.client.HTTPMessage at 0x13c99537b70>)

In [15]:
!pip install pydub
import os
from pydub import AudioSegment

# Write one clip per speaker ('newClip<speakerId>.wav') containing, back to
# back, every segment of the recording attributed to that speaker.

# Load the recording once; slicing an AudioSegment is done in milliseconds.
source_audio = AudioSegment.from_wav("Samoy.wav")

for speaker in speakers:
    element = speakers_timestamps[speaker]

    # Start from an empty segment so that each slice is appended exactly once.
    combined_sounds = AudioSegment.empty()
    for start, end in zip(element['start_timestamps'], element['end_timestamps']):
        combined_sounds += source_audio[int(start * 1000):int(end * 1000)]

    if len(combined_sounds) == 0:
        # nothing attributed to this speaker, so there is no clip to write
        continue

    # export once, after all segments have been concatenated
    combined_sounds.export('newClip' + element['speakerId'] + '.wav', format="wav")



