In [None]:
!pip install pydub



In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import os
import glob

source = '/gdrive/MyDrive/denoised_source'

seasons = os.listdir(source)
print(f"all seasons: {seasons}")
eps = []
rttms = []
for season in seasons:
  for f in os.listdir(os.path.join(source, season)):
    if(f.endswith('.wav')):
      eps.append(os.path.join(source, season, f))
      base = os.path.basename(f)
      base, _ = os.path.splitext(base)
      rttms.append(os.path.join(source, season, base + '.rttm'))



all seasons: ['S3', 'S1', 'S2']


In [None]:
import os

def get_content(rttm_file: str):
  '''input: rttm_file
  output: content'''
  # init
  content = []
  with open(rttm_file, 'rb') as fp:
    data = fp.readlines()
    for line in data:
      l = str(line).split(' ')
      d = {'start': float(l[5]), 'duration': float(l[8]), 'speaker': l[11]}
      content.append(d)

  return content

def get_organized_data(content: list):
  '''input: content
  output: organized_data'''
  # declare an empty object
  organized_data = {}
  # iterate each item in content
  for item in content:
      # get data
      speaker_name = item['speaker']
      start_time = item['start']
      duration = item['duration']

      # push data in organized_data
      if speaker_name not in organized_data:
          organized_data[speaker_name] = {'starts': [], 'durations': []}

      organized_data[speaker_name]['starts'].append(start_time)
      organized_data[speaker_name]['durations'].append(duration)

  return organized_data

rttm_file = rttms[0]
content = get_content(rttm_file)
organized_data = get_organized_data(content)

print(organized_data)


{'speaker_0': {'starts': [3.34, 53.82, 55.66, 56.78, 75.58, 150.86, 151.82, 197.34, 201.82, 206.38, 208.06, 210.46, 212.78, 217.58, 228.38, 234.06, 242.14, 265.5, 288.06, 363.98, 394.22, 405.34, 408.94, 417.98, 419.34, 421.26, 431.26, 440.06, 441.66, 446.22, 460.46, 469.02, 482.22, 483.74, 490.94, 495.02, 498.7, 501.02, 506.46, 509.74, 511.02, 514.94, 516.7, 518.78, 520.46, 521.02, 522.22, 523.9, 534.14, 540.62, 541.82, 548.46, 550.94, 563.5, 568.3, 570.86, 583.82, 629.04, 653.82, 654.62, 658.7, 659.34, 661.74, 672.38, 675.66, 676.3, 677.02, 679.02, 683.26, 693.54, 697.58, 702.54, 705.34, 712.46, 747.9, 757.42, 784.38, 819.26, 828.62, 829.5, 830.78, 839.9, 848.38, 851.5, 853.74, 858.3, 872.94, 890.38, 913.34, 935.26, 945.9, 959.82, 966.06, 967.66, 969.74, 971.98, 973.34, 991.82, 996.46, 1008.46, 1009.58, 1010.7, 1012.38, 1013.58, 1014.94, 1016.3, 1017.26, 1019.02, 1063.9, 1068.38, 1070.54, 1075.98, 1089.42, 1091.98, 1107.74, 1128.78, 1156.38, 1165.18, 1166.86, 1167.34, 1167.9, 1183.74,

In [None]:
import wave
import shutil
from pydub import AudioSegment
from pydub.playback import play

def save_wave_file(destination, data, sample_width=2, channels=1, frame_rate=44100):
    with wave.open(destination, 'wb') as wave_file:
        wave_file.setnchannels(channels)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(frame_rate)
        wave_file.writeframes(data)


def cut_portion(audio, start_time, duration):
    start_frame = int(start_time * 1000)
    end_frame = int((start_time + duration) * 1000)
    return audio[start_frame:end_frame].raw_data

def cut_all(audio, destination, speaker):
    data = b''
    for start, duration in zip(speaker['starts'], speaker['durations']):
        data += cut_portion(audio, start, duration)
    save_wave_file(destination, data)
    play(AudioSegment.from_wav(destination))


In [None]:
for ep, rttm in zip(eps, rttms):
  content = get_content(rttm)
  organized_data = get_organized_data(content)

  file_name_without_extension, _ = os.path.splitext(os.path.basename(ep))
  data_dir = os.path.join(os.path.dirname(ep),file_name_without_extension)
  os.makedirs(data_dir, exist_ok=True)

  audio = AudioSegment.from_wav(ep)
  for speaker in organized_data.keys():
    cut_all(audio, os.path.join(data_dir, speaker), organized_data[speaker])