In [1]:
import os, re
from IPython.display import Audio, display
from bs4 import BeautifulSoup

import utils.prepare as prep
import pandas as pd

In [2]:
# Audacity parsing functions
def find_aup_file(path):
    """Return the filename of the Audacity project (``.aup``) inside ``path``.

    Only the bare filename is returned, not a joined path. One project per
    directory is assumed; if several exist, the alphabetically first one is
    returned so the choice does not depend on directory listing order.

    Args:
        path: Directory to search (not searched recursively).

    Returns:
        The name of the selected ``.aup`` file.

    Raises:
        FileNotFoundError: If ``path`` contains no ``.aup`` file.
    """
    # os.listdir returns entries in arbitrary order, so sort for determinism.
    aup_files = sorted(f for f in os.listdir(path) if f.endswith('.aup'))
    if not aup_files:
        raise FileNotFoundError(f"No .aup file found in {path!r}")
    return aup_files[0]

def parse_audacity_project(path_to_aup_file):
    """Parse an Audacity ``.aup`` project file into a plain dictionary.

    Args:
        path_to_aup_file: Path to the ``.aup`` XML file.

    Returns:
        A dict with the keys:
            'project_name': value of the ``projname`` attribute of ``<project>``.
            'root_path': directory containing the ``.aup`` file.
            'data_path': ``root_path`` joined with the project name.
            'files': one ``{'filename': str, 'offset': float}`` dict per
                ``<import>`` element, in document order.
    """
    # Read with an explicit encoding instead of the platform default, which
    # can mis-decode non-ASCII filenames. XML without an encoding declaration
    # is UTF-8 by specification.
    # NOTE(review): confirm the .aup files in use are UTF-8.
    with open(path_to_aup_file, 'r', encoding='utf-8') as f:
        aup = BeautifulSoup(f, features="lxml-xml")

    root_path = os.path.dirname(path_to_aup_file)

    project = aup.find('project')
    proj_name = project.get('projname')
    data_path = os.path.join(root_path, proj_name)

    proj_files = [
        {
            'filename': item.get('filename'),
            'offset': float(item.get('offset')),
        }
        for item in project.find_all('import')
    ]

    return {
        'project_name': proj_name,
        'root_path': root_path,
        'data_path': data_path,
        'files': proj_files,
    }

def get_discord_name(filename):
    """Pull the name portion out of a track filename.

    The pattern is greedy: it captures the text that follows the last hyphen
    which is still followed by an extension, up to that final extension dot,
    e.g. '4-IlMaximuslI_9218.ogg' -> 'IlMaximuslI_9218'.

    Raises AttributeError when ``filename`` does not match the pattern.
    """
    matcher = re.compile(r'.+-(.*)\..+')
    found = matcher.search(filename)
    return found.group(1)

In [3]:
# Directory holding the recording session to process; it is expected to
# contain a single Audacity .aup project file.
directory = './data/db_02-03-2023/'

# find_aup_file returns a bare filename, so join it back onto the directory
# before parsing. `project` is the parsed project dict used by later cells.
aup_file = os.path.join(directory, find_aup_file(directory))
project = parse_audacity_project(aup_file)


In [4]:

# paths = [os.path.join(project['data_path'], file['filename']) for file in project['files']]

# asr_datas = []
# for path in paths:
#     # print(path)
#     result = prep.get_asr_data(path, refresh=False, model_name='medium.en')
#     asr_datas.append(result)

In [5]:
# for file in project['files']:
#     filepath = os.path.join(project['data_path'], file['filename'])
#     alignment_data = prep.get_alignment_data(filepath, refresh=True)


In [6]:
def listen_segment(segment, audio, rate=16000):
    """Play the slice of ``audio`` covered by ``segment`` inline in the notebook.

    Args:
        segment: Mapping with 'start' and 'end' entries, used directly as
            slice bounds into ``audio`` (presumably sample indices; confirm
            against the code that produces the segments).
        audio: Sliceable sequence of samples for the whole recording.
        rate: Sampling rate in Hz handed to ``Audio`` for playback.
            Defaults to 16000.
    """
    # `display` and `Audio` come from the notebook's top-level import cell.
    clip = audio[segment['start']:segment['end']]
    display(Audio(clip, rate=rate))

In [7]:
# Build one DataFrame of subtitle segments per imported track, each tagged
# with the filename of the track it came from.
dataframes = []
for file in project['files']:
    filepath = os.path.join(project['data_path'], file['filename'])
    srt_data = prep.get_srt_data(filepath)
    df = pd.DataFrame(srt_data).assign(file=file['filename'])
    dataframes.append(df)

# `df` still refers to the last track processed, so this previews that track only.
df.head()


Unnamed: 0,start,end,children,text,file
0,940576,957408,"[{'text': 'I', 'start': 942530, 'end': 943832}...",I believe you.,4-IlMaximuslI_9218.ogg
1,1013792,1032672,"[{'text': 'Hello', 'start': 1015520, 'end': 10...",Hello friends.,4-IlMaximuslI_9218.ogg
2,1158688,1211360,"[{'text': 'What', 'start': 1161376, 'end': 116...",What are we doing for down below?,4-IlMaximuslI_9218.ogg
3,2412064,2442208,"[{'text': 'Was', 'start': 2417797, 'end': 2419...",Was it 400 and something days?,4-IlMaximuslI_9218.ogg
4,2475040,2494432,"[{'text': '14 months?', 'start': 2484329, 'end...",14 months?,4-IlMaximuslI_9218.ogg


In [8]:
# Stack the per-track frames into one frame; each track keeps its own row
# index, so index values repeat across tracks.
big = pd.concat(dataframes)
# Display the segments ordered by start; sort_values returns a new frame, so
# `big` itself stays in concatenation order.
big.sort_values('start')

Unnamed: 0,start,end,children,text,file
0,2080,17376,"[{'text': 'Now', 'start': 2080, 'end': 5672}, ...",Now recording!,1-CS12_4510.ogg
0,30752,91616,"[{'text': 'Oh', 'start': 33004, 'end': 42012},...","Oh no, he didn't say the thing!",3-JadePixie_7138.ogg
0,42016,56800,"[{'text': 'I', 'start': 51985, 'end': 55523}]",I,2-Crux_4429.ogg
1,99360,138720,"[{'text': 'took', 'start': 101620, 'end': 1045...",took away his permissions to say things.,2-Crux_4429.ogg
1,151072,201184,"[{'text': 'But', 'start': 151012, 'end': 15294...","But I like to hear him say, now recording.",3-JadePixie_7138.ogg
...,...,...,...,...,...
1423,233645600,233654752,"[{'text': 'indeed', 'start': 233647192, 'end':...",indeed,4-IlMaximuslI_9218.ogg
866,233663520,233679328,"[{'text': 'Get', 'start': 233663717, 'end': 23...","Get out of here, Craig.",2-Crux_4429.ogg
1424,233699872,233708512,[],,4-IlMaximuslI_9218.ogg
915,233727008,233742304,[],,1-CS12_4510.ogg


In [9]:
# import torch
# import torchaudio
# audios = dict()
# for file in project['files']:
#         filename = file['filename']
#         filepath = os.path.join(project['data_path'], file['filename'])
#         audios[filename] = prep.get_waveform(filepath)

# audios
# def save_audio(path: str,
#                tensor: torch.Tensor,
#                sampling_rate: int = 16000):
#     torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16)


# chunks = []
# for idx, row in big.sort_values('start').iterrows():
#     file = row['file']
#     start = row['start']
#     end = row['end']
#     chunks.append(audios[file][start:end])

# wav = torch.cat(chunks)
# save_audio('output.wav', wav)

In [17]:
# Assemble a speaker-labelled transcript from every segment, ordered by start.
# mergesort is stable, so segments sharing a start value keep their
# concatenation order and the transcript is reproducible between runs.
ordered = big.sort_values('start', kind='mergesort')

# Collect pieces in a list and join once, rather than re-building the whole
# string on every row.
parts = []
current_speaker = ''
for filename, s in zip(ordered['file'], ordered['text']):
    # Empty (or missing) segments contribute nothing; skipping them keeps
    # stray spaces out of the transcript.
    if not isinstance(s, str) or not s.strip():
        continue
    speaker = get_discord_name(filename)
    if speaker != current_speaker:
        # Start a new labelled paragraph whenever the speaker changes.
        parts.append(f"\n\n{speaker}:\n")
        current_speaker = speaker
    parts.append(' ' + s)
text = ''.join(parts)

# Write with an explicit encoding so non-ASCII speaker names or text do not
# depend on the platform default.
with open(os.path.join(directory, 'output.txt'), 'w', encoding='utf-8') as f:
    f.write(text)

For the most part this works. There is a minor issue where some words are attached to the segment preceding the one in which they were actually spoken. For now I will probably leave it that way, since it seems to happen mostly to drop-words.