The idea behind this notebook is to transcribe and align an entire Audacity project. However, this time I will take a different approach:  

Instead of transcribing each file and then mixing the results together, I will try to transcribe after mixing the files together.

In [15]:
import os, re, json
import pandas as pd

import torch
import torchaudio

import utils
import utils.prepare as prep

from IPython.display import display, Audio

In [2]:
# Audio sample rate in Hz. The frame offsets shown further down are consistent
# with this rate (e.g. frame 39360 is rendered as 2.460 s in the SRT output).
SR = 16_000
# Folder that is searched for the Audacity project file and that receives the
# generated outputs (short.srt, script.txt).
folder = './data/jotun 02-05-2023/'
# File name, relative to `folder`, of the voice-only recording that is transcribed below.
vofile = 'voice_only.wav'

In [3]:
# Find the Audacity project file inside `folder`, build its full path and parse
# it into `project` (displayed below as the cell output).
aup_name = utils.find_aup_file(folder)
aupfile = os.path.join(folder, aup_name)
project = utils.parse_audacity_project(aupfile)
project

{'project_name': 'gWJmbiLGkb_data',
 'root_path': './data/jotun 02-05-2023',
 'data_path': './data/jotun 02-05-2023\\gWJmbiLGkb_data',
 'files': [{'filename': '1-JadePixie_7138.ogg', 'offset': 0.0},
  {'filename': '2-CS12_4510.ogg', 'offset': 0.0},
  {'filename': '3-Marburg42_5566.ogg', 'offset': 0.0},
  {'filename': '4-IlMaximuslI_9218.ogg', 'offset': 0.0},
  {'filename': '5-Crux_4429.ogg', 'offset': 0.0}]}

In [4]:
# Fetch the VAD segments of every track in the project and tag each segment
# with the file it came from, so the speaker can be recovered later.
datapath = project['data_path']
vad_datas = []
for track in project['files']:
    filename = track['filename']
    segments = prep.get_vad_data(os.path.join(datapath, filename))
    for segment in segments:
        segment['file'] = filename
    vad_datas.append(segments)

In [13]:
# One frame per track, stacked into a single table ordered by segment start.
# The index is rebuilt so that row i is the i-th segment in chronological order.
track_frames = [pd.DataFrame(segments) for segments in vad_datas]
vad_df = pd.concat(track_frames).sort_values('start').reset_index(drop=True)
vad_df.head()

Unnamed: 0,start,end,file
0,69664,109024,2-CS12_4510.ogg
1,111136,120800,2-CS12_4510.ogg
2,217120,259552,2-CS12_4510.ogg
3,268320,284128,1-JadePixie_7138.ogg
4,288800,306656,3-Marburg42_5566.ogg


In [6]:
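# NOTE: this whole cell is disabled and kept for reference only. It appears to be
# the one-off step that wrote `voice_only.wav` (the per-track VAD chunks,
# concatenated in start order) and the matching `voice_only.wav_vad.json` into
# `folder`; the cells below expect that audio file to exist already.
# `df` is not defined in the cells above - presumably it refers to `vad_df`.
# def save_audio(path: str,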
#                tensor: torch.Tensor,
#                sampling_rate: int = 16000):
#     torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16)

# audios = dict()
# for file in project['files']:
#         filename = file['filename']
#         filepath = os.path.join(project['data_path'], file['filename'])
#         audios[filename] = prep.get_waveform(filepath)

# chunks = []
# for idx, row in df.sort_values('start').iterrows():
#     file = row['file']
#     start = row['start']
#     end = row['end']
#     chunks.append(audios[file][start:end])

# wav = torch.cat(chunks)
# save_audio(os.path.join(folder, 'voice_only.wav'), wav)

# short_vad = prep.align_chunks(df.to_dict(orient='records'))
# prep.cache_json(short_vad, os.path.join(folder, 'voice_only.wav_vad.json'))

# del audios

In [108]:
# Full path of the voice-only recording inside the project folder.
filepath = os.path.join(folder, vofile)
# VAD segments and word-level alignment for that recording.
# NOTE(review): both helpers live in utils.prepare and are opaque here; judging by
# the frames displayed below they yield records with start/end (plus text for
# the alignment) - confirm against the module.
vad_short = prep.get_vad_data(filepath)
align_short = prep.get_alignment_data(filepath)

In [109]:
# Segments of the voice-only file. They carry no speaker information of their own,
# so the source track is copied over from `vad_df`, on the assumption that both
# tables list the same segments in the same (chronological) order.
seg_short = pd.DataFrame(vad_short)

# Guard the row-by-row pairing: with plain index alignment a differing segment
# count would go unnoticed (extra rows get NaN, surplus speakers are dropped).
assert len(seg_short) == len(vad_df), (
    f"segment count mismatch: {len(seg_short)} voice-only segments "
    f"vs {len(vad_df)} per-track segments"
)
seg_short['file'] = vad_df['file'].to_numpy()

# Word-level alignment, one row per word.
words = pd.DataFrame(align_short)
display(words.head(15))
display(seg_short.head())

Unnamed: 0,text,start,end
0,"Welcome,",322,5154
1,"scribe,",5798,12885
2,to,13207,13851
3,this,26736,28991
4,place.,30601,35111
5,Apparently,48640,53476
6,nobody,54121,57345
7,else,57990,60569
8,wants,60891,63793
9,to,64115,64760


Unnamed: 0,start,end,file
0,0,39360,2-CS12_4510.ogg
1,39360,49024,2-CS12_4510.ogg
2,49024,91456,2-CS12_4510.ogg
3,91456,107264,1-JadePixie_7138.ogg
4,107264,125120,3-Marburg42_5566.ogg


In [110]:
# Spot check on a single segment (frames 49024-91456): keep the words whose
# midpoint frame lies inside the segment.
words.loc[
    words[['start', 'end']].mean(axis='columns').between(49024, 91456)
]

Unnamed: 0,text,start,end
5,Apparently,48640,53476
6,nobody,54121,57345
7,else,57990,60569
8,wants,60891,63793
9,to,64115,64760
10,say,65728,67340
11,hi,67984,69919
12,to,70241,70886
13,you,71208,72820
14,anymore.,73465,77012


In [111]:
# First version of the word-to-segment assignment: for every segment, select the
# words whose midpoint frame lies between the segment's start and end.
def _words_in_segment(segment):
    midpoints = words[['start', 'end']].mean(axis=1)
    return words[midpoints.between(segment['start'], segment['end'])]

seg_short['subsegments'] = seg_short.apply(_words_in_segment, axis=1)

In [113]:
# Store each word's midpoint frame once so later lookups can reuse it.
words['mid'] = words.loc[:, ['start', 'end']].mean(axis='columns')

In [114]:
# Attach to every segment the words whose midpoint falls inside it.
# The segments of the voice-only file are back to back (one segment's end is the
# next one's start), so the interval is half-open, [start, end): an inclusive
# test on both sides would put a word whose midpoint sits exactly on a boundary
# into both neighbouring segments.
seg_short['subsegments'] = seg_short.apply(
    lambda seg: words[(words['mid'] >= seg['start']) & (words['mid'] < seg['end'])],
    axis=1,
)

In [112]:
# Join the words of each segment into one space-separated string (empty string
# for segments without words). The `text` column is iterated directly instead of
# going through `Series.ravel()`, which is deprecated in pandas 2.2+.
seg_short['text'] = seg_short.apply(
    lambda row: ' '.join(row['subsegments']['text']),
    axis=1,
)
seg_short.head()

Unnamed: 0,start,end,file,subsegments,text
0,0,39360,2-CS12_4510.ogg,"text start end 0 Welcome, 322 ...","Welcome, scribe, to this place."
1,39360,49024,2-CS12_4510.ogg,"Empty DataFrame Columns: [text, start, end] In...",
2,49024,91456,2-CS12_4510.ogg,text start end 5 Apparently 4...,Apparently nobody else wants to say hi to you ...
3,91456,107264,1-JadePixie_7138.ogg,"text start end 15 Hi, 91847 ...","Hi, Craig."
4,107264,125120,3-Marburg42_5566.ogg,"text start end 17 Hi, 107520 ...","Hi, Craig."


In [116]:
## srt subtitles
# Build one SRT cue per non-blank segment, in chronological order:
#   <running number>
#   <start> --> <end>
#   <speaker>:
#   <text>
# followed by an empty line.
entries = []
count = 0
for _, row in seg_short.sort_values('start').iterrows():
    txt = row['text'].strip()
    # Skip blank segments (same stripped test as the script-format cell) so that
    # no empty cue is written and the numbering stays gap-free.
    if not txt:
        continue
    count += 1
    speaker = utils.get_discord_name(row['file'])
    start = utils.frame_to_srt_timestamp(row['start'])
    end = utils.frame_to_srt_timestamp(row['end'])
    entries.append(f"{count}\n{start} --> {end}\n{speaker}:\n{txt}\n\n")
output = ''.join(entries)

# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent every character that occurs in user names or in the transcript.
with open(os.path.join(folder, 'short.srt'), 'w', encoding='utf-8') as f:
    f.write(output)
print(output)

1
00:00:00,000 --> 00:00:02,460
CS12_4510:
Welcome, scribe, to this place.

2
00:00:03,064 --> 00:00:05,716
CS12_4510:
Apparently nobody else wants to say hi to you anymore.

3
00:00:05,716 --> 00:00:06,704
JadePixie_7138:
Hi, Craig.

4
00:00:06,704 --> 00:00:07,820
Marburg42_5566:
Hi, Craig.

5
00:00:07,820 --> 00:00:08,712
IlMaximuslI_9218:
Hi,

6
00:00:08,712 --> 00:00:09,412
IlMaximuslI_9218:
thing.

7
00:00:09,412 --> 00:00:10,080
IlMaximuslI_9218:
First to go.

8
00:00:10,080 --> 00:00:11,612
Crux_4429:
His name's not Craig anymore.

9
00:00:11,612 --> 00:00:15,704
CS12_4510:
Hello, jerk. Hello, jerks, evil twin.

10
00:00:17,200 --> 00:00:18,956
CS12_4510:
OK, OK.

11
00:00:18,956 --> 00:00:23,688
CS12_4510:
Yggdrasil. Hello, Library of Jade. Hello, all my players.

12
00:00:23,688 --> 00:00:27,588
CS12_4510:
Hello, me being a player. Actually, sorry, let's just have it.

13
00:00:27,588 --> 00:00:31,136
CS12_4510:
at us doing research,

14
00:00:31,136 --> 00:00:32,220
CS12_451

In [117]:
## Script format
# Group consecutive segments of the same speaker into one "turn" and write the
# result as a plain-text script:
#   <speaker>:
#   <all text of the turn, space separated>
# with an empty line between turns.
turns = []  # list of (speaker, [segment texts]) in chronological order
for _, row in seg_short.sort_values('start').iterrows():
    segment_text = row['text'].strip()
    # Blank segments contribute nothing; skipping them avoids stray spaces.
    if not segment_text:
        continue
    # Names look like '<user>_<digits>'; drop only the trailing number so that
    # user names which themselves contain '_' are kept intact.
    speaker = utils.get_discord_name(row['file']).rsplit('_', 1)[0]
    if turns and turns[-1][0] == speaker:
        turns[-1][1].append(segment_text)
    else:
        turns.append((speaker, [segment_text]))

text = '\n\n'.join(f"{speaker}:\n{' '.join(parts)}" for speaker, parts in turns)

# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent every character that occurs in user names or in the transcript.
with open(os.path.join(folder, 'script.txt'), 'w', encoding='utf-8') as f:
    f.write(text)