In [36]:
import os
from pydub import AudioSegment
import re
import numpy as np
import pandas as pd
from datetime import timedelta

In [60]:
# Root folder that holds one sub-directory of raw diarization splits per podcast.
d_path = "diarized_audio/raw_diarizations"

# Map each podcast directory name to its path. Only real directories are kept:
# every value is later handed to os.listdir(), which would fail on a stray file
# sitting in d_path (e.g. .DS_Store). Sorting makes the order reproducible.
diarization_paths = {}
for directory in sorted(os.listdir(d_path)):
    directory_path = os.path.join(d_path, directory)
    if os.path.isdir(directory_path):
        diarization_paths[directory] = directory_path

First, convert every diarization file into a suitable format: a pandas DataFrame of speaker turns (speaker, start, end), saved to disk as a .csv file (`speaker_turns.csv`) inside each podcast directory.

In [51]:
# Two example Timedelta values built from HH:MM:SS.mmm strings
time1, time2 = (pd.Timedelta(stamp) for stamp in ('00:00:06.186', '00:00:27.836'))

Timedelta('0 days 00:00:00')

In [72]:
# Remove any previously generated speaker_turns.csv from each podcast directory
for path in os.listdir(d_path):
    csv_path = os.path.join(d_path, path, "speaker_turns.csv")
    # Pure-Python removal instead of a shell `rm`: no problems with spaces or
    # special characters in names, works on every OS, and is a no-op when the
    # file has not been generated yet.
    if os.path.isfile(csv_path):
        os.remove(csv_path)

In [79]:
import datetime


def parse_time(time_str):
    """Convert a time string (e.g. '00:00:06.186') into a pandas.Timedelta."""
    parsed = pd.Timedelta(time_str)
    return parsed

def process_line(line):
    """Extract the speaker label and both timestamps from one diarization line.

    The line must contain exactly one ']' separating the time range from the
    rest, and the time range must use ' -->  ' (arrow followed by two spaces)
    between start and end; otherwise the tuple unpacking raises ValueError.

    Returns:
        (speaker, start, end) where start/end are pandas.Timedelta objects and
        speaker is the last space-separated token of the text after ']'
        (a trailing newline, if present, is NOT removed here).
    """
    timestamps, label = line.split(']')
    # The first two characters precede the start timestamp and are discarded
    begin, finish = timestamps[2:].split(' -->  ')
    # Only the final token is kept, i.e. the SPEAKER_xx part
    return label.split(' ')[-1], parse_time(begin), parse_time(finish)

# Example of processing the file
def process_all_diarizations(to_replace_dict):
    """Write a ``speaker_turns.csv`` for every directory in ``diarization_paths``.

    For each directory, the split files are read in numeric order of the number
    found in their file name, each line is parsed with ``process_line``, the
    pyannote label (``SPEAKER_xx``) is replaced by the real speaker name, and
    the start/end times are shifted so that all splits share one timeline.

    Args:
        to_replace_dict: maps the split number (as a string, e.g. ``"3"``) to a
            string of ``"<speaker number>,<real name>"`` pairs joined by ``;``
            (e.g. ``"00,HUBERMAN;01,ANDREW"``). An empty string means the split
            is skipped. The same mapping is applied to every directory.

    Raises:
        KeyError: if a split number found in a file name is not a key of
            ``to_replace_dict``.

    Side effects:
        Writes ``speaker_turns.csv`` (columns speaker, start, end) into each
        directory, overwriting any existing file.
    """
    output_name = "speaker_turns.csv"

    for directory in list(diarization_paths.values()):

        # os.listdir() gives no ordering guarantee, but the running time offset
        # is only meaningful when splits are visited in numeric order. Files
        # without a number (and a previously written output file) are ignored
        # instead of crashing on `None.group()`.
        numbered_splits = []
        for split in os.listdir(directory):
            # get split number with regex that searches for a number
            match = re.search(r'\d+(\.\d+)?', split)
            if split == output_name or match is None:
                continue
            numbered_splits.append((float(match.group()), match.group(), split))
        numbered_splits.sort(key=lambda item: item[0])

        # Rows are collected in a list and turned into a DataFrame once, which
        # avoids the quadratic cost of appending to a DataFrame row by row.
        records = []
        global_time = pd.Timedelta('0')

        for _, number, split in numbered_splits:
            # An empty mapping marks a split we do not want to take turns from
            if to_replace_dict[number] == "":
                continue

            # "00,HUBERMAN;01,ANDREW" -> {"SPEAKER_00": "HUBERMAN", "SPEAKER_01": "ANDREW"}
            real_speakers = {}
            for rsp in to_replace_dict[number].split(";"):
                speaker_id, real_name = rsp.split(",")
                real_speakers.setdefault('SPEAKER_' + speaker_id, real_name)

            split_end = global_time
            with open(os.path.join(directory, split), 'r') as file:
                for line in file:
                    if not line.strip():
                        continue  # tolerate blank lines (e.g. at end of file)
                    speaker, start, end = process_line(line)

                    # Set the correct speaker (labels without a mapping are kept as-is)
                    speaker = speaker.strip('\n')
                    speaker = real_speakers.get(speaker, speaker)

                    # Adjust start and end to the global time
                    start = start + global_time
                    end = end + global_time
                    records.append((speaker, start, end))
                    split_end = max(split_end, end)

            # The next split starts where the latest turn seen so far ended.
            # NOTE(review): skipped splits do not advance the timeline, and the
            # last turn's end is used as a stand-in for the split's real length.
            global_time = split_end

        podcast_df = pd.DataFrame(records, columns=['speaker', 'start', 'end'])
        podcast_df.to_csv(os.path.join(directory, output_name), header=True, index=False)

In [105]:
def process_andrew_huberman(
    to_replace_dict,
    directory="diarized_audio/raw_diarizations/flagrant-andrewHubermam-17102022",
):
    """Write ``speaker_turns.csv`` for one podcast directory.

    The split files in ``directory`` are read in numeric order of the number
    found in their file name, each line is parsed with ``process_line``, the
    pyannote label (``SPEAKER_xx``) is replaced by the real speaker name, and
    the start/end times are shifted so that all splits share one timeline.

    Args:
        to_replace_dict: maps the split number (as a string, e.g. ``"3"``) to a
            string of ``"<speaker number>,<real name>"`` pairs joined by ``;``
            (e.g. ``"00,HUBERMAN;01,ANDREW"``). An empty string means the split
            is skipped.
        directory: folder containing the split diarization files; defaults to
            the Andrew Huberman episode.

    Raises:
        KeyError: if a split number found in a file name is not a key of
            ``to_replace_dict``.

    Side effects:
        Writes ``speaker_turns.csv`` (columns speaker, start, end) into
        ``directory``, overwriting any existing file.
    """
    output_name = "speaker_turns.csv"

    # os.listdir() gives no ordering guarantee, but the running time offset is
    # only meaningful when splits are visited in numeric order. Files without a
    # number (and a previously written output file) are ignored instead of
    # crashing on `None.group()`.
    numbered_splits = []
    for split in os.listdir(directory):
        # get split number with regex that searches for a number
        match = re.search(r'\d+(\.\d+)?', split)
        if split == output_name or match is None:
            continue
        numbered_splits.append((float(match.group()), match.group(), split))
    numbered_splits.sort(key=lambda item: item[0])

    # Rows are collected in a list and turned into a DataFrame once, which
    # avoids the quadratic cost of appending to a DataFrame row by row.
    records = []
    global_time = pd.Timedelta('0')

    for _, number, split in numbered_splits:
        # TODO (translated from the original): this check has to be used, but not here.
        # NOTE(review): presumably because a skipped split should still advance
        # the timeline - confirm with the author.
        if to_replace_dict[number] == "":
            continue

        # "00,HUBERMAN;01,ANDREW" -> {"SPEAKER_00": "HUBERMAN", "SPEAKER_01": "ANDREW"}
        real_speakers = {}
        for rsp in to_replace_dict[number].split(";"):
            speaker_id, real_name = rsp.split(",")
            real_speakers.setdefault('SPEAKER_' + speaker_id, real_name)

        split_end = global_time
        with open(os.path.join(directory, split), 'r') as file:
            for line in file:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at end of file)
                speaker, start, end = process_line(line)

                # Set the correct speaker (labels without a mapping are kept as-is)
                speaker = speaker.strip('\n')
                speaker = real_speakers.get(speaker, speaker)

                # Adjust start and end to the global time
                start = start + global_time
                end = end + global_time
                records.append((speaker, start, end))
                split_end = max(split_end, end)

        # The next split starts where the latest turn seen so far ended
        global_time = split_end

    podcast_df = pd.DataFrame(records, columns=['speaker', 'start', 'end'])
    podcast_df.to_csv(os.path.join(directory, output_name), header=True, index=False)

In [98]:
# Split number -> "<pyannote speaker number>,<real name>" pairs joined by ";".
# Leave a key's value empty ("") to indicate that nothing should be taken
# from that split.
to_replace_dict = {
    "1": "00,HUBERMAN;01,ANDREW",
    "2": "00,HUBERMAN;01,ANDREW",
    "3": "01,HUBERMAN;00,ANDREW",
    "4": "00,HUBERMAN;01,ANDREW",
    "5": "02,HUBERMAN;01,ANDREW",
    "6": "01,HUBERMAN;00,ANDREW",
    "7": "01,HUBERMAN",
    "8": "01,HUBERMAN;02,ANDREW",
    "9": "01,HUBERMAN;00,ANDREW",
    "10": "02,HUBERMAN;03,ANDREW",
    "11": "02,HUBERMAN;01,ANDREW",
    "12": "00,HUBERMAN",
    "13": "00,HUBERMAN;01,ANDREW",
}

process_andrew_huberman(to_replace_dict)

In [96]:
# Load the speaker_turns.csv of the Andrew Huberman episode to inspect it
pd.read_csv("diarized_audio/raw_diarizations/flagrant-andrewHubermam-17102022/speaker_turns.csv")

Unnamed: 0,speaker,start,end
0,0,0 days 00:00:01.390000,0 days 00:02:44.598000
1,SPEAKER_01,0 days 00:00:18.984000,0 days 00:00:23.796000
2,SPEAKER_01,0 days 00:00:29.496000,0 days 00:00:30.366000
3,SPEAKER_01,0 days 00:01:04.087000,0 days 00:01:04.496000
4,SPEAKER_01,0 days 00:01:07.346000,0 days 00:01:07.841000
...,...,...,...
633,0,0 days 02:12:58.383000,0 days 02:12:59.407000
634,0,0 days 02:13:02.069000,0 days 02:13:07.751000
635,0,0 days 02:13:11.096000,0 days 02:13:11.437000
636,0,0 days 02:13:14.987000,0 days 02:13:35.994000


In [4]:
def create_speaker_files(splits_path, to_replace_dict):
    """Work in progress: set up per-speaker data for the splits in a folder.

    Currently this only prepares intermediate values (the distinct real speaker
    names, an empty list of interventions for each of them, and the pyannote
    labels that talk in each split); it does not write or return anything yet.

    Args:
        splits_path: folder whose entries are the split diarization files; each
            file name must contain the split number.
        to_replace_dict: maps the split number (as a string) to a string of
            ``"<speaker number>,<real name>"`` pairs joined by ``;``. An empty
            string means no speaker is taken from that split.

    Raises:
        AssertionError: if the folder does not hold exactly one entry per key
            of ``to_replace_dict``.
        KeyError: if a split number found in a file name is not a key of
            ``to_replace_dict``.
    """
    # Odd-positioned tokens of "00,NAME;01,NAME" are the real speaker names
    speakers = [re.split("[,;]", val)[1::2] for val in to_replace_dict.values()]
    unique_speakers = list({speaker for subarray in speakers for speaker in subarray})
    interventions_dict = {speaker: [] for speaker in unique_speakers}

    global_time = np.float64(0)

    splits = os.listdir(splits_path)
    assert len(splits) == len(to_replace_dict), (
        "expected exactly one split file per entry of to_replace_dict"
    )

    for split in splits:
        # get split number with regex that searches for a number
        number = re.search(r'\d+(\.\d+)?', split).group()
        # get the speakers (SPEAKER_00, SPEAKER_...) that talk in THIS split:
        # look the mapping up by the split's own number, not a fixed key
        mapping = to_replace_dict[number]
        talkers = (
            ["SPEAKER_" + speaker_id for speaker_id in re.split("[,;]", mapping)[0::2]]
            if mapping
            else []
        )












    #     #Extraemos los timestamps de todos los talkers
    #     with open(split, 'r') as file:
    #         for line in file:

                

    #             # Check if the line needs to be modified
    #             if 'some condition' in line:
    #                 # Modify the line as needed
    #                 modified_line = line.replace('old text', 'new text')
    #                 modified_lines.append(modified_line)
    #             else:
    #                 # If no modification is needed, keep the line as is
    #                 modified_lines.append(line)



    # # Optionally, write the modified lines back to a file
    # with open('yourfile_modified.txt', 'w') as file:
    #     for line in modified_lines:
    #         file.write(line)


# Now 'lines' is a list where each element is a line in the file


In [5]:
# Mapping format: {split_number: replacements}, where ``replacements`` is a
# string of "<speaker number to replace>,<name>" pairs, with the pairs of
# different speakers separated by semicolons (e.g. "00,HUBERMAN;01,ANDREW").
to_replace_dict = {
    "1": "00,HUBERMAN;01,ANDREW",
    "2": "00,HUBERMAN;01,ANDREW",
    "3": "01,HUBERMAN;00,ANDREW",
    "4": "00,HUBERMAN;01,ANDREW",
    "5": "02,HUBERMAN;01,ANDREW",
    "6": "01,HUBERMAN;00,ANDREW",
    "7": "01,HUBERMAN",
    "8": "01,HUBERMAN;02,ANDREW",
    "9": "01,HUBERMAN;00,ANDREW",
    "10": "02,HUBERMAN;03,ANDREW",
    "11": "02,HUBERMAN;01,ANDREW",
    "12": "00,HUBERMAN",
    "13": "00,HUBERMAN;01,ANDREW",
}
# Names per split: the odd-positioned tokens after splitting on "," and ";"
speakers = [re.split("[,;]", mapping)[1::2] for mapping in to_replace_dict.values()]

In [18]:
# pyannote labels for split "1": the even-positioned tokens are the speaker numbers
["SPEAKER_" + speaker_id for speaker_id in re.split("[,;]", to_replace_dict["1"])[::2]]

['SPEAKER_00', 'SPEAKER_01']

In [21]:
# Names per split, then the distinct names across all splits
speakers = [re.split("[,;]", mapping)[1::2] for mapping in to_replace_dict.values()]
list({name for names in speakers for name in names})

['HUBERMAN', 'ANDREW']

In [8]:
# Check that the split number can be pulled out of a split file name
split = "split_1_diarized.txt"
int(re.search(r'\d+(\.\d+)?', split)[0])

1

In [9]:
# Presumably the output folder for the final diarizations; it is not used by
# any of the cells shown here - TODO confirm where it is consumed.
dest_path = "diarized_audio/final_diarizations"
