In [None]:
import os
from pydub import AudioSegment
import re
import numpy as np
import pandas as pd
from datetime import timedelta

In [None]:
d_path = "diarized_audio/raw_diarizations"
# Map every entry found directly under d_path to its path inside d_path.
diarization_paths = {
    entry: os.path.join(d_path, entry) for entry in os.listdir(d_path)
}

First, process all the files into a suitable format: a pandas DataFrame, which is then saved to disk as a .csv file.

In [None]:
# Remove all csv files from diarization directory
# for path in os.listdir(d_path):
#     !rm -r {os.path.join(d_path, path)}/speaker_turns.csv

In [None]:
# to_replace_dict = {
#     "1":"01,HUBERMAN;00,ANDREW",
#     "2":"00,HUBERMAN;01,ANDREW",
#     "3":"01,HUBERMAN;00,ANDREW",
#     "4":"00,HUBERMAN;01,ANDREW",
#     "5":"02,HUBERMAN;01,ANDREW",
#     "6":"01,HUBERMAN;00,ANDREW",
#     "7":"01,HUBERMAN",
#     "8":"01,HUBERMAN;02,ANDREW",
#     "9":"01,HUBERMAN;00,ANDREW",
#     "10":"02,HUBERMAN;03,ANDREW",
#     "11":"02,HUBERMAN;01,ANDREW",
#     "12":"00,HUBERMAN",
#     "13":"00,HUBERMAN;01,ANDREW"
# }

In [None]:
def sort_directories(arr):
    """Return the names in ``arr`` as a new list, ordered numerically.

    The ordering key is the integer value of the second ``'_'``-separated
    field of each name, so ``'x_10_y'`` sorts after ``'x_2_y'``.
    """
    ordered = list(arr)
    ordered.sort(key=lambda name: int(name.split('_')[1]))
    return ordered

In [None]:
def parse_time(time_str):
    """Convert a time string (e.g. ``'00:01:02.345'``) into a ``pandas.Timedelta``."""
    return pd.Timedelta(time_str)

def process_line(line):
    """Parse one diarization line into ``(speaker, start, end)``.

    A line is expected to look like::

        [ 00:00:00.497 -->  00:00:09.762] A SPEAKER_01

    i.e. a bracketed ``start --> end`` time range followed by text whose last
    whitespace-separated token is the speaker label.

    Parameters
    ----------
    line : str
        One raw line of a diarization file; a trailing newline is accepted.

    Returns
    -------
    tuple
        ``(speaker, start, end)`` where ``speaker`` is the last token of the
        line with no surrounding whitespace or newline, and ``start`` / ``end``
        are ``pandas.Timedelta`` objects.

    Raises
    ------
    ValueError
        If the line has no ``]``, no ``-->`` or no speaker token, or if a
        timestamp cannot be parsed.
    """
    # Strip first so a trailing newline or trailing spaces never end up in
    # (or replace) the speaker token.
    time_data, closing, remainder = line.strip().partition(']')
    # Split on the bare arrow and strip each side, so the result does not
    # depend on how many spaces surround the arrow or follow the '['.
    start_time, arrow, end_time = time_data.lstrip('[').partition('-->')
    tokens = remainder.split()
    if not (closing and arrow and tokens):
        raise ValueError(f"Malformed diarization line: {line!r}")
    speaker = tokens[-1]  # Only the SPEAKER_xx part
    return speaker, parse_time(start_time.strip()), parse_time(end_time.strip())

def process_joerogan_kevinhart(
    to_replace_dict,
    directory="diarized_audio/raw_diarizations/JRE-kevinHart-25052020",
):
    """Merge the per-split diarization files of an episode into one CSV.

    Every split file in ``directory`` is read in numeric order (see
    ``sort_directories``), its generic ``SPEAKER_xx`` labels are replaced by
    the names given in ``to_replace_dict``, and its timestamps are shifted
    onto one running timeline. The result is written to
    ``<directory>/speaker_turns.csv`` with columns ``speaker``, ``start``
    and ``end``.

    Parameters
    ----------
    to_replace_dict : dict[str, str]
        Maps the split number (the first number found in the file name, as a
        string) to ``"NN,NAME;NN,NAME;..."``: each ``SPEAKER_NN`` label in
        that split is renamed to ``NAME``. An empty string means that no
        turns are kept from that split; the split is still read so that the
        timeline keeps advancing for the splits that follow.
    directory : str, optional
        Folder holding the split files. Defaults to the JRE / Kevin Hart
        episode folder.

    Raises
    ------
    KeyError
        If a split number has no entry in ``to_replace_dict``.
    ValueError
        If a non-blank line cannot be parsed by ``process_line``.
    """
    output_name = "speaker_turns.csv"
    # NOTE(review): presumably each split's audio starts with a spacer of
    # this length that must be removed from its timestamps -- confirm.
    spacer_time = pd.Timedelta('0 days 00:00:00.998000')
    global_time = pd.Timedelta('0')

    # The output CSV lives in the same folder as the inputs: leave it (and
    # hidden entries) out, otherwise a second run fails while sorting because
    # their names carry no split number.
    split_names = [
        name for name in os.listdir(directory)
        if name != output_name and not name.startswith('.')
    ]

    turns = []
    for split in sort_directories(split_names):
        file_path = os.path.join(directory, split)
        # Split number: the first (possibly decimal) number in the file name.
        number = re.search(r'\d+(\.\d+)?', split).group()

        replacements = to_replace_dict[number]
        keep_split = replacements != ""

        # Lookup from the label used in the file to the real speaker name.
        speaker_names = {}
        if keep_split:
            for entry in replacements.split(";"):
                fields = entry.split(",")
                # setdefault: the first mapping given for a label wins.
                speaker_names.setdefault('SPEAKER_' + fields[0], fields[1])

        split_end = global_time
        with open(file_path, 'r') as file:
            for line in file:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at the end of the file)

                speaker, start, end = process_line(line)
                speaker = speaker.strip()

                # Move this split's local timestamps onto the global timeline.
                start = start + global_time - spacer_time
                end = end + global_time - spacer_time
                split_end = max(split_end, end)

                if keep_split:
                    turns.append((speaker_names.get(speaker, speaker), start, end))

        # The next split starts where the latest turn seen so far ended,
        # whether or not this split's turns were kept.
        global_time = split_end

    # Build the frame once instead of appending row by row.
    podcast_df = pd.DataFrame(turns, columns=['speaker', 'start', 'end'])
    podcast_df.to_csv(os.path.join(directory, output_name), header=True, index=False)

In [None]:
# Manual speaker labels, one entry per audio split.
# Key: the split number, as a string.
# Value: one or more "NN,NAME" pairs separated by ";", where NN is the
# numeric suffix of a SPEAKER_NN label found in that split's file and NAME
# is the name that replaces it.
to_replace_dict = {
    "1": "00,KEVINHART;01,JOEROGAN",
    "2": "00,KEVINHART;01,JOEROGAN",
    "3": "00,KEVINHART;01,JOEROGAN",
    "4": "00,JOEROGAN;01,KEVINHART",
    "5": "00,JOEROGAN;01,KEVINHART",
    "6": "00,KEVINHART;01,JOEROGAN",
    "7": "00,KEVINHART;01,JOEROGAN",
    "8": "00,KEVINHART;01,JOEROGAN",
    "9": "01,KEVINHART;02,JOEROGAN",
    "10": "00,KEVINHART;01,JOEROGAN",
    "11": "00,KEVINHART;01,JOEROGAN",
    "12": "00,KEVINHART;01,JOEROGAN"
}

# Leave a key's value empty ("") to indicate that no turns should be taken from that split.

# Writes speaker_turns.csv for the episode using the labels above.
process_joerogan_kevinhart(to_replace_dict)

### Next, recover what each speaker said as text by linking the diarization turns with the transcription

In [None]:
# Speaker turns produced by the diarization step, and the transcribed segments of the same episode.
diarization_csv = "diarized_audio/raw_diarizations/JRE-kevinHart-25052020/speaker_turns.csv"
transcription_csv = "transcribed_audio/JRE-kevinHart-25052020_transcribed.csv"

diarization = pd.read_csv(diarization_csv)
transcription = pd.read_csv(transcription_csv)

In [None]:
# Interpret the transcription boundaries as a number of seconds and store them as timedeltas.
for column in ('start', 'end'):
    transcription[column] = pd.to_timedelta(transcription[column], unit='s')

In [None]:
# Parse the diarization boundaries read from the CSV back into timedeltas.
for column in ('start', 'end'):
    diarization[column] = pd.to_timedelta(diarization[column])

In [None]:
from nltk.tokenize import TweetTokenizer
from nltk import sent_tokenize

tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

# Size of the whole transcript: every segment's text concatenated as-is.
all_transcriptions = "".join(transcription["text"])

word_tokens = tweet_tokenizer.tokenize(all_transcriptions)
print("Number of words:", len(word_tokens))

sentences = sent_tokenize(all_transcriptions)
print("Number of sentences:", len(sentences))

In [None]:
# Display the transcription segments ('start' / 'end' now hold timedeltas).
transcription

In [None]:
# Display the speaker turns ('start' / 'end' now hold timedeltas).
diarization

Now a new column should be added to the diarization dataframe. This column will be the text said in that time frame.

In [None]:
import pandas as pd

def find_closest_start_time(df, given_time):
    """Return the index label of the row of ``df`` whose ``'start'`` is nearest to ``given_time``.

    Distance is the absolute difference ``|df['start'] - given_time|``; when
    several rows are equally close, the first one is returned.
    """
    distance_to_start = df['start'].sub(given_time).abs()
    return distance_to_start.idxmin()

def find_closest_end_time(df, given_time):
    """Return the index label of the row of ``df`` whose ``'end'`` is nearest to ``given_time``.

    Distance is the absolute difference ``|df['end'] - given_time|``; when
    several rows are equally close, the first one is returned.
    """
    distance_to_end = df['end'].sub(given_time).abs()
    return distance_to_end.idxmin()

# For every speaker turn, collect the text of the transcription segments it spans.
combined_texts = []

for turn in diarization.itertuples(index=False):
    # Transcription rows whose start / end lie closest to the turn's own boundaries.
    first_idx = find_closest_start_time(transcription, turn.start)
    last_idx = find_closest_end_time(transcription, turn.end)

    # Slice from the smaller label to the larger one, whichever boundary produced it.
    lower, upper = min(first_idx, last_idx), max(first_idx, last_idx)
    segment_texts = transcription.loc[lower:upper, 'text']

    combined_texts.append("".join(segment_texts))

# New column: the text linked to each turn.
diarization['text'] = combined_texts


In [None]:
# Concatenate, per speaker, the text of all of that speaker's turns.
texts_by_speaker = diarization.groupby("speaker")['text']
speaked = texts_by_speaker.apply(''.join)
speaked