In [None]:
import os
from pydub import AudioSegment
import re
import numpy as np
import pandas as pd
from datetime import timedelta

In [None]:
d_path = "diarized_audio/raw_diarizations"
# Map every entry found under the raw-diarization root to its full path.
diarization_paths = {
    entry: os.path.join(d_path, entry)
    for entry in os.listdir(d_path)
}

`Careful!` -- the following cell deletes previous work (every existing `speaker_turns.csv`). Run it only if you intend to regenerate those files.

In [None]:
# Remove the previously generated speaker_turns.csv from every diarization directory.
# Pure Python instead of `!rm`, so it works on any platform, copes with
# spaces in paths, and stays silent when there is nothing to delete.
for path in os.listdir(d_path):
    stale_csv = os.path.join(d_path, path, "speaker_turns.csv")
    if os.path.isfile(stale_csv):
        os.remove(stale_csv)

Utils:

In [None]:
def sort_directories(arr):
    """Return the names in ``arr`` as a new list ordered by the integer in
    their second ``_``-separated field (e.g. ``x_10_y`` sorts after ``x_2_y``)."""
    ordered = list(arr)
    ordered.sort(key=lambda name: int(name.split('_')[1]))
    return ordered

def parse_time(time_str):
    """Convert a time string into a ``pandas.Timedelta``."""
    parsed = pd.Timedelta(time_str)
    return parsed

def process_line(line):
    """Parse one diarization line into ``(speaker, start, end)``.

    The line must contain exactly one ``]``. The text before it, minus its
    first two characters, must read ``<start> -->  <end>`` (note the two spaces
    after the arrow). The last space-separated word after the ``]`` is taken as
    the speaker label.
    NOTE(review): this looks like pyannote's plain-text diarization output
    (``[ 00:00:00.497 -->  00:00:01.003] A SPEAKER_00``) -- confirm.

    Returns
    -------
    tuple
        ``(speaker, start, end)`` where ``speaker`` is the label without
        surrounding whitespace and ``start`` / ``end`` are ``pandas.Timedelta``.

    Raises
    ------
    ValueError
        If the line does not split into the expected two parts.
    """
    time_data, speaker = line.split(']')
    time_data = time_data[2:]  # drop the two leading characters ("[ ")
    start_time, end_time = time_data.split(' -->  ')
    # Strip first so the trailing newline read from the file is not returned
    # as part of (or instead of) the SPEAKER_xx label.
    speaker = speaker.strip().split(' ')[-1]  # Gets only the SPEAKER_xx part
    return speaker, parse_time(start_time), parse_time(end_time)

def most_common_letters(string, dictionary):
    """Return the value of the key in ``dictionary`` that shares the most
    letters (case-insensitive, counting repetitions) with ``string``.

    On a tie the first such key in iteration order wins.
    """
    def overlap(key):
        candidate = key.lower()
        needle = string.lower()
        total = 0
        for letter in set(candidate):
            total += min(candidate.count(letter), needle.count(letter))
        return total

    best_key = max(dictionary, key=overlap)
    return dictionary[best_key]

In [None]:
def get_speaker_turns(podcast_dictionaries, verbose : bool = False,
                      diarizations : str = "diarized_audio/raw_diarizations"):
    """Write a ``speaker_turns.csv`` with real speaker names for each podcast.

    For every sub-directory of ``diarizations`` whose name is a key of
    ``podcast_dictionaries``, the ``.txt`` split files are read in the order
    given by ``sort_directories``. The ``SPEAKER_xx`` labels are replaced by
    the names configured for that split, and the turns of all splits are laid
    out on one timeline. Turns that still carry a ``SPEAKER_<digits>`` label
    after the replacement are dropped, and the rest is written to
    ``<diarizations>/<podcast>/speaker_turns.csv``.

    Parameters
    ----------
    podcast_dictionaries : dict
        ``{podcast_name: {split_number: mapping}}``. ``split_number`` is the
        first number found in the split file name, as a string. ``mapping`` is
        a string of ``;``-separated ``<id>,<name>`` pairs, meaning
        ``SPEAKER_<id>`` is renamed to ``<name>``. An empty string discards
        the whole split.
    verbose : bool
        When true, print each podcast directory and its mapping.
    diarizations : str
        Root directory that holds one sub-directory per podcast.

    Raises
    ------
    KeyError
        If a split number has no entry in the podcast's mapping.
    """
    # Constant subtracted from every timestamp.
    # NOTE(review): presumably the silence prepended to each split before
    # diarization -- confirm.
    spacer_time = pd.Timedelta('0 days 00:00:00.998000')
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    label_pattern = r'SPEAKER_\d+'

    directories = [
        directory
        for directory in os.listdir(diarizations)
        if directory in podcast_dictionaries
    ]

    for podcast_name in directories:

        directory = os.path.join(diarizations, podcast_name)
        global_time = pd.Timedelta('0')
        splits = sort_directories([d for d in os.listdir(directory) if d.endswith('.txt')])
        to_replace_dict = podcast_dictionaries[podcast_name]

        if verbose:
            print(directory)
            print(to_replace_dict)

        # Collect rows in a list and build the frame once; appending to a
        # DataFrame row by row copies it on every insertion.
        turns = []
        latest_end = None

        for split in splits:

            file_path = os.path.join(directory, split)
            # get split number with regex that searches for a number
            number = re.search(r'\d+(\.\d+)?', split).group()
            replacements = to_replace_dict[number]

            # In case we want to discard the data in one of the splits, just go to the next split.
            # NOTE(review): a discarded split does not advance the timeline;
            # confirm this is intended.
            if replacements == "":
                continue

            # Parse the "<id>,<name>;<id>,<name>" mapping once per split.
            # setdefault keeps the first name given for an id, and entries
            # without a comma are ignored.
            speaker_names = {}
            for entry in replacements.split(";"):
                fields = entry.split(",")
                if len(fields) >= 2:
                    speaker_names.setdefault('SPEAKER_' + fields[0], fields[1])

            with open(file_path, 'r') as file:
                for line in file:
                    # Blank lines (e.g. a trailing newline) carry no turn.
                    if not line.strip():
                        continue

                    speaker, start, end = process_line(line)

                    # Set the correct speaker
                    speaker = speaker.strip()
                    speaker = speaker_names.get(speaker, speaker)

                    # Adjust start and end to the global time
                    start = start + global_time - spacer_time
                    end = end + global_time - spacer_time

                    turns.append((speaker, start, end))
                    if latest_end is None or end > latest_end:
                        latest_end = end

            # The next split starts where the latest turn so far ended. Keep
            # the previous offset while nothing has been collected, instead
            # of taking the max of an empty column (NaN).
            if latest_end is not None:
                global_time = latest_end

        podcast_df = pd.DataFrame(turns, columns=['speaker', 'start', 'end'])
        # Drop the turns whose label was never mapped to a real speaker.
        filtered_df = podcast_df[~podcast_df['speaker'].str.contains(label_pattern, na=False)]
        filtered_df.to_csv(os.path.join(directory, "speaker_turns.csv"), header=True, index=False)

In [None]:
from speakers import replacer_dict

# Generate speaker_turns.csv for every podcast configured in replacer_dict,
# printing each directory and mapping as it is processed.
get_speaker_turns(replacer_dict, verbose=True)

### Next, recover what each speaker said by linking the diarization turns with the transcriptions

In [None]:
import pandas as pd

def find_closest_start_time(df, given_time):
    """Return the index label of the row of ``df`` whose ``start`` value is
    nearest to ``given_time`` (the first such row on a tie)."""
    distance = df['start'] - given_time
    return distance.abs().idxmin()

def find_closest_end_time(df, given_time):
    """Return the index label of the row of ``df`` whose ``end`` value is
    nearest to ``given_time`` (the first such row on a tie)."""
    distance = df['end'] - given_time
    return distance.abs().idxmin()

def add_text_to_diarization(diarization_df, transcriptions_df):
    """Add a ``text`` column to ``diarization_df`` holding what was said in each turn.

    For every turn, the transcription row whose ``start`` is closest to the
    turn's ``start`` and the row whose ``end`` is closest to the turn's ``end``
    are located. The ``text`` of all transcription rows between those two
    (inclusive, by index label) is concatenated.

    Parameters
    ----------
    diarization_df : pandas.DataFrame
        Speaker turns with ``start`` and ``end`` columns. It is modified in
        place: a ``text`` column is added or overwritten.
    transcriptions_df : pandas.DataFrame
        Transcription segments with ``start``, ``end`` and ``text`` columns.
        The label slice assumes its index is sorted.

    Returns
    -------
    pandas.DataFrame
        The same ``diarization_df`` object, now with the ``text`` column.
    """
    # New column for the combined text
    diarization_df['text'] = ''

    for index, row in diarization_df.iterrows():
        # Find closest start and end times in the transcription
        closest_start_index = find_closest_start_time(transcriptions_df, row['start'])
        closest_end_index = find_closest_end_time(transcriptions_df, row['end'])

        # Extract all rows in-between these indices, whichever comes first
        first, last = sorted((closest_start_index, closest_end_index))
        relevant_text = transcriptions_df.loc[first:last, 'text']

        # Segments with no text come back from read_csv as NaN (a float);
        # skip them so the join does not raise TypeError.
        combined_text = "".join(relevant_text.dropna().astype(str))
        diarization_df.at[index, 'text'] = combined_text

    # dataframe now contains the combined text in the new 'text' column
    return diarization_df


In [None]:
from nltk.tokenize import TweetTokenizer
from nltk import sent_tokenize

# Shared tokenizer instance; only the optional word-count diagnostics use it.
tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

#### Get all podcasts

Build the speaker-tagged transcription for each podcast and save it in the `datasets` directory.

In [None]:
diarizations = "diarized_audio/raw_diarizations"

# Podcasts that already have a speaker_turns.csv. The isfile check also skips
# stray files under the root (os.listdir on them would raise), and sorting
# makes the processing order reproducible.
directories = sorted(
    directory
    for directory in os.listdir(diarizations)
    if os.path.isfile(os.path.join(diarizations, directory, "speaker_turns.csv"))
)

# Make sure the output directory exists before writing into it.
os.makedirs("datasets", exist_ok=True)

for podcast_name in directories:
    directory = os.path.join(diarizations, podcast_name)

    diarization = pd.read_csv(os.path.join(directory, "speaker_turns.csv"))
    transcription = pd.read_csv(f"transcribed_audio/{podcast_name}_transcribed.csv")

    # Do a bit of data cleaning: transcription times are read as seconds,
    # diarization times as timedelta strings.
    transcription['start'] = pd.to_timedelta(transcription['start'], unit='s')
    transcription['end'] = pd.to_timedelta(transcription['end'], unit='s')

    diarization['start'] = pd.to_timedelta(diarization['start'])
    diarization['end'] = pd.to_timedelta(diarization['end'])

    # Full transcript, used by the optional diagnostics below. Missing text
    # (NaN) is skipped so the join cannot fail.
    all_transcriptions = "".join(transcription["text"].dropna().astype(str))

    final_df = add_text_to_diarization(diarization, transcription)
    final_df.to_csv(os.path.join("datasets", podcast_name + ".csv"), header=True, index=False)

    # Optional diagnostics:
    # speaked = final_df.groupby("speaker")['text'].apply(lambda x: ''.join(x))
    # print("Number of words:", len(tweet_tokenizer.tokenize(all_transcriptions)))
    # print("Number of sentences:", len(sent_tokenize(all_transcriptions)))
    # print(speaked)

Then join them all into a single dataframe.

In [None]:
df_name = "diarized_transcribed_df"

# Load every per-podcast dataset written by the previous cell.
podcast_arr = [
    pd.read_csv(os.path.join('datasets', podcast_name + '.csv'))
    for podcast_name in directories
]

# Stack them and store the combined dataset next to the individual ones.
diarized_transcribed_df = pd.concat(podcast_arr)
diarized_transcribed_df.to_csv(
    os.path.join("datasets", df_name + '.csv'),
    index=False,
    header=True,
)

### Prepare podcasts for the neural network & prediction making

In [None]:
# Keep only the label (speaker) and the input (text) columns.
df = diarized_transcribed_df.loc[:, ['speaker', 'text']]

In [None]:
def undersample_and_split_dataframe(df, class_column, split_prob=(0.7, 0.15, 0.15), random_state=1):
    """Balance the classes of ``df`` by undersampling and tag each row with a split.

    Every class in ``class_column`` is reduced to the size of the smallest
    class. Each kept row is then assigned to ``'train'``, ``'test'`` or
    ``'val'`` at random, class by class. The proportions are therefore only
    approximately ``split_prob``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; not modified.
    class_column : str
        Column holding the class label.
    split_prob : sequence of float
        Probabilities of ``'train'``, ``'test'`` and ``'val'``, in that order.
    random_state : int or None
        Seed for both the undersampling and the split assignment, so the same
        input always yields the same output. ``None`` gives a fresh random
        result on every call.

    Returns
    -------
    pandas.DataFrame
        The undersampled rows (original index labels kept) with an extra
        ``split`` column. Empty if ``df`` has no classes.
    """
    # Split labels, in the same order as the probabilities in split_prob
    choices = ['train', 'test', 'val']

    # One generator for the whole call: the assignment is reproducible and
    # every class still gets its own draws.
    rng = np.random.default_rng(random_state)

    # Group by class and find the smallest class size
    group = df.groupby(class_column)
    smallest_class_size = group.size().min()

    # Sample from each class and collect the parts; a single concat at the
    # end avoids re-copying the accumulated frame on every iteration.
    sampled_parts = []
    for _, group_df in group:
        sampled_df = group_df.sample(n=smallest_class_size, replace=False, random_state=random_state)

        # Perform the split class by class, so each class is spread over
        # train, test and validation with the same probabilities.
        split_labels = rng.choice(choices, size=len(sampled_df), p=split_prob)
        sampled_parts.append(sampled_df.assign(split=split_labels))

    if sampled_parts:
        undersampled_df = pd.concat(sampled_parts, axis=0)
    else:
        # No classes at all: return an empty frame that still has the column.
        undersampled_df = df.iloc[0:0].assign(split=pd.Series(dtype=object))

    # Check the distribution
    print(undersampled_df['split'].value_counts(normalize=True))

    return undersampled_df

In [None]:
# Balance the speakers, renumber the rows, and rename the columns.
prepared_df = undersample_and_split_dataframe(df, class_column='speaker').reset_index(drop=True)
prepared_df.columns = ['category', 'title', 'split']

In [None]:
# Show the number of rows per category in the prepared dataset.
prepared_df.groupby('category').size()

In [None]:
# Store the prepared dataset alongside the other CSVs in the datasets directory.
prepared_df.to_csv(
    os.path.join("datasets", df_name + "_preparado.csv"),
    index=False,
    header=True,
)