In [1]:
import os
from pydub import AudioSegment
import re
import numpy as np
import pandas as pd
from datetime import timedelta

In [2]:
d_path = "diarized_audio/raw_diarizations"
diarization_paths = {}
for directory in os.listdir(d_path):
    diarization_paths[directory] = os.path.join(d_path, directory)

`Careful!` -- the following cell deletes previous work. If intended, run to make way for the new one

In [3]:
# # Remove all csv files from diarization directory
for path in os.listdir(d_path):
    !rm -r {os.path.join(d_path, path)}/speaker_turns.csv

rm: cannot remove 'diarized_audio/raw_diarizations/JRE-neildeGrasseTyson-06092019/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-edwardSnowden-23102019/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-mikeTyson-04092020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-postMalone-29072020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-kanyeWest-24102020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-kevinHart-25052020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-joeyDiaz-26032020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized_audio/raw_diarizations/JRE-mileyCirus-02092020/speaker_turns.csv': No such file or directory
rm: cannot remove 'diarized

Utils:

In [4]:
def sort_directories(arr):
    def sort_key(s):
        return int(s.split('_')[1])

    return sorted(arr, key=sort_key)

def parse_time(time_str):
    # Parses the time string to a pandas.Timedelta object
    return pd.Timedelta(time_str)

def process_line(line):
    # Splits the line and extracts the required information
    time_data, speaker = line.split(']')
    time_data = time_data[2:]
    start_time, end_time = time_data.split(' -->  ')
    speaker = speaker.split(' ')[-1]  # Gets only the SPEAKER_xx part
    return speaker, parse_time(start_time), parse_time(end_time)

def most_common_letters(string, dictionary):
    def common_letter_count(s1, s2):
        return sum(min(s1.count(c), s2.count(c)) for c in set(s1))

    max_key = max(dictionary.keys(), key=lambda k: common_letter_count(k.lower(), string.lower()))
    return dictionary[max_key]

In [5]:
def get_speaker_turns(podcast_dictionaries, verbose : bool = False):
    diarizations = "diarized_audio/raw_diarizations"
    directories = [
        directory 
        for directory in os.listdir(diarizations) 
        if directory in podcast_dictionaries.keys()
    ]
    
    for podcast_name in directories:

        directory = os.path.join(diarizations, podcast_name)
        podcast_df = pd.DataFrame(columns=['speaker', 'start', 'end'])
        spacer_time = pd.Timedelta('0 days 00:00:00.998000')
        global_time = pd.Timedelta('0')
        splits = sort_directories([d for d in os.listdir(directory) if d.endswith('.txt')])
        #to_replace_dict = most_common_letters(podcast_name, podcast_dictionaries)
        to_replace_dict = podcast_dictionaries[podcast_name]
        
        if verbose: 
            print(directory)
            print(to_replace_dict)
        
        for idx, split in enumerate(splits):
            
            file_path = os.path.join(directory, split)
            # get split number with regex that searches for a number
            number = re.search(r'\d+(\.\d+)?', split).group()
            # get the speakers (SPEAKER_00, SPEAKER_...) that talk in this split

            # In case we want to discard the data in one of the splits, just go to the next split.
            if to_replace_dict[number] == "":
                continue

            real_speakers = to_replace_dict[number].split(";")

            with open(file_path, 'r') as file:
                for line in file:
                    speaker, start, end = process_line(line)

                    # Set the correct speaker

                    speaker = speaker.strip('\n')
                    for rsp in real_speakers:
                        rsp = rsp.split(",")
                        pyannote_speaker = 'SPEAKER_' + rsp[0]
                        
                        if pyannote_speaker == speaker:
                            speaker = rsp[1]
                    
                    # Adjust start and end to the global time
                    start = start + global_time - spacer_time
                    end = end + global_time - spacer_time

                    #Create a new tagged speaker intervention in the podcast
                    podcast_df.loc[len(podcast_df)] = [speaker, start, end]
            
            global_time = podcast_df["end"].max()
    
        pattern = 'SPEAKER_\d+'
        filtered_df = podcast_df[~podcast_df['speaker'].str.contains(pattern, na=False)]
        filtered_df.to_csv(os.path.join(directory, "speaker_turns.csv"), header=True, index=False)

In [6]:
%reload_ext autoreload
%autoreload 2
from speakers import replacer_dict

get_speaker_turns(replacer_dict, True)

diarized_audio/raw_diarizations/lexFridman-guidoVanRossum-26112022
{'1': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '2': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '3': '00,LEXFRIDMAN;01,GUIDOVANROSSUM;02,LEXFRIDMAN', '4': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '5': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '6': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '7': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '8': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '9': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '10': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '11': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '12': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '13': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '14': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '15': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '16': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '17': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '18': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '19': '00,LEXFRIDMAN;01,GUIDOVANROSSUM'}


diarized_audio/raw_diarizations/lexFridman-markZuckerberg-09062023
{'1': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '2': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '3': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '4': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '5': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '6': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '7': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '8': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '9': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '10': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '11': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '12': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '13': '00,MARKZUCKERBERG;01,LEXFRIDMAN', '14': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '15': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '16': '00,LEXFRIDMAN;01,MARKZUCKERBERG'}
diarized_audio/raw_diarizations/lexFridman-matthewMcConaughey-13062023
{'1': '00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN', '2': '00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN', '3': '00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN', '4': '00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY', '5': '00,MATTHEWMCCOUNAGHEY;01,LEXFRI

### Now we get what a speaker has said in text, linking the diarizations with the transcriptions

In [7]:
import pandas as pd

# Function to find the closest time in df2 to a given time in df1
def find_closest_start_time(df, given_time):
    # Calculate absolute time differences
    
    time_diff = (df['start'] - given_time).abs()
    
    # Find the index of the minimum difference
    closest_index = time_diff.idxmin()
    return closest_index

def find_closest_end_time(df, given_time):
    # Calculate absolute time differences
    
    time_diff = (df['end'] - given_time).abs()
    
    # Find the index of the minimum difference
    closest_index = time_diff.idxmin()
    return closest_index

def add_text_to_diarization(diarization_df, transcriptions_df):

    # New column for the combined text
    diarization_df['text'] = ''

    for index, row in diarization_df.iterrows():
        # Find closest start and end times in df2
        
        closest_start_index = find_closest_start_time(transcriptions_df, row['start'])
        closest_end_index = find_closest_end_time(transcriptions_df, row['end'])

        # Extract all rows in-between these indices
        if closest_start_index <= closest_end_index:
            relevant_text = transcriptions_df.loc[closest_start_index:closest_end_index, 'text']
        else:
            relevant_text = transcriptions_df.loc[closest_end_index:closest_start_index, 'text']

        # Combine the text and add to dataframe
        combined_text = "".join(relevant_text)
        diarization_df.at[index, 'text'] = combined_text

    # dataframe now contains the combined text in the new 'text' column
    return diarization_df


In [8]:
from nltk.tokenize import TweetTokenizer
from nltk import sent_tokenize

tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

#### Get all podcasts

Get speaker-tagged transcriptions for each dataframe, and save them in the datasets directory

In [9]:
diarizations = "diarized_audio/raw_diarizations"

directories = [
    directory
    for directory in os.listdir(diarizations) 
    if "speaker_turns.csv" in os.listdir(os.path.join(diarizations, directory))
]

for podcast_name in directories:
    directory = os.path.join(diarizations, podcast_name)

    diarization = pd.read_csv(f"diarized_audio/raw_diarizations/{podcast_name}/speaker_turns.csv")
    transcription = pd.read_csv(f"transcribed_audio/{podcast_name}_transcribed.csv")

    # Do a bit of data cleaning
    transcription['start'] = pd.to_timedelta(transcription['start'], unit='s')
    transcription['end'] = pd.to_timedelta(transcription['end'], unit='s')

    diarization['start'] = pd.to_timedelta(diarization['start'])
    diarization['end'] = pd.to_timedelta(diarization['end'])

    all_transcriptions = "".join(transcription["text"])

    final_df = add_text_to_diarization(diarization, transcription)
    final_df.to_csv(os.path.join("datasets", podcast_name + ".csv"), header=True, index=False)

    # speaked = final_df.groupby("speaker")['text'].apply(lambda x: ''.join(x))
    # print("Number of words:", len(tweet_tokenizer.tokenize(all_transcriptions)))
    # print("Number of sentences:", len(sent_tokenize(all_transcriptions)))
    # print(speaked)

Then join them all in a single dataframe.

In [10]:
df_name = "diarized_transcribed_df"
podcast_arr = []

for podcast_name in directories:
    podcast = pd.read_csv(os.path.join('datasets', podcast_name + '.csv'))
    podcast_arr.append(podcast)

diarized_transcribed_df = pd.concat(podcast_arr)
diarized_transcribed_df.to_csv(os.path.join("datasets", df_name + '.csv'), header=True, index=False)

In [11]:
text_df = diarized_transcribed_df.reset_index(drop=True)

### Preprocess the podcasts to ensure everything is correct

In [12]:
text_df['start'] = pd.to_timedelta(text_df['start'])
text_df['end'] = pd.to_timedelta(text_df['end'])

In [13]:
fix_df = (
    text_df[
        ~(( text_df['end'] - text_df['start'] ) < pd.Timedelta(1, unit='s'))
    ]
)

In [14]:
# Are there duplicates?
print(f"There are {fix_df['text'].duplicated().sum()} duplicates")

There are 107 duplicates


In [15]:
# Remove duplicates
duplicates_across_speakers = fix_df[fix_df.duplicated(subset='text', keep=False)]

unique_speakers_per_text = duplicates_across_speakers.drop_duplicates(subset=['text', 'speaker'])

texts_to_remove = unique_speakers_per_text[unique_speakers_per_text.duplicated(subset='text', keep=False)]['text']

# Remove entirely duplicate text if different speakers have said it
fix_df = fix_df[~fix_df['text'].isin(texts_to_remove)]

# Keep the first ocurrence of duplicate text if the same speaker has said it
# (it could be a catch phrase, or something that identifies the speaker, which is very useful for speaker prediction)
fix_df = fix_df.drop_duplicates(subset=['speaker', 'text'])

In [16]:
# Check that we have removed all the duplicates
fix_df['text'].duplicated().any()

False

In [17]:
fix_grouped_df = (
    fix_df.groupby('speaker')['text']
    .agg(
        lambda x: ''.join(x)
    ).to_frame().reset_index()
)

In [18]:
grouped_df = text_df.groupby('speaker')['text'].agg(lambda x: ''.join(x)).to_frame().reset_index()

In [19]:
grouped_df['words'] = grouped_df['text'].apply(lambda x: len(tweet_tokenizer.tokenize(x)))
grouped_df['sentences']  = grouped_df['text'].apply(lambda x: len(sent_tokenize(x)))

fix_grouped_df['words'] = fix_grouped_df['text'].apply(lambda x: len(tweet_tokenizer.tokenize(x)))
fix_grouped_df['sentences']  = fix_grouped_df['text'].apply(lambda x: len(sent_tokenize(x)))

In [20]:
wordiff = sum(grouped_df['words'] - fix_grouped_df['words'])
sentdiff = sum(grouped_df['sentences'] - fix_grouped_df['sentences'])

print(f"Eliminated Words = {wordiff} out of {grouped_df['words'].sum()} ({((wordiff / grouped_df['words'].sum()) * 100):.2f}%) | {grouped_df['words'].sum()} reduced to -> {fix_grouped_df['words'].sum()} ")
print(f"Eliminated Sentences = {sentdiff} out of {grouped_df['sentences'].sum()} ({((sentdiff / grouped_df['sentences'].sum()) * 100):.2f}%) | {grouped_df['sentences'].sum()} reduced to -> {fix_grouped_df['sentences'].sum()} ")

Eliminated Words = 34613 out of 444378 (7.79%) | 444378 reduced to -> 409765 
Eliminated Sentences = 2937 out of 33511 (8.76%) | 33511 reduced to -> 30574 


In [21]:
fix_grouped_df

Unnamed: 0,speaker,text,words,sentences
0,ANDREW,"Flatter for archaeologists. Yeah, it's from t...",27450,3317
1,ANDREWHUBERMAN,"Listen, when it comes to romantic relationshi...",49949,3605
2,BENSHAPIRO,The great light we tell ourselves is that peo...,28319,1621
3,GEORGEHOTZ,Sure. So I think the most obvious way to me i...,23863,2057
4,GUIDOVANROSSUM,How did you pull that off? And what's C Pytho...,23891,1214
5,HANCOCK,We're almost at the edge of history when we g...,26843,2027
6,KANYEWEST,Based off of our connection and just you bein...,21401,1594
7,LEXFRIDMAN,Can you imagine possible features that Python...,73199,5429
8,MARKZUCKERBERG,"that experience like? Oh, it's fun. I know. Y...",35694,1528
9,MARQUES,"Yeah. This is new to me. Yeah. No, you emaile...",15168,1355


In [52]:
# Filter our df taking only speakers that have said more than 40k words
speakers_of_interest = fix_grouped_df[fix_grouped_df['words'] > 40_000]['speaker']
df = fix_df[fix_df['speaker'].isin(speakers_of_interest)][['speaker', 'text']]

In [53]:
df['sentences'] = df['text'].apply(sent_tokenize)
df = df.explode('sentences').drop('text', axis=1).reset_index(drop=True)

In [56]:
# Keep only sentences with more than three words, others are irrelevant
df['word_count'] = df['sentences'].apply(lambda x: len(x.split()))
df = df[df['word_count'] > 3].drop('word_count', axis=1)

In [57]:
df

Unnamed: 0,speaker,sentences
0,LEXFRIDMAN,Can you imagine possible features that Python...
1,LEXFRIDMAN,of the new 4.0?
2,LEXFRIDMAN,"Given the amount of pain and joy, suffering, a..."
3,LEXFRIDMAN,his second time on this podcast.
4,LEXFRIDMAN,He is the creator of the Python programming la...
...,...,...
16034,ANDREWHUBERMAN,"I've seen so many people who were doing great,..."
16035,ANDREWHUBERMAN,So those would be the recommendations.
16036,ANDREWHUBERMAN,There are a bunch of others and I'll keep spou...
16040,ANDREWHUBERMAN,"Oh, that's just Salmonella."


In [58]:
def undersample_and_split_dataframe(df, class_column, split_prob=[0.7, 0.15, 0.15]):

    #split parameters
    choices = ['train', 'test', 'val']

    # Group by class and find the smallest class size
    group = df.groupby(class_column)
    smallest_class_size = group.size().min()

    # Sample from each class (we can use with multi-label)
    undersampled_df = pd.DataFrame()
    for _, group_df in group:
        sampled_df = group_df.sample(n=smallest_class_size, replace=False, random_state=1)

        # Perform the split class by class, for the train, test and validation rows to be balanced.
        sampled_df['split'] = np.random.choice(choices, size=len(sampled_df), p=split_prob)
        undersampled_df = pd.concat([undersampled_df, sampled_df], axis=0)

    # Check the distribution
    print(undersampled_df['split'].value_counts(normalize=True))

    return undersampled_df

In [59]:
prepared_df = undersample_and_split_dataframe(df, 'speaker')
prepared_df.columns = ['category', 'title', 'split']
prepared_df = prepared_df.reset_index(drop=True)

split
train    0.695123
test     0.153796
val      0.151081
Name: proportion, dtype: float64


In [60]:
prepared_df.groupby('category').size()

category
ANDREWHUBERMAN    3069
LEXFRIDMAN        3069
MRBEAST           3069
dtype: int64

In [61]:
prepared_df.to_csv(os.path.join("datasets", df_name + "_preparado.csv"), header=True, index=False)