In [1]:
import os
from pydub import AudioSegment
import re
import numpy as np
import pandas as pd
from datetime import timedelta

In [2]:
d_path = "diarized_audio/raw_diarizations"
diarization_paths = {}
for directory in os.listdir(d_path):
    diarization_paths[directory] = os.path.join(d_path, directory)

First, process all the files to have a suitable format. This will be a pandas dataframe, stored in memory as a .csv file

In [None]:
# Remove all csv files from diarization directory
for path in os.listdir(d_path):
    !rm -r {os.path.join(d_path, path)}/speaker_turns.csv

In [23]:
# to_replace_dict = {
#     "1":"01,HUBERMAN;00,ANDREW",
#     "2":"00,HUBERMAN;01,ANDREW",
#     "3":"01,HUBERMAN;00,ANDREW",
#     "4":"00,HUBERMAN;01,ANDREW",
#     "5":"02,HUBERMAN;01,ANDREW",
#     "6":"01,HUBERMAN;00,ANDREW",
#     "7":"01,HUBERMAN",
#     "8":"01,HUBERMAN;02,ANDREW",
#     "9":"01,HUBERMAN;00,ANDREW",
#     "10":"02,HUBERMAN;03,ANDREW",
#     "11":"02,HUBERMAN;01,ANDREW",
#     "12":"00,HUBERMAN",
#     "13":"00,HUBERMAN;01,ANDREW"
# }

Utils:

In [6]:
def sort_directories(arr):
    def sort_key(s):
        return int(s.split('_')[1])

    return sorted(arr, key=sort_key)

def parse_time(time_str):
    # Parses the time string to a pandas.Timedelta object
    return pd.Timedelta(time_str)

def process_line(line):
    # Splits the line and extracts the required information
    time_data, speaker = line.split(']')
    time_data = time_data[2:]
    start_time, end_time = time_data.split(' -->  ')
    speaker = speaker.split(' ')[-1]  # Gets only the SPEAKER_xx part
    return speaker, parse_time(start_time), parse_time(end_time)

def most_common_letters(string, dictionary):
    def common_letter_count(s1, s2):
        return sum(min(s1.count(c), s2.count(c)) for c in set(s1))

    max_key = max(dictionary.keys(), key=lambda k: common_letter_count(k.lower(), string.lower()))
    return dictionary[max_key]

In [49]:
def process_joerogan_kevinhart(to_replace_dict):
    directory = "diarized_audio/raw_diarizations/JRE-kevinHart-25052020"

    podcast_df = pd.DataFrame(columns=['speaker', 'start', 'end'])
    spacer_time = pd.Timedelta('0 days 00:00:00.998000')
    global_time = pd.Timedelta('0')
    splits = sort_directories(os.listdir(directory))

    for idx, split in enumerate(splits):
        
        file_path = os.path.join(directory, split)
        # get split number with regex that searches for a number
        number = re.search(r'\d+(\.\d+)?', split).group()
        # get the speakers (SPEAKER_00, SPEAKER_...) that talk in this split
        #talkers = ["SPEAKER_" + n for n in re.split("[,;]", to_replace_dict[number])[0::2]]
        
        if to_replace_dict[number] == "":   #TODO hay que utilizarlo pero no aqu√≠
            continue

        real_speakers = to_replace_dict[number].split(";")

        with open(file_path, 'r') as file:
            for line in file:
                speaker, start, end = process_line(line)

                # Set the correct speaker
                speaker = speaker.strip('\n')

                for rsp in real_speakers:
                    rsp = rsp.split(",")
                    pyannote_speaker = 'SPEAKER_' + rsp[0]
                    
                    if pyannote_speaker == speaker:
                        speaker = rsp[1]
                
                # Adjust start and end to the global time
                start = start + global_time - spacer_time
                end = end + global_time - spacer_time
                podcast_df.loc[len(podcast_df)] = [speaker, start, end]
        
        global_time = podcast_df["end"].max()
    
    pattern = 'SPEAKER_\d+'
    filtered_df = podcast_df[~podcast_df['speaker'].str.contains(pattern, na=False)]
    filtered_df.to_csv(os.path.join(directory, "speaker_turns.csv"), header=True, index=False)

In [78]:
def process_lexFridman(podcast_dictionaries):
    diarizations = "diarized_audio/raw_diarizations"
    directories = [
        directory 
        for directory in os.listdir(diarizations) 
        if bool(re.match(r"^lexFridman", directory))
    ]
    
    for podcast_name in directories:

        directory = os.path.join(diarizations, podcast_name)
        print(directory)
        podcast_df = pd.DataFrame(columns=['speaker', 'start', 'end'])
        spacer_time = pd.Timedelta('0 days 00:00:00.998000')
        global_time = pd.Timedelta('0')
        splits = sort_directories([d for d in os.listdir(directory) if d.endswith('.txt')])
        #to_replace_dict = most_common_letters(podcast_name, podcast_dictionaries)
        to_replace_dict = podcast_dictionaries[podcast_name]
        print(to_replace_dict)
        
        for idx, split in enumerate(splits):
            
            file_path = os.path.join(directory, split)
            # get split number with regex that searches for a number
            number = re.search(r'\d+(\.\d+)?', split).group()
            # get the speakers (SPEAKER_00, SPEAKER_...) that talk in this split
            #talkers = ["SPEAKER_" + n for n in re.split("[,;]", to_replace_dict[number])[0::2]]

            if to_replace_dict[number] == "":   #TODO modificar
                continue

            real_speakers = to_replace_dict[number].split(";")

            with open(file_path, 'r') as file:
                for line in file:
                    speaker, start, end = process_line(line)

                    # Set the correct speaker

                    speaker = speaker.strip('\n')
                    for rsp in real_speakers:
                        rsp = rsp.split(",")
                        pyannote_speaker = 'SPEAKER_' + rsp[0]
                        
                        if pyannote_speaker == speaker:
                            speaker = rsp[1]
                    
                    # Adjust start and end to the global time
                    start = start + global_time - spacer_time
                    end = end + global_time - spacer_time

                    #Create a new tagged speaker intervention in the podcast
                    podcast_df.loc[len(podcast_df)] = [speaker, start, end]
            
            global_time = podcast_df["end"].max()
    
        pattern = 'SPEAKER_\d+'
        filtered_df = podcast_df[~podcast_df['speaker'].str.contains(pattern, na=False)]
        filtered_df.to_csv(os.path.join(directory, "speaker_turns.csv"), header=True, index=False)

In [47]:
# Format will be split_number:speakers_with_replacement, where speakers
# with replacement is a string with the number of speaker to replace and
# the name, separated by a comma and separated by semicolon from other
# speakers
to_replace_dict = {
    "1": "00,KEVINHART;01,JOEROGAN",
    "2": "00,KEVINHART;01,JOEROGAN",
    "3": "00,KEVINHART;01,JOEROGAN",
    "4": "00,JOEROGAN;01,KEVINHART",
    "5": "00,JOEROGAN;01,KEVINHART",
    "6": "00,KEVINHART;01,JOEROGAN",
    "7": "00,KEVINHART;01,JOEROGAN",
    "8": "00,KEVINHART;01,JOEROGAN",
    "9": "01,KEVINHART;02,JOEROGAN",
    "10": "00,KEVINHART;01,JOEROGAN",
    "11": "00,KEVINHART;01,JOEROGAN",
    "12": "00,KEVINHART;01,JOEROGAN"
}

In [79]:
replacer_dict = {}

to_replace_dict = {
    "1": "00,ANDREWHUBERMAN;01,LEXFRIDMAN",
    "2": "00,ANDREWHUBERMAN;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "4": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "5": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "6": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "7": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "8": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "9": "00,ANDREWHUBERMAN;01,LEXFRIDMAN",
    "10": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "11": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "12": "00,LEXFRIDMAN;01,ANDREWHUBERMAN",
    "13": "00,LEXFRIDMAN;01,ANDREWHUBERMAN"
}

replacer_dict["lexFridman-andrewHuberman-17082023"] = to_replace_dict

to_replace_dict = {
    "1": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "2": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,BENSHAPIRO",
    "4": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "5": "00,LEXFRIDMAN;01,BENSHAPIRO",
    "6": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "7": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "8": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "9": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "10": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "11": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "12": "00,BENSHAPIRO;01,LEXFRIDMAN",
    "13": "00,LEXFRIDMAN;01,BENSHAPIRO",
    "14": "00,LEXFRIDMAN;01,BENSHAPIRO",
    "15": "00,LEXFRIDMAN;01,BENSHAPIRO"
}

replacer_dict["lexFridman-benShapiro-07112022"] = to_replace_dict

to_replace_dict = {
    "1": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "2": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "3": "00,LEXFRIDMAN;01,GUIDOVANROSSUM;02,LEXFRIDMAN",
    "4": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "5": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "6": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "7": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "8": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "9": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "10": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "11": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "12": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "13": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "14": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "15": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "16": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "17": "00,GUIDOVANROSSUM;01,LEXFRIDMAN",
    "18": "00,LEXFRIDMAN;01,GUIDOVANROSSUM",
    "19": "00,LEXFRIDMAN;01,GUIDOVANROSSUM"
}

replacer_dict["lexFridman-guidoVanRossum-26112022"] = to_replace_dict

to_replace_dict = {
    "1": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "2": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,LEXFRIDMAN;02,GEORGEHOTZ;03,GEORGEHOTZ;04,GEORGEHOTZ",
    "4": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "5": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "6": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "7": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "8": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "9": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "10": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "11": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "12": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "13": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "14": "00,LEXFRIDMAN;01,GEORGEHOTZ",
    "15": "00,GEORGEHOTZ;01,GEORGEHOTZ;02,LEXFRIDMAN",
    "16": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "17": "00,LEXFRIDMAN;01,GEORGEHOTZ;02,GEORGEHOTZ",
    "18": "00,GEORGEHOTZ;01,LEXFRIDMAN",
    "19": "00,LEXFRIDMAN;01,GEORGEHOTZ"
}

replacer_dict["lexFridman-georgeHotz-30062023"] = to_replace_dict

to_replace_dict = {
    "1": "00,LEXFRIDMAN;01,KANYEWEST",
    "2": "00,KANYEWEST;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,KANYEWEST",
    "4": "00,LEXFRIDMAN;01,KANYEWEST",
    "5": "00,KANYEWEST;01,LEXFRIDMAN",
    "6": "00,LEXFRIDMAN;01,KANYEWEST",
    "7": "00,KANYEWEST;01,LEXFRIDMAN",
    "8": "00,KANYEWEST;01,LEXFRIDMAN",
    "9": "00,KANYEWEST;01,LEXFRIDMAN",
    "10": "00,LEXFRIDMAN;01,KANYEWEST",
    "11": "00,LEXFRIDMAN;01,KANYEWEST",
    "12": "00,LEXFRIDMAN;01,KANYEWEST",
    "13": "00,KANYEWEST;01,LEXFRIDMAN",
    "14": "00,LEXFRIDMAN;01,KANYEWEST",
    "15": "00,KANYEWEST;01,LEXFRIDMAN"
}

replacer_dict["lexFridman-kanyeWest-24102022"] = to_replace_dict

to_replace_dict = {
    "1": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "2": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "4": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "5": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "6": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "7": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "8": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "9": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "10": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "11": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "12": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "13": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "14": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "15": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "16": "00,LEXFRIDMAN;01,MARKZUCKERBERG"
}

replacer_dict["lexFridman-markZuckerberg-09062023"] = to_replace_dict

to_replace_dict = {
    "1": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "2": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,MARKZUCKERBERG",
    "4": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "5": "00,MARKZUCKERBERG;01,LEXFRIDMAN",
    "6": "00,MARKZUCKERBERG;01,LEXFRIDMAN"
}

replacer_dict["lexFridman-markZuckerberg-28092023"] = to_replace_dict

to_replace_dict = {
    "1": "00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN",
    "2": "00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN",
    "3": "00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN",
    "4": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "5": "00,MATTHEWMCCOUNAGHEY;01,LEXFRIDMAN",
    "6": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "7": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "8": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "9": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "10": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "11": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "12": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "13": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY",
    "14": "00,LEXFRIDMAN;01,MATTHEWMCCOUNAGHEY"
}

replacer_dict["lexFridman-matthewMcConaughey-13062023"] = to_replace_dict

to_replace_dict = {
    "1": "00,MRBEAST;01,LEXFRIDMAN",
    "2": "00,MRBEAST;01,LEXFRIDMAN",
    "3": "00,LEXFRIDMAN;01,MRBEAST",
    "4": "00,MRBEAST;01,LEXFRIDMAN",
    "5": "00,MRBEAST;01,LEXFRIDMAN",
    "6": "00,MRBEAST;01,LEXFRIDMAN",
    "7": "00,LEXFRIDMAN;01,MRBEAST",
    "8": "00,MRBEAST;01,LEXFRIDMAN",
    "9": "00,MRBEAST;01,LEXFRIDMAN",
    "10": "00,MRBEAST;01,LEXFRIDMAN",
    "11": "00,LEXFRIDMAN;01,MRBEAST",
    "12": "00,LEXFRIDMAN;01,MRBEAST",
    "13": "00,LEXFRIDMAN;01,MRBEAST",
    "14": "00,LEXFRIDMAN;01,MRBEAST"
}

replacer_dict["lexFridman-mrBeast-11012023"] = to_replace_dict

In [80]:
#process_joerogan_kevinhart(to_replace_dict)
process_lexFridman(replacer_dict)

diarized_audio/raw_diarizations/lexFridman-guidoVanRossum-26112022
{'1': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '2': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '3': '00,LEXFRIDMAN;01,GUIDOVANROSSUM;02,LEXFRIDMAN', '4': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '5': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '6': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '7': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '8': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '9': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '10': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '11': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '12': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '13': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '14': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '15': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '16': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '17': '00,GUIDOVANROSSUM;01,LEXFRIDMAN', '18': '00,LEXFRIDMAN;01,GUIDOVANROSSUM', '19': '00,LEXFRIDMAN;01,GUIDOVANROSSUM'}
diarized_audio/raw_diarizations/lexFridman-markZuckerberg-09062023
{'1': '00,LEXFRIDMAN;01,MARKZUCKERBERG', '2': '00,MARKZUCKERBERG;01,LEXFRIDMAN', 

### Now we get what a speaker has said in text, linking the diarizations with the transcriptions

In [83]:
import pandas as pd

# Function to find the closest time in df2 to a given time in df1
def find_closest_start_time(df, given_time):
    # Calculate absolute time differences
    
    time_diff = (df['start'] - given_time).abs()
    
    # Find the index of the minimum difference
    closest_index = time_diff.idxmin()
    return closest_index

def find_closest_end_time(df, given_time):
    # Calculate absolute time differences
    
    time_diff = (df['end'] - given_time).abs()
    
    # Find the index of the minimum difference
    closest_index = time_diff.idxmin()
    return closest_index

def add_text_to_diarization(diarization_df, transcriptions_df, output_name):

    # New column for the combined text
    diarization_df['text'] = ''

    for index, row in diarization_df.iterrows():
        # Find closest start and end times in df2
        
        closest_start_index = find_closest_start_time(transcriptions_df, row['start'])
        closest_end_index = find_closest_end_time(transcriptions_df, row['end'])

        # Extract all rows in-between these indices
        if closest_start_index <= closest_end_index:
            relevant_text = transcriptions_df.loc[closest_start_index:closest_end_index, 'text']
        else:
            relevant_text = transcriptions_df.loc[closest_end_index:closest_start_index, 'text']

        # Combine the text and add to dataframe
        combined_text = "".join(relevant_text)
        diarization_df.at[index, 'text'] = combined_text

    # dataframe now contains the combined text in the new 'text' column
    diarization_df.to_csv(os.path.join("datasets", output_name + ".csv"), header=True, index=False)

    return diarization_df


In [76]:
from nltk.tokenize import TweetTokenizer
from nltk import sent_tokenize

tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

#### Joe Rogan & Kevin Hart

In [65]:
diarization = pd.read_csv("diarized_audio/raw_diarizations/JRE-kevinHart-25052020/speaker_turns.csv")
transcription = pd.read_csv("transcribed_audio/JRE-kevinHart-25052020_transcribed.csv")

In [66]:
transcription['start'] = pd.to_timedelta(transcription['start'], unit='s')
transcription['end'] = pd.to_timedelta(transcription['end'], unit='s')

In [67]:
diarization['start'] = pd.to_timedelta(diarization['start'])
diarization['end'] = pd.to_timedelta(diarization['end'])

In [68]:
all_transcriptions = "".join(transcription["text"])
print("Number of words:", len(tweet_tokenizer.tokenize(all_transcriptions)))
print("Number of sentences:", len(sent_tokenize(all_transcriptions)))

Number of words: 25477
Number of sentences: 2325


In [69]:
final_df = add_text_to_diarization(diarization, transcription, 'JRE_kevinHart_25052020')

In [70]:
speaked = final_df.groupby("speaker")['text'].apply(lambda x: ''.join(x))
speaked

speaker
JOEROGAN      You come in here moving in shaking, man. You ...
KEVINHART     You're always moving. I mean, is there anythi...
Name: text, dtype: object

#### All Lex Fridman Podcasts

In [84]:
# Get lex fridman podcasts
diarizations = "diarized_audio/raw_diarizations"
directories = [
    directory 
    for directory in os.listdir(diarizations) 
    if bool(re.match(r"^lexFridman", directory))
]

for podcast_name in directories:
    directory = os.path.join(diarizations, podcast_name)

    diarization = pd.read_csv(f"diarized_audio/raw_diarizations/{podcast_name}/speaker_turns.csv")
    transcription = pd.read_csv(f"transcribed_audio/{podcast_name}_transcribed.csv")

    # Do a bit of data cleaning
    transcription['start'] = pd.to_timedelta(transcription['start'], unit='s')
    transcription['end'] = pd.to_timedelta(transcription['end'], unit='s')

    diarization['start'] = pd.to_timedelta(diarization['start'])
    diarization['end'] = pd.to_timedelta(diarization['end'])

    all_transcriptions = "".join(transcription["text"])

    final_df = add_text_to_diarization(diarization, transcription, podcast_name)
    speaked = final_df.groupby("speaker")['text'].apply(lambda x: ''.join(x))
    print("Number of words:", len(tweet_tokenizer.tokenize(all_transcriptions)))
    print("Number of sentences:", len(sent_tokenize(all_transcriptions)))
    print(speaked)

Number of words: 30192
Number of sentences: 1642
speaker
GUIDOVANROSSUM     of the new 4.0? Given the amount of pain and ...
LEXFRIDMAN         Can you imagine possible features that Python...
Name: text, dtype: object
Number of words: 31351
Number of sentences: 1443
speaker
LEXFRIDMAN         The following is a conversation with Mark Zuc...
MARKZUCKERBERG     that experience like? Oh, it's fun. I know. Y...
Name: text, dtype: object
Number of words: 21661
Number of sentences: 643
speaker
LEXFRIDMAN             The following is a conversation with Matthew ...
MATTHEWMCCOUNAGHEY     If you really want to give a character an obs...
Name: text, dtype: object
Number of words: 27534
Number of sentences: 1724
speaker
ANDREWHUBERMAN     Listen, when it comes to romantic relationshi...
LEXFRIDMAN         The following is a conversation with my dear ...
Name: text, dtype: object
Number of words: 26227
Number of sentences: 2085
speaker
KANYEWEST      Based off of our connection and just you bein

Create lexFridman - others

In [122]:
lexfridman_podcasts = []
for podcast_name in directories:
    podcast = pd.read_csv(f"datasets/{podcast_name}.csv")
    #print(podcast.shape[0])
    lexfridman_podcasts.append(podcast)

lexfridman_podcasts_df = pd.concat(lexfridman_podcasts)
lexfridman_podcasts_df['speaker'] = lexfridman_podcasts_df['speaker'].map(lambda x: 'RESTO' if x != 'LEXFRIDMAN' else 'LEXFRIDMAN')
lexfridman_podcasts_df.to_csv(os.path.join("datasets", "all_lexFridman_resto.csv"), header=True, index=False)