# Off menu transcripts and timestamps processing

This notebook serves as a development environment for the logic to process the transcripts and generate timestamps from them. The final production code is located in off_menu/data_processing.py.

In [None]:
# Ensure imports can find my utils:

import sys
import os

# os.getcwd() is the kernel's current working directory, so the path arithmetic
# below depends on where the notebook kernel was started.
notebook_dir = os.getcwd()
# The project root is taken to be two directory levels above the working directory.
project_root = os.path.abspath(os.path.join(notebook_dir, '..', '..'))

# Insert at position 0 so the project root is searched before the other sys.path
# entries; the membership check keeps re-running this cell from adding duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Diagnostic output so the resolved root and the full search path can be inspected.
print(f"Project root added to sys.path: {project_root}")
print(f"Current sys.path: {sys.path}")

# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests 
import re 
from typing import List
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

# Project specific imports
from off_menu.utils import Episode
from off_menu.utils import clean_text
from off_menu.utils import num_check
from off_menu.utils import find_num_end
from off_menu.utils import name_num_split
from off_menu.utils import clean_res
from off_menu.utils import get_episode_sentences
from off_menu.utils import create_sentence_list

Project root added to sys.path: c:\Users\jbara\OneDrive\Desktop\Data_science\Python projects\Off Menu project
Current sys.path: ['c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project', 'C:\\Users\\jbara\\miniconda3\\python312.zip', 'C:\\Users\\jbara\\miniconda3\\DLLs', 'C:\\Users\\jbara\\miniconda3\\Lib', 'C:\\Users\\jbara\\miniconda3', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv', '', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\Pythonwin']


## Functions to get transcript string from html

In [None]:
#  Function to access sentences bs4 elements, which will be converted into cleaned text later
def get_episode_sentences(html):
    """
    Parse the HTML of an Off Menu episode transcript page and collect its sentence divs.

    Parameters:
        html: HTML markup of the episode transcript page (e.g. a page saved from podscripts.co).

    Returns:
        The result of BeautifulSoup's find_all: every div element whose class is
        'single-sentence'. These are bs4 elements, not plain strings.
    """
    parsed_page = BeautifulSoup(html, features="html.parser")
    return parsed_page.find_all("div", class_="single-sentence")


#  Function to take in sentence bs4 elements and return a cleaned string transcript
def clean_text(sentences):
    """
    Flatten a collection of sentence elements into one lowercase, single-line string.

    For each element, its ``.text`` is lowercased and split into lines; every line is
    stripped of leading/trailing whitespace, blank lines are discarded, and the rest are
    joined with single spaces. Elements that end up empty are skipped, and the remaining
    pieces are joined with single spaces. Punctuation and whitespace inside a line are
    left untouched.

    Args:
        sentences: iterable of objects exposing a ``.text`` string (e.g. the bs4 elements
            returned by find_all).

    Returns:
        str: the cleaned transcript text ("" when nothing remains).
    """
    flattened_sections = []
    for section in sentences:
        # Lazily strip every line of the lowercased text.
        stripped_lines = (raw_line.strip() for raw_line in section.text.lower().splitlines())
        # Keep only non-empty lines (an empty string is falsy) and collapse them onto one line.
        section_text = " ".join(piece for piece in stripped_lines if piece)
        if section_text:
            flattened_sections.append(section_text)
    return " ".join(flattened_sections)


starting point is 00:00:00 hello, listeners of the off menu podcast. it is ed gamble here from the o


## Testing transcript production

In [None]:

# Load the saved HTML for episode 1 and build its cleaned transcript string.
test_filepath = os.path.join(project_root, 'data/test_temp/ep_1.html')
try:
    with open(test_filepath, 'r', encoding='utf-8') as html_file:
        html_text = html_file.read()
except FileNotFoundError:
    print(f"Error: The file was not found at {test_filepath}. Did it save correctly?")
    # Re-raise after the hint: the code below and later cells need transcript_str, so
    # continuing would fail with a NameError or silently reuse a stale value from the kernel.
    raise

# Parsing happens outside the try block so only the file read is guarded.
transcript_str = clean_text(get_episode_sentences(html_text))

print(transcript_str[:100])

## Collating timestamps from transcript

In [None]:
# Helper function which contains the actual collation logic

def _extract_timestamps_as_list_of_dicts(transcript, ep_num):
    """
    Given transcript (str), and ep_num, returns a list of dicts containing all timestamps in the transcript. 
    Each dict contains episode number, timestamp, and start_index.
    """
    timestamp_pattern = re.compile(r"starting point is (\d{2}:\d{2}:\d{2})")
    all_timestamps_in_transcript = []
    for match in timestamp_pattern.finditer(transcript):
        # Get the captured timestamp string (e.g., "00:00:05")
        actual_time_string = match.group(
            1
        )  
        # We use group(1) because that's our (HH:MM:SS) part, group(0) refers to the whole string by default

        # Get the starting index of the entire match
        start_position_in_text = match.start()
        # Store this as a dict with episode_number as key
        stamp_dict = {
        'episode_number': ep_num,
        'timestamp': actual_time_string,
        'start_index': start_position_in_text
        }
        # Store this extracted data (the timestamp string and its position)
        all_timestamps_in_transcript.append(stamp_dict)
    return all_timestamps_in_transcript

# Testing function using transcript_str from the "Testing transcript production" cell:
# extract the timestamps for episode 1 and show the first two entries.

timestamps_dict_list = _extract_timestamps_as_list_of_dicts(transcript_str, 1)
print(timestamps_dict_list[:2])

[{'episode_number': 1, 'timestamp': '00:00:00', 'start_index': 0}, {'episode_number': 1, 'timestamp': '00:00:43', 'start_index': 729}]


## Creating a dataframe from the timestamps

In [None]:


# Each timestamp dict becomes one row; the dict keys become the columns.
timestamps_test_df = pd.DataFrame(timestamps_dict_list)
# Bare last expression so the notebook renders the frame.
timestamps_test_df

Unnamed: 0,episode_number,timestamp,start_index
0,1,00:00:00,0
1,1,00:00:43,729
2,1,00:01:45,1473
3,1,00:02:36,2284
4,1,00:03:13,3064
...,...,...,...
165,1,01:06:08,71654
166,1,01:06:24,71973
167,1,01:07:02,72331
168,1,01:07:21,72689


## Script to apply timestamp collation to each episode and store in dataframe

In [None]:
# Load ep_and_mentions data (ep num, name, url etc.)
test_temp_dir = os.path.join(project_root, 'data/test_temp')
ep_and_mentions_filepath = os.path.join(test_temp_dir, 'ep_and_mentions.parquet')
ep_and_mentions = pd.read_parquet(ep_and_mentions_filepath)

print("---Ep and mentions dataframe head---")
print(ep_and_mentions.head())

# Save a small version for testing (required for testing transcript extraction logic).
# to_parquet needs a file path: passing the directory itself fails with a PermissionError.
# NOTE(review): the file name below is chosen here; rename it if the tests expect another.
five_row_test_ep_meta_and_mentions = ep_and_mentions.head()
five_row_test_filepath = os.path.join(test_temp_dir, 'five_row_test_ep_meta_and_mentions.parquet')
five_row_test_ep_meta_and_mentions.to_parquet(five_row_test_filepath, index=False)

# Flat list of timestamp dicts across all processed episodes
test_timestamp_list = []

# Iterate through test episodes (html already stored via earlier tests)
for _, row in ep_and_mentions.head(2).iterrows():
    episode_num = row['episode_number']
    filepath = os.path.join(project_root, f'data/test_temp/ep_{episode_num}.html')
    try:
        # Read this episode's own file (not the fixed episode-1 test path).
        with open(filepath, 'r', encoding='utf-8') as html_file:
            episode_html = html_file.read()
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}. Did it save correctly?")
        # Skip the episode: carrying on would attribute another episode's transcript to it.
        continue
    episode_transcript = clean_text(get_episode_sentences(episode_html))
    episode_timestamps = _extract_timestamps_as_list_of_dicts(episode_transcript, episode_num)
    # extend (not append) keeps one flat list of dicts, which is what pd.DataFrame expects
    test_timestamp_list.extend(episode_timestamps)

# Print first 2 dict entries from the single list containing all entries
print(test_timestamp_list[:2])

# Create the dataframe
test_timestamp_df = pd.DataFrame(test_timestamp_list)
test_timestamp_df


---Ep and mentions dataframe head---
   episode_number     guest_name  \
0               1  Scroobius Pip   
1               2     Grace Dent   
2               3  Richard Osman   
3               4     Nish Kumar   
4               5    Aisling Bea   

                                                 url  \
0  https://podscripts.co/podcasts/off-menu-with-e...   
1  https://podscripts.co/podcasts/off-menu-with-e...   
2  https://podscripts.co/podcasts/off-menu-with-e...   
3  https://podscripts.co/podcasts/off-menu-with-e...   
4  https://podscripts.co/podcasts/off-menu-with-e...   

                     restaurants_mentioned  
0                [Oli Baba’s, Kerb Camden]  
1                     [Little Owl, Trullo]  
2  [Five Guys, Cora Pearl, Berners Tavern]  
3       [Bademiya, The Owl & The Pussycat]  
4       [Café Gratitude, Burger & Lobster]  


PermissionError: [WinError 5] Failed to open local file 'c:/Users/jbara/OneDrive/Desktop/Data_science/Python projects/Off Menu project/data/test_temp'. Detail: [Windows error 5] Access is denied.


## Function to locate specific quote location using RegEx

In [None]:
def find_timestamp(quote, transcript, transcript_timestamps):
    """
    Return the timestamp of the last marker that starts at or before a quote.

    Args:
        quote (str): exact text to look for in ``transcript``.
        transcript (str): transcript text that the start indices refer to.
        transcript_timestamps (list[dict]): timestamp dicts ordered by ascending
            'start_index'. Each holds 'start_index' and the timestamp text under
            'timestamp' (the key written by _extract_timestamps_as_list_of_dicts)
            or under 'timestamp_str'.

    Returns:
        The timestamp text, or None when the quote does not occur in the transcript
        or no marker starts at or before it.
    """
    quote_loc = transcript.find(quote)
    if quote_loc == -1:
        # str.find signals "not found" with -1; there is no position to map to a timestamp.
        return None
    # Walk backwards so the first hit is the closest marker preceding the quote.
    for stamp in reversed(transcript_timestamps):
        if stamp["start_index"] <= quote_loc:
            # Accept either key so dicts using the older 'timestamp_str' name keep working.
            if "timestamp_str" in stamp:
                return stamp["timestamp_str"]
            return stamp["timestamp"]
    return None