# Off menu episodes processing

This notebook serves as a development environment for the logic that processes the episode-list HTML from the episodes section of the Off Menu website into usable episode metadata. The final production code is located in off_menu/data_processing.py.

In [None]:
# Ensure imports can find my utils:

import sys
import os

# The project root is taken to be two directory levels above the notebook's
# current working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(
    os.path.join(notebook_dir, os.pardir, os.pardir)
)

# Put the project root at the front of the import search path, but only once,
# so that re-running this cell does not add duplicate entries.
if not (project_root in sys.path):
    sys.path.insert(0, project_root)

print(f"Project root added to sys.path: {project_root}")
print(f"Current sys.path: {sys.path}")

# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Project specific imports
from off_menu.config import episodes_list_url
from off_menu.config import transcript_base_url

## Configuration and filepaths

In [None]:
# Directory that holds every file this notebook reads or writes. It is built
# from separate path components so the separator is always the platform's own.
test_temp_dir = os.path.join(project_root, 'data', 'test_temp')

# HTML file that the cells below read and parse.
episodes_html_filepath = os.path.join(test_temp_dir, 'episodes.html')
test_filepath = episodes_html_filepath

# Parquet outputs written by the cells below.
test_processed_eps_filepath = os.path.join(test_temp_dir, 'episodes_metadata_test.parquet')
test_meta_and_mentions_filepath = os.path.join(test_temp_dir, 'ep_meta_and_mentions.parquet')

## Helper function to check if text contains numbers (to rule out special episodes, with no numbers)

In [None]:
def _num_check(text):
    """This function checks is a string contains digits.

    This string checks if a string contains digits, this can be used to determine if an episode has a valid number
    or not (e.g. if it's a special episode).

    Args:
        text (str): The text you want to check

    Returns:
        bool:
            - False if the argument is not text, or if it doesn't meet criteria below
            - True if the arg is text and the second character is a digit, or if the first char is '-' and second digit
    """
    if not text:
        return False
    if text[0] == "-":
        return text[1:].isdigit()
    return text.isdigit()



## Helper function to identify the index where a number ends in text
Some episodes don't have a colon separating the number from the title; this helper finds where the number ends so they can still be split into name and number.

In [None]:
def _find_num_end(tag):
    """
    This function takes a bs4.Tag (div) element, converts to text, and if it contains a digit, and another element after the digit that is a non digit, it returns this index.

    Args:
        div (bs4.Tag): The text you wish to scan for the index where a number ends

    Returns:
        Optional: returns the index, otherwise None
    """
    text = tag.text
    counter = 0
    while counter < len(text) - 1:
        if text[counter].isdigit() and not text[counter + 1].isdigit():
            return counter + 1
        else:
            counter += 1
            continue

## Helper function to split episode bs4.Tag elements into their name and numbers
Works if they're in standard form; if not, the string "not in standard form" is returned.

In [None]:
def _name_num_split(episode):
    """
    This function takes an episode div element, and returns a name and number for episodes in standard format.

    The standard format is "ep number" followed by the name. _find_num_end is used to find the
    index where the number ends; the text before that index must have a valid number
    as its second whitespace-separated word, and the text after it becomes the name.

    Args:
        episode (bs4.Tag element): The episode HTML element (div)

    Returns:
        Optional: returns the name and number (str, str) or "not in standard form" (str)
    """
    text = episode.text
    break_point = _find_num_end(episode)
    # When no break point is found (None) both slices are the whole text,
    # because slicing with None is unbounded.
    prefix, remainder = text[:break_point], text[break_point:]

    # In "ep number" the number is the second word. Texts with fewer than two
    # words (empty or one-word titles) cannot be in standard form.
    prefix_words = prefix.split()
    if len(prefix_words) < 2:
        return "not in standard form"
    number = prefix_words[1]
    # Deal with "best of" episodes or episodes without numbers (not included)
    if not _num_check(number):
        return "not in standard form"

    # The name is the text after the number, without any "(...)" suffix,
    # the separating colon, or surrounding whitespace.
    name = remainder.split("(")[0].strip(":").strip()
    return (name, number)

## Function to create numbers / names dict

In [None]:
def create_epnumber_epname_dict(html_string: str) -> dict:
    """Build a mapping of episode number to episode name from episodes HTML.

    Every ``div`` with the class ``image-slide-title`` is passed to
    ``_name_num_split``. Elements that are not in standard form are skipped.

    Args:
        html_string (str): The HTML to parse.

    Returns:
        dict: Episode numbers (int) mapped to names (str). If the same number
            occurs more than once, the last occurrence wins.
    """
    episodes_site_html = BeautifulSoup(html_string, features="html.parser")
    numbers_and_names = {}
    for item in episodes_site_html.find_all("div", class_="image-slide-title"):
        parsed = _name_num_split(item)
        # Non-standard episodes come back as a plain message string rather
        # than a (name, number) tuple; leave them out.
        if not isinstance(parsed, tuple):
            continue
        name, number = parsed
        numbers_and_names[int(number)] = name
    return numbers_and_names

## Testing function to create numbers / names dict

In [None]:
test_filepath = os.path.join(project_root, 'data/test_temp/episodes.html')
try:
    # Read the saved episodes page; a missing file is reported below.
    with open(test_filepath, mode='r', encoding='utf-8') as html:
        html_text = html.read()
except FileNotFoundError:
    print(f"Error: The file was not found at {test_filepath}. Did it save correctly?")
else:
    # The file was read: build the mapping and show the entry for episode 1.
    numbers_names_dict = create_epnumber_epname_dict(html_text)
    print(numbers_names_dict[1])

## Creating a dataframe of the numbers / names dict

### Create dict using test data

In [None]:
try:
    # Read the saved episodes page; a missing file is reported below.
    with open(test_filepath, mode='r', encoding='utf-8') as html:
        html_text = html.read()
except FileNotFoundError:
    print(f"Error: The file was not found at {test_filepath}. Did it save correctly?")
else:
    # The file was read: build the mapping used by the DataFrame cells below.
    numbers_names_dict = create_epnumber_epname_dict(html_text)
    print(f"First episode name: {numbers_names_dict[1]}")

### Create the dataframe, with dict keys as index, ep names as column

In [None]:
# Build the table from the number -> name mapping: the dict keys start out as
# the index (named 'episode_number') and the dict values fill 'guest_name';
# reset_index() then moves the keys into a regular column.
df_episodes_metadata = (
    pd.DataFrame.from_dict(numbers_names_dict, orient='index', columns=['guest_name'])
    .rename_axis('episode_number')
    .reset_index()
)

print("DataFrame created successfully. Here's its head:")
print(df_episodes_metadata.head())

print("\nDataFrame information:")
df_episodes_metadata.info()

### Saving the dataframe

In [None]:
# Write the episode metadata table to Parquet; index=False leaves the row index out of the file.
df_episodes_metadata.to_parquet(test_processed_eps_filepath, index=False)

###  Function to create URL (using data frame) and merge with names and numbers

In [None]:
# Create test data
# Three sample rows with the same two columns as the episode metadata table.
data = dict(
    episode_number=[1, 2, 294],
    guest_name=['James Acaster', 'Ed Gamble', 'Carey Mulligan'],
)
df = pd.DataFrame(data)

# Function
def create_url_from_row(row):
    """Build the transcript URL for one row of episode metadata.

    The URL is ``transcript_base_url`` followed by ``ep-<episode_number>`` and
    then the lower-cased first two words of the guest name, all joined with
    hyphens. A one-word name contributes one word (no trailing hyphen) and an
    empty name contributes nothing.

    Args:
        row: A mapping or DataFrame row with 'episode_number' and 'guest_name'.

    Returns:
        str: The URL for the episode.
    """
    num = row['episode_number']
    # Split once; only the first two words of the name are used in the URL.
    name_parts = [part.lower() for part in row['guest_name'].split()[:2]]
    return "-".join([f"{transcript_base_url}ep-{num}", *name_parts])

# Test function
# Build the URL for every sample row (axis=1 passes each row to the function).
df['url'] = df.apply(create_url_from_row, axis=1)

# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print(), which would also print "None".
df.info()

### Adding URL column to existing metadata

In [None]:
# Apply the URL builder to each row (axis=1) and store the results in a new 'url' column.
df_episodes_metadata['url'] = df_episodes_metadata.apply(create_url_from_row, axis=1)

print(df_episodes_metadata.head())

## Adding restaurant mentions to existing dataframe

### Retrieving the restaurant mentions dataframe from the test directory where it was saved

In [None]:
test_temp_dir = os.path.join(project_root, 'data', 'test_temp')
restaurant_mentions_test_filepath = os.path.join(test_temp_dir, 'restaurant_mentions_test.parquet')
try:
    restaurant_mentions_df = pd.read_parquet(restaurant_mentions_test_filepath)
except FileNotFoundError:
    print(f"Error: The file was not found at {restaurant_mentions_test_filepath}. Did it save correctly?")
else:
    # Only preview the table when it was actually loaded; after a failed read
    # the name would be unassigned and the preview would raise NameError.
    print(restaurant_mentions_df.head())

### Merging with existing numbers, names, urls dataframe

In [None]:
# Left join on guest name: every episode row is kept, whether or not it has
# matching restaurant mentions.
merged_df = df_episodes_metadata.merge(
    restaurant_mentions_df,
    how='left',
    on='guest_name',
)
print(merged_df.head())

### Aggregating rows so we have one row per episode, with a list of restaurant mentions

In [None]:
# Collapse the merged table to one row per (episode_number, guest_name, url).
# The named aggregation already calls its output column
# 'restaurants_mentioned', so no rename is needed afterwards. Missing
# restaurant names are dropped, so an episode with none gets an empty list.
ep_meta_and_mentions_df = merged_df.groupby(
    ['episode_number', 'guest_name', 'url'], as_index=False
).agg(
    restaurants_mentioned=('restaurant_name', lambda names: list(names.dropna()))
)

print("---Final (ep_meta_and_mentions) dataframe head---")
print(ep_meta_and_mentions_df.head())

### Saving the full dataframe, and the head for testing

In [None]:
# Save the full aggregated dataframe as Parquet (index=False leaves the row index out of the file)

ep_meta_and_mentions_df.to_parquet(test_meta_and_mentions_filepath, index=False)

# Save just the first rows (as returned by head()) to a separate, smaller file for testing purposes
head_filepath = os.path.join(test_temp_dir, 'ep_meta_and_mentions_head.parquet')
ep_meta_and_mentions_head = ep_meta_and_mentions_df.head()
ep_meta_and_mentions_head.to_parquet(head_filepath, index=False)