# Off menu restaurants processing

This notebook is a development environment for the logic that processes the restaurants-and-mentions HTML and produces a DataFrame ready to integrate with the other episode metadata. The final production code is located in `off_menu/data_processing.py`.

In [None]:
# Ensure imports can find my utils:

import sys
import os

# The project root sits two directory levels above the notebook's working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir, os.pardir))

# Put the project root at the front of the import path, but only once,
# so re-running this cell does not add duplicate entries.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

print(f"Project root added to sys.path: {project_root}")
print(f"Current sys.path: {sys.path}")

# Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Project specific imports
from off_menu.config import episodes_list_url

Project root added to sys.path: c:\Users\jbara\Data science projects (store here not desktop on onedrive)\Off Menu project
Current sys.path: ['c:\\Users\\jbara\\Data science projects (store here not desktop on onedrive)\\Off Menu project', 'C:\\Users\\jbara\\miniconda3\\python312.zip', 'C:\\Users\\jbara\\miniconda3\\DLLs', 'C:\\Users\\jbara\\miniconda3\\Lib', 'C:\\Users\\jbara\\miniconda3', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv', '', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\jbara\\OneDrive\\Desktop\\Data_science\\Python projects\\Off Menu project\\.venv\\Lib\\site-packages\\Pythonwin']


## Process restaurants html into a dict

This section loads the HTML of the "restaurants mentioned" section of the Off Menu website and processes it into a dict with restaurant names as keys and, as values, the list of guests who mentioned each restaurant.

In [6]:
# (helper) Function to produce restaurant name and mentions from the text of the li element
def clean_res(res_element):
    """Split one restaurant list item into its name and the guests who mentioned it.

    The item text is expected to look like ``"Name (Guest A, Guest B)"``.

    Args:
        res_element: Any object exposing the item text through a ``.text``
            attribute (for example a BeautifulSoup ``li`` tag).

    Returns:
        A ``(res_name, res_mentions)`` tuple. ``res_name`` is the stripped text
        before the first "(". ``res_mentions`` is the list of guest names found
        after it, each stripped of surrounding whitespace, with empty entries
        dropped. ``res_mentions`` is an empty list when the text has no "(".
    """
    parts = res_element.text.split("(")
    res_name = parts[0].strip()

    # No parenthesised guest list: report "no mentions" instead of raising IndexError.
    if len(parts) < 2:
        return (res_name, [])

    raw_mentions = parts[1].strip().strip(")").split(",")
    # "A, B".split(",") yields " B"; strip each name so it can be matched exactly
    # against guest names from other sources, and skip blanks left by stray commas.
    res_mentions = [guest.strip() for guest in raw_mentions if guest.strip()]
    return (res_name, res_mentions)

# Function to produce restaurants_by_res_name_dict
def create_restaurants_by_res_name_dict(html_string: str) -> dict:
    """Build a mapping of restaurant name -> guests who mentioned it.

    Every ``li`` element in the HTML whose text contains "(" is parsed with
    ``clean_res``; items without "(" are skipped.

    Args:
        html_string: HTML markup containing the restaurant list items.

    Returns:
        A dict keyed by restaurant name. Each value is the list of guest names
        for that restaurant, in first-seen order. If the same restaurant name
        appears in more than one list item, the guests are merged into one
        list (without repeats) rather than the later item replacing the
        earlier one.
    """
    res_site_html = BeautifulSoup(html_string, features="html.parser")
    restaurants_by_res_name = {}
    for item in res_site_html.find_all("li"):
        if "(" not in item.text:
            continue
        # Parse each item once and unpack, instead of calling clean_res twice.
        res_name, res_mentions = clean_res(item)
        known_guests = restaurants_by_res_name.setdefault(res_name, [])
        for guest in res_mentions:
            if guest not in known_guests:
                known_guests.append(guest)
    return restaurants_by_res_name

# Creating restaurants_by_res_name dict from restaurants_site_html

test_filepath = os.path.join(project_root, 'data/test_temp/restaurants_site.html')

# Only the file read is guarded, so the handler cannot mask errors from parsing.
try:
    with open(test_filepath, 'r', encoding='utf-8') as html:
        html_text = html.read()
except FileNotFoundError:
    print(f"Error: The file was not found at {test_filepath}. Did it save correctly?")
    # Stop the cell here. Carrying on would either raise a confusing NameError on
    # the preview below (fresh kernel) or silently preview a dict left over from
    # an earlier run.
    raise

restaurants_by_res_name_dict = create_restaurants_by_res_name_dict(html_text)

# Preview the first ten (restaurant, guests) pairs.
print(list(restaurants_by_res_name_dict.items())[:10])

[('Red Chilli', ['Sophie Duker']), ('Barbacoa El Primo', ['Finn Wolfhard']), ('La Taberna Del Gourmet', ['Rhod Gilbert']), ('Ron Gastrobar', ['James Acaster']), ('Disneyland', ['Saoirse-Monica Jackson']), ('Frites Atelier', ['Joe Lycett']), ('Louisiana Bistreaux', ['Killer Mike']), ('Spondivits', ['Killer Mike']), ('Old Lady Gang', ['Killer Mike']), ('Mary Mac’s', ['Killer Mike'])]


## Convert the dict into a dataframe

Converting the dict into an exploded DataFrame with one row per restaurant–guest mention, so that it can be combined easily with the episode metadata.

In [7]:
# Step 1: one row per restaurant.
# dict.items() is materialised as a list of (name, guests) tuples; pandas reads
# each tuple as a row, so the two column names can be supplied directly.
mentions_raw_data = list(restaurants_by_res_name_dict.items())
mentions_raw_df = pd.DataFrame(mentions_raw_data, columns=['restaurant_name', 'guests_mentioned'])
print("Corrected mentions DataFrame (first 5 rows):\n", mentions_raw_df.head())

# Steps 2-3: one row per (restaurant, guest) pair.
# explode() unpacks each guest list into separate rows; the column is then renamed
# to 'guest_name' to match the column in episodes_df.
restaurant_guest_df = (
    mentions_raw_df
    .explode('guests_mentioned')
    .rename(columns={'guests_mentioned': 'guest_name'})
)
print(restaurant_guest_df.head())

# Step 4: persist the result so it can be combined with the episodes data.
test_temp_dir = os.path.join(project_root, 'data/test_temp')
test_processed_filepath = os.path.join(test_temp_dir, 'restaurant_mentions_test.parquet')
restaurant_guest_df.to_parquet(test_processed_filepath, index=False)

Corrected mentions DataFrame (first 5 rows):
           restaurant_name          guests_mentioned
0              Red Chilli            [Sophie Duker]
1       Barbacoa El Primo           [Finn Wolfhard]
2  La Taberna Del Gourmet            [Rhod Gilbert]
3           Ron Gastrobar           [James Acaster]
4              Disneyland  [Saoirse-Monica Jackson]
          restaurant_name              guest_name
0              Red Chilli            Sophie Duker
1       Barbacoa El Primo           Finn Wolfhard
2  La Taberna Del Gourmet            Rhod Gilbert
3           Ron Gastrobar           James Acaster
4              Disneyland  Saoirse-Monica Jackson
