# Tabelog Data Collection

In [2]:
import inspect
import urllib.request
import http.client
from bs4 import BeautifulSoup
import re
import pandas as pd
import tqdm
import time

## Tabelog Restaurant Data

In [3]:
class Tabelog:
    """
    Tabelog scraper for score, review count, daytime and nighttime price, photo count, like count, and bookmark count.
    Credit to https://qiita.com/toshiyuki_tsutsui/items/f143946944a428ed105b for setting the foundation of this scraper. ありがとうございます。ദ്ദി(｡•̀ ,<)~✩‧₊
    """
    def __init__(self, base_url, test_mode=False, p_area="東京都内", genre="ramen", begin_page=1, end_page=10):
        """
        Parameters for scraping
        :param base_url: The base url
        :param test_mode: Test mode or not
        :param p_area: Area of Japan
        :param genre: Genre of food, ramen is the default
        :param begin_page: Starting page, default is 1
        :param end_page: Ending page, default is 10
        """
        
        # Store / restaurant info
        self.store_id = ""
        self.store_id_num = 0
        self.store_name = ""
        self.score = 0
        self.p_area = p_area
        self.genre = genre
        
        # Price range variables
        self.daytime_price = ""
        self.daytime_price_low = None
        self.daytime_price_high = None
        self.nighttime_price = ""
        self.nighttime_price_low = None
        self.nighttime_price_high = None
        
        # Engagement counts
        self.review_count = 0
        self.photo_count = 0
        self.like_count = 0
        self.bookmark_count = 0
        self.columns = ["store_id", "store_name", "score", "area", "genre", "review_count", 
                        "daytime_price", "daytime_price_low", "daytime_price_high", "nighttime_price", "nighttime_price_low", "nighttime_price_high", "photo_count", "like_count", "bookmark_count"]
        self.df = pd.DataFrame(columns=self.columns)
        # \n = newline
        # \s = whitespace
        self.__regexcomp = re.compile(r"\n|\s")
        
        # Add a counter for successful stores
        self.successful_stores = 0
        # Loading bar progress
        # bar = tqdm.tqdm(total=end_page)
        
        # Starting page number
        page_num = begin_page
        
        if test_mode:
            print(f"Test Mode: Only processing up to 3 restaurants from page {begin_page}")
            # Sort by Tabelog rating score
            list_url = base_url + str(page_num) + "?select_sort_flg=1" # "/?Srt=D&SrtT=rt&sort_mode=1" 
            self.scrape_list(list_url, mode=test_mode)
            print(f"Test complete - processed {self.successful_stores} restaurants")
        else:
            print(f"Starting full scrape from page {begin_page} to {end_page}")
            while True:
                print(f"Processing page {page_num}")
                # Sort by Tabelog rating score
                list_url = base_url + str(page_num) + "?select_sort_flg=1" # "/?Srt=D&SrtT=rt&sort_mode=1" 
                if not self.scrape_list(list_url, mode=test_mode):
                    print(f"No more results found after page {page_num-1}")
                    break
                
                # Stop after reaching the end page
                if page_num >= end_page:
                    print(f"Reached specified end page {end_page}")
                    break
                page_num += 1
                # bar.update(1)
            
            print(f"Scraping complete, processed {self.successful_stores} stores / restaurants")
        return

    def scrape_list(self, list_url, mode):
        """
        Parse a whole restaurant list page
        """
        with urllib.request.urlopen(list_url) as r:
            content = r.read()
            status_code = r.status
        if status_code != http.client.OK:
            return False
        
        # Put all content into soup parser
        soup = BeautifulSoup(content, "html.parser")
        # Restaurant name list
        soup_a_list = soup.find_all("a", class_="list-rst__rst-name-target") 

        # If there's nothing, return
        if len(soup_a_list) == 0:
            return False

        if mode:
            # In test mode, only scrape the first 3 restaurants
            for soup_a in soup_a_list[:3]:
                 # Get individual restaurant page URL
                item_url = soup_a.get("href")
                self.store_id_num += 1
                self.scrape_item(item_url)
        else:
            # In normal mode, scrape all restaurants on the page (usually 20 entries)
            for soup_a in soup_a_list:
                # Get the individual restaurant page URL
                item_url = soup_a.get("href")
                self.store_id_num += 1
                self.scrape_item(item_url)

        return True

    def scrape_item(self, item_url):
        """
        Parse an individual restaurant page
        """
        # Request site content
        with urllib.request.urlopen(item_url) as r:
            content = r.read()
            status_code = r.status
        if status_code != http.client.OK:
            print(f"error: not found{ item_url }")
            return

        # Add delay to avoid hitting rate limits
        time.sleep(1)
        
        # Parse with soup
        soup = BeautifulSoup(content, "html.parser")
        
        # Get restaurant name
        store_name_tag = soup.find("h2", class_="display-name")
        if not store_name_tag or not store_name_tag.span:
            print(f"error: cannot find restaurant name at {item_url}")
            self.store_id_num -= 1
            return
            
        store_name = store_name_tag.span.string
        print("{}→店名：{}".format(self.store_id_num, store_name.strip()))
        self.store_name = store_name.strip()
        
        # Store header info
        store_head = soup.find("div", class_="rdheader-subinfo")
        if not store_head:
            print("Cannot find store header information, skipping")
            self.store_id_num -= 1
            return
            
        store_head_list = store_head.find_all("dl")
        if len(store_head_list) < 2:
            print("Store header information is incomplete, skipping")
            self.store_id_num -= 1
            return
            
        store_head_list = store_head_list[1].find_all("span")
        if not store_head_list:
            print("Cannot determine store type, skipping")
            self.store_id_num -= 1
            return
        
        # Check if it's a ramen or tsukemen restaurant
        # if store_head_list[0].text not in {"ラーメン", "つけ麺"}:
        #     print("Not a ramen or tsukemen restaurant, skipping")
        #     self.store_id_num -= 1
        #     return
        
        # Get rating score
        rating_score_tag = soup.find("b", class_="c-rating__val")
        if not rating_score_tag or not rating_score_tag.span:
            print("評価が見つかりません")
            self.store_id_num -= 1
            return
            
        rating_score = rating_score_tag.span.string
        print("評価点数：{}点".format(rating_score), end="")
        self.score = rating_score
        
        # Skip restaurants with no rating
        if rating_score == "-":
            print("  評価がないため処理対象外")
            self.store_id_num -= 1
            return
               
        # This code skips restaurants with rating below 3.5
        # Good for looking for good places
        # if float(rating_score) < 3.5:
        #     print("  食べログ評価が3.5未満のため処理対象外")
        #     self.store_id_num -= 1
        #     return
        
        # Get review count
        review_tag_id = soup.find("li", id="rdnavi-review")
        if not review_tag_id or not review_tag_id.a:
            print("  レビューセクションが見つかりません")
            self.store_id_num -= 1
            return
            
        review_tag = review_tag_id.a.get("href")
        
        # Get review count
        review_count_span = review_tag_id.find("span", class_="rstdtl-navi__total-count")
        if not review_count_span or not review_count_span.em:
            print("  レビュー件数が見つかりません")
            self.review_count = 0
        else:
            print("  レビュー件数：{}".format(review_count_span.em.string), end="")
            self.review_count = review_count_span.em.string
        
              # Get price information (daytime and nighttime)
        # Find all price target elements
        price_targets = soup.find_all("a", class_="rdheader-budget__price-target")
        
         # For each price target, determine if it's lunch or dinner
        for price_target in price_targets:
            # Find the closest time indicator (lunch or dinner icon)
            parent_element = price_target.parent
            while parent_element and not parent_element.find("i", class_="c-rating-v3__time"):
                parent_element = parent_element.parent
                
            time_elem = parent_element.find("i", class_="c-rating-v3__time") if parent_element else None
            
            if time_elem and price_target:
                price_text = price_target.text.strip()
                
                # Check class list for 'lunch' or 'dinner' substring
                class_list = time_elem.get("class", [])
                is_lunch = any("lunch" in class_name for class_name in class_list)
                is_dinner = any("dinner" in class_name for class_name in class_list)
                
                # Assign price to appropriate category
                if is_lunch:
                    self.daytime_price, self.daytime_price_low, self.daytime_price_high = parse_price_range(price_text)
                elif is_dinner:
                    self.nighttime_price, self.nighttime_price_low, self.nighttime_price_high = parse_price_range(price_text)  
            
        # Display the extracted price information
        day_info = f"{self.daytime_price} ({self.daytime_price_low}-{self.daytime_price_high})" if self.daytime_price else "-"
        night_info = f"{self.nighttime_price} ({self.nighttime_price_low}-{self.nighttime_price_high})" if self.nighttime_price else "-"
        print(f"  価格帯：昼 {day_info}、夜 {night_info}", end="")
        
        # Get the review list page URL and scrape the first page of reviews
        page_num = 1  # Just scrape the first page of reviews
        
        if review_tag:
            review_url = review_tag + "COND-0/smp1/?lc=0&rvw_part=all&PG=" + str(page_num)
            # Used after other information is obtained
            print(" . ", end="")
            self.scrape_review_page(review_url)
    
        self.make_df()
        return

    def scrape_review_page(self, review_url):
        """
        Parse the review list page and extract metrics
        """
        # Add a small delay before requesting review page
        time.sleep(0.5)
        
        with urllib.request.urlopen(review_url) as r:
            content = r.read()
            status_code = r.status
        if status_code != http.client.OK:
            print(f"error: not found{ review_url }")
            return False

        soup = BeautifulSoup(content, "html.parser")
        review_items = soup.find_all("div", class_="rvw-item")
        
        if len(review_items) == 0:
            return False
        
        total_photos = 0
        total_likes = 0
        total_bookmarks = 0
        
        for review_item in review_items:
            # Count photos
            photo_section = review_item.find("div", class_="rvw-photo")
            if photo_section:
                # Check for "more photos" indicator
                more_photos = photo_section.find("span", class_="c-photo-more__num")
                if more_photos:
                    try:
                        total_photos += int(more_photos.text)
                    except ValueError:
                        pass
                
                # Count visible photos
                photo_list = photo_section.find_all("li", class_="rvw-photo__list-item")
                if photo_list:
                    total_photos += len(photo_list)
            
            # Count likes
            like_count_elem = review_item.find("div", class_="js-like-source")
            if like_count_elem:
                try:
                    like_data = re.search(r'"count":(\d+)', like_count_elem.text)
                    if like_data:
                        total_likes += int(like_data.group(1))
                except (ValueError, AttributeError):
                    pass
            
            # Count bookmarks
            bookmark_elem = review_item.find("div", class_="js-vote-interest")
            if bookmark_elem and bookmark_elem.get("data-hozon-count"):
                try:
                    total_bookmarks += int(bookmark_elem.get("data-hozon-count"))
                except ValueError:
                    pass
            
        # Update counters
        self.photo_count = total_photos
        self.like_count = total_likes
        self.bookmark_count = total_bookmarks
        
        print(f"  写真：{total_photos}枚、いいね：{total_likes}件、保存：{total_bookmarks}件")
        return True
    
    def make_df(self):
        """
        Create a DataFrame row with the collected data
        """
        # Use 0-padding
        self.store_id = str(self.store_id_num).zfill(8) 
        # Create row for the DataFrame
        se = pd.Series([
            self.store_id, 
            self.store_name, 
            self.score, 
            self.p_area, 
            self.genre,
            self.review_count, 
            self.daytime_price,
            self.daytime_price_low,
            self.daytime_price_high,
            self.nighttime_price,
            self.nighttime_price_low,
            self.nighttime_price_high,
            self.photo_count, 
            self.like_count, 
            self.bookmark_count, 
        ], self.columns) 
        # Add the row to the DF
        self.df = pd.concat([self.df, pd.DataFrame([se], columns=self.columns)], ignore_index=True)
        # Increment the successful stores counter
        self.successful_stores += 1
        
        # Reset values for next restaurant
        self.photo_count = 0
        self.like_count = 0
        self.bookmark_count = 0
        self.daytime_price = ""
        self.daytime_price_low = None
        self.daytime_price_high = None
        self.nighttime_price = ""
        self.nighttime_price_low = None
        self.nighttime_price_high = None
        return
    
def parse_price_range(price_text):
    """
    Parse price ranges like "￥1,000～￥1,999" into low and high values
    Returns a tuple of (original_text, low_value, high_value)
    """
    if not price_text or price_text == "-":
        return price_text, None, None
        
    # Regex the numbers from the price range
    numbers = re.findall(r"￥([0-9,]+)", price_text)
    
    if len(numbers) >= 2:
        # Convert to integers and remove commas
        try:
            low = int(numbers[0].replace(",", ""))
            high = int(numbers[1].replace(",", ""))
            return price_text, low, high
        except ValueError:
            pass
    elif len(numbers) >= 1 and price_text.startswith("～￥"):
        # If price range doesn't start with a number, make lower bound 1 and use upper bound normally
        low = 1
        high = numbers[0]
        return price_text, low, high
    
    return price_text, None, None

In [4]:
# Base URL for Tokyo restaurants
tokyo_base_url = "https://tabelog.com/tokyo/rstLst/"
# Categories of food
food_categories = [
    "ramen", 
    "japanese", 
    "washoku", 
    "sushi", 
    "seafood", 
    "udon", 
    "yakiniku", 
    "curry", 
    "italian", 
    "izakaya", 
    "sweets", 
    "chinese", 
    "pizza", 
    "syabusyabu", 
    "korea"
]
food_category_urls_tokyo = []
for category in food_categories:
    food_category_urls_tokyo.append(tokyo_base_url + category + "/")

print(f"Food categories: {len(food_categories)}, Total URLs to process: {len(food_category_urls_tokyo)}")

Food categories: 15, Total URLs to process: 15


In [None]:
# Progress bar
progress = tqdm.tqdm(total=len(food_category_urls_tokyo))
for category, tokyo_food_category_url in zip(food_categories, food_category_urls_tokyo):
    print(f"Processing category - {category}")
    output_csv_name = "./data/food_category_data/tabelog_tokyo_" + category + "_data.csv"
    # Tokyo Ramen Rating Data, 20 entries per page - 10 pages → 200 entries 
    tabelog_tokyo_genre = Tabelog(tokyo_base_url, test_mode=False, p_area="東京都内", genre=category, begin_page=1, end_page=10)
    tabelog_tokyo_genre.df.to_csv(output_csv_name, encoding="utf-8-sig", index=False)
    progress.update(1)

In [9]:
# Use this if prices are missing for rows that start with "～￥" or have "～￥999" imputed
for category in food_categories:
    output_csv_name = "./data/food_category_data/tabelog_tokyo_" + category + "_data.csv"
    tabelog_tokyo_df = pd.read_csv(output_csv_name, encoding="utf-8-sig")
    tblg_df_copy = tabelog_tokyo_df.copy()
    
    # Find rows where price is "～￥999"
    daytime_mask = tblg_df_copy["daytime_price"] == "～￥999"
    nighttime_mask = tblg_df_copy["nighttime_price"] == "～￥999"
    
    # Impute missing values
    tblg_df_copy.loc[daytime_mask, "daytime_price_low"] = 1
    tblg_df_copy.loc[daytime_mask, "daytime_price_high"] = 999
    tblg_df_copy.loc[nighttime_mask, "nighttime_price_low"] = 1
    tblg_df_copy.loc[nighttime_mask, "nighttime_price_high"] = 999
    
    # Write CSV
    tblg_df_copy.to_csv(output_csv_name, encoding="utf-8-sig", index=False)

In [None]:
# for category in food_categories:
#     output_csv_name = "./data/food_category_data/tabelog_tokyo_" + category + "_data.csv"
#     tabelog_tokyo_df = pd.read_csv(output_csv_name, encoding="utf-8-sig")
#     if "Unnamed: 0.1" in tabelog_tokyo_df.columns:
#         print("Removing unnamed columns")
#         tabelog_tokyo_df = tabelog_tokyo_df.drop("Unnamed: 0.1", axis=1)
#         tabelog_tokyo_df.to_csv(output_csv_name, encoding="utf-8-sig", index=False)

In [45]:
# Insert genre column for all categories
# for category in food_categories:
#     output_csv_name = "./data/food_category_data/tabelog_tokyo_" + category + "_data.csv"
#     tabelog_tokyo_df = pd.read_csv(output_csv_name, encoding="utf-8-sig")
#     tblg_df_copy = tabelog_tokyo_df.copy()
#     tblg_df_copy.insert(loc=4, column="genre", value=category)
#     tblg_df_copy.to_csv(output_csv_name, encoding="utf-8-sig", index=False)

In [46]:
import glob
import os

In [50]:
# Combine all CSV data into one dataset
# This is probably not needed
def combine_tabelog_csvs(directory_path):
    # Get all CSV files in the directory
    csv_files = glob.glob(os.path.join(directory_path, "*.csv"))
    # Combined DataFrame
    combined_tl_df = pd.DataFrame()
    
    # Process each CSV file
    for f in csv_files:
        # Read the current CSV
        df = pd.read_csv(f)
        
        # Check if columns exist
        expected_columns = ["store_id", "store_name", "score", "area", "genre", "review_count",
                           "daytime_price", "daytime_price_low", "daytime_price_high",
                           "nighttime_price", "nighttime_price_low", "nighttime_price_high",
                           "photo_count", "like_count", "bookmark_count"]
        missing_cols = [col for col in expected_columns if col not in df.columns]
        if len(missing_cols) > 0:
            continue
            
        # Append to the combined dataframe
        combined_tl_df = pd.concat([combined_tl_df, df], ignore_index=True)
    
    # Randomize rows
    combined_tl_df = combined_tl_df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Reset store_id to be sequential from 1 to N
    combined_tl_df["store_id"] = range(1, len(combined_tl_df) + 1)
    
    return combined_tl_df


combined_datasets = combine_tabelog_csvs("./data/food_category_data")
combined_datasets.to_csv("./data/tabelog_tokyo_data.csv", encoding="utf-8-sig", index=False)

## Tabelog Review Data

In [36]:
def print_debug(*args, sep=" ", end="\n", file=None, flush=False, debug=False):
    if debug:
        print(*args, sep=sep, end=end, file=file, flush=flush)

In [42]:
class TabelogReviewScraper:
    """
    Tabelog scraper that focuses on collecting reviews/ratings from restaurants
    """
    def __init__(self, restaurant_url, max_pages=None, debug=False):
        """
        Initialize scraper for a specific restaurant
        :param restaurant_url: URL of the restaurant page
        :param max_pages: Maximum number of review pages to scrape (default: None = all pages)
        """
        # Info about accessing the restaurant / store data
        self.restaurant_url = restaurant_url
        self.max_pages = max_pages
        
        # Restaurant info
        self.store_id = ""
        self.store_name = ""
        self.review_count = 0
        
        # User count
        self.user_count = 0
        
        # For debugging
        self.debug = debug
        self.page_num = 0
        
        # DataFrame for reviews with sub-ratings
        self.columns = [
            "user_id", "overall_rating", "food", "service", "atmosphere", "price", "drink"
        ]
        self.reviews_df = pd.DataFrame(columns=self.columns)
        
        print_debug(f"Starting scrape of restaurant reviews at {restaurant_url}", debug=self.debug)
        self.scrape_restaurant()
        print_debug(f"Scraping complete. Collected {len(self.reviews_df)} reviews", debug=self.debug)
        
    def scrape_restaurant(self):
        """
        Scrape the main restaurant page to get name and review link
        """
        try:
            with urllib.request.urlopen(self.restaurant_url) as r:
                content = r.read()
                status_code = r.status
                
            if status_code != http.client.OK:
                print_debug(f"Error: Could not access {self.restaurant_url}", debug=self.debug)
                return
                
            # Parse with soup
            soup = BeautifulSoup(content, "html.parser")
            
            # Extract store ID from URL
            self.store_id = self.restaurant_url.split("/")[-2]
            
            # Get restaurant name
            store_name_tag = soup.find("h2", class_="display-name")
            if not store_name_tag or not store_name_tag.span:
                print_debug(f"Error: Cannot find restaurant name at {self.restaurant_url}", debug=self.debug)
                # Fallback to ID if name not found
                self.store_name = self.store_id
                return
                
            self.store_name = store_name_tag.span.string.strip()
            print_debug(f"Restaurant: {self.store_name} (ID: {self.store_id})", debug=self.debug)
            
            # Get review link and count
            review_tag_id = soup.find("li", id="rdnavi-review")
            if not review_tag_id or not review_tag_id.a:
                print_debug("ロコミのページが見つかりません", debug=self.debug)
                return
                
            review_tag = review_tag_id.a.get("href")
            
            # Get review count
            self.review_count = 0
            review_count_span = review_tag_id.find("span", class_="rstdtl-navi__total-count")
            if review_count_span and review_count_span.em:
                self.review_count = int(review_count_span.em.string)
                print_debug(f"Total reviews: {self.review_count}", debug=self.debug)
            else:
                print_debug("Review count not found, will scrape", debug=self.debug)
            
            # Calculate pages based on review count (20 reviews per page)
            if self.review_count > 0:
                estimated_pages = -((self.review_count + 19) // -20)
                print_debug(f"Estimated number of pages for {self.store_name}: {estimated_pages - 1}〜{estimated_pages}", debug=self.debug)
            
            # Start scraping review pages
            self.page_num = 1
            last_page = False
            
            while not last_page:
                # Check if the maximum pages limit is reached
                if self.max_pages and self.page_num > self.max_pages:
                    print_debug(f"Reached maximum page limit ({self.max_pages})", debug=self.debug)
                    break
                
                print_debug(f"ロコミ{self.page_num}ページ目が加工されています", debug=self.debug)
                review_url = review_tag + f"COND-0/smp1/?lc=0&rvw_part=all&PG={self.page_num}"
                
                # Scrape the current page
                result = self.scrape_review_page(review_url)
                
                if not result:
                    print_debug(f"Error accessing page {self.page_num} or no reviews found", debug=self.debug)
                    break
                
                soup, reviews_found = result
                
                if not reviews_found:
                    print_debug(f"No reviews found on page {self.page_num}", debug=self.debug)
                    break
                
                # Check for pagination to determine if there are more pages
                pagination = soup.find("div", class_="c-pagination")
                
                # If there's no pagination element at all, and we're on page 1, 
                # this means there's only one page of reviews
                if not pagination and self.page_num == 1:
                    print_debug("Only one page of reviews found", debug=self.debug)
                    last_page = True
                elif pagination:
                    # Look for the "next" button
                    next_button = pagination.find("a", class_="c-pagination__arrow--next")
                    
                    # If there's no next button, or if the current page is the last one,
                    # we've reached the end
                    if not next_button:
                        print_debug(f"Reached the last page of reviews ({self.page_num})", debug=self.debug)
                        last_page = True
                else:
                    # No pagination found after page 1, must be the last page
                    print_debug(f"No more pages found after page {self.page_num}", debug=self.debug)
                    last_page = True
                
                # Move to the next page
                if not last_page:
                    self.page_num += 1
                    # Pause between pages to avoid rate limiting
                    time.sleep(2)
                
        except Exception as e:
            print_debug(f"Error scraping restaurant: {e}", debug=self.debug)
            
    def scrape_review_page(self, review_url):
        """
        Scrape a single page of reviews to extract ratings
        Returns tuple of (soup, reviews_found) or None if error
        """
        try:
            # Add delay to prevent ratelimits
            time.sleep(1)
            
            # Request
            with urllib.request.urlopen(review_url) as r:
                content = r.read()
                status_code = r.status
                
            if status_code != http.client.OK:
                print_debug(f"Error: Could not access {review_url}", debug=self.debug)
                return None
    
            # Access contents of page
            soup = BeautifulSoup(content, "html.parser")
            review_items = soup.find_all("div", class_="rvw-item")
            
            # If there are no items to review
            if len(review_items) == 0:
                print_debug("ロコミが見つかりません", debug=self.debug)
                return soup, False
            
            # Number of reviews in the page
            print_debug(f"{self.page_num}ページ目にレビュー{len(review_items)}件", debug=self.debug)
            
            # Parse all ratings and add them to DF
            for review_item in review_items:
                self.parse_review_ratings(review_item)
                
            return soup, True
            
        except Exception as e:
            print(f"Error scraping review page: {e}")
            return None
            
    def parse_review_ratings(self, review_item):
        """
        Parse only the ratings from an individual review
        Uses sequential user IDs
        """
        try:
            # Initialize ratings
            # None (will be converted to NaN in DataFrame)
            overall_rating = None
            food_rating = None
            service_rating = None
            # The atmosphere of the place 
            atmosphere_rating = None
            # CP (Cost Performance)
            price_rating = None
            # Rating of the drinks
            drink_rating = None
            
            # Increment user ID
            self.user_count += 1
            
            # Get overall rating
            rating_elem = review_item.find(["p", "div"], class_="c-rating-v3--xl")
            if rating_elem:
                val_elem = rating_elem.find("b", class_="c-rating-v3__val")
                if val_elem:
                    try:
                        overall_rating = float(val_elem.text.strip())
                    except ValueError:
                        pass
            
            # Get detailed ratings
            rating_detail = review_item.find("ul", class_="c-rating-detail")
            if rating_detail:
                rating_items = rating_detail.find_all("li", class_="c-rating-detail__item")
                for item in rating_items:
                    label = item.find("span")
                    value = item.find("strong")
                    if label and value:
                        # Get label and rating values
                        label_text = label.text.strip()
                        value_text = value.text.strip()
                        
                        try:
                            rating_value = float(value_text)
                            # Extract all ratings
                            if "料理・味" in label_text:
                                food_rating = rating_value
                            elif "サービス" in label_text:
                                service_rating = rating_value
                            elif "雰囲気" in label_text:
                                atmosphere_rating = rating_value
                            elif "CP" in label_text:
                                price_rating = rating_value
                            elif "酒・ドリンク" in label_text:
                                drink_rating = rating_value
                        except ValueError:
                            # Skip if conversion to float somehow fails
                            pass
            
            # Add to DataFrame and only add if at least one rating is present
            ratings = [overall_rating, food_rating, service_rating, atmosphere_rating, price_rating, drink_rating]
            if any(type(r) == float for r in ratings):
                # Series with user id of 6 digits
                se = pd.Series([
                    str(self.user_count).zfill(6),
                    *ratings
                ], self.columns) 
                r_df = pd.DataFrame([se], columns=self.columns)
                for col in self.columns:
                    if col in self.reviews_df.columns:
                        r_df[col] = r_df[col].astype(self.reviews_df[col].dtype)
                # Add the row to the DF
                self.reviews_df = pd.concat([self.reviews_df, r_df], ignore_index=True)
        except Exception as e:
            print(f"Error parsing review: {e}")
            
    def save_data(self, directory="."):
        """
        Save the ratings data to a CSV file
        """
        # Remove problematic characters
        store_name = re.sub(r'[\\/*?:"<>|.]', "", self.store_name)
        store_name = store_name.replace(" ", "_")
        
        filename = os.path.join(directory, f"tabelog_{store_name}_review_data.csv")
        self.reviews_df.to_csv(filename, encoding="utf-8-sig", index=False)
        print_debug(f"Review ratings for {self.store_name} saved to {filename}", debug=self.debug)
        print_debug(f"Collected {len(self.reviews_df)} reviews with {self.user_count} unique users", debug=self.debug)
        return filename

In [40]:
test_restaurant_url = "https://tabelog.com/kanagawa/A1404/A140402/14032275/"
scraper = TabelogReviewScraper(test_restaurant_url, max_pages=None, debug=True)
scraper.save_data(directory="./tabelog_review_data")

Starting scrape of restaurant reviews at https://tabelog.com/kanagawa/A1404/A140402/14032275/
Restaurant: 鎌倉のごはんやさん 石渡 (ID: 14032275)
Total reviews: 64
Estimated number of pages for 鎌倉のごはんやさん 石渡: 5
ロコミ1ページ目が加工されています
1ページ目にレビュー21件
ロコミ2ページ目が加工されています
2ページ目にレビュー20件
ロコミ3ページ目が加工されています
3ページ目にレビュー20件
ロコミ4ページ目が加工されています
4ページ目にレビュー3件
Reached the last page of reviews (4)
Scraping complete. Collected 60 reviews
Review ratings for 鎌倉のごはんやさん 石渡 saved to ./tabelog_review_data/tabelog_鎌倉のごはんやさん_石渡_review_data.csv
Collected 60 reviews with 64 unique users


'./tabelog_review_data/tabelog_鎌倉のごはんやさん_石渡_review_data.csv'