In [9]:
# Auto-reload modules when they change (useful for development)
%load_ext autoreload
%autoreload 2

import pyairbnb
from pyairbnb import search_all, get_calendar
from datetime import date, datetime, timedelta
import calendar
import json
import os
import time
import random

import numpy as np
import pandas as pd
from collections import defaultdict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
"""
* Load the grid json files
"""
def load_grid_json_files(folder_path):
    """Parse every ``grid_*.json`` file found directly inside *folder_path*.

    Parameters:
        folder_path: Directory to scan (not recursive).

    Returns:
        list: The decoded JSON content of each matching file, in the order
        ``os.listdir`` yields the file names.
    """
    parsed_files = []
    for entry in os.listdir(folder_path):
        # Only files named like "grid_<something>.json" are of interest.
        is_grid_file = entry.startswith("grid_") and entry.endswith(".json")
        if not is_grid_file:
            continue

        with open(os.path.join(folder_path, entry), 'r', encoding='utf-8') as handle:
            content = json.load(handle)
        parsed_files.append(content)

    return parsed_files


# """
# * Get the last day of the next month
# """
# def get_the_last_day_of_next_month():
#     month = today.month + 1
#     year = today.year + (month - 1) // 12
#     month = ((month - 1) % 12) + 1

#     last_day = calendar.monthrange(year, month)[1]
#     return date(year, month, last_day)


# """
# * Get the last day of the next next months
# """
# def get_the_last_day_of_next_two_months():
#     month = today.month + 2
#     year = today.year + (month - 1) // 12
#     month = ((month - 1) % 12) + 1

#     last_day = calendar.monthrange(year, month)[1]
#     return date(year, month, last_day)


"""
* Output the number of remaining days in a specific month of a specific year (including the given day)
"""
def number_of_days_until_end_of_month(day, month, year):
    """Return how many days remain in the month, counting the given day itself.

    Parameters:
        day (int): Day of the month.
        month (int): Month number (1-12).
        year (int): Calendar year.

    Returns:
        int: Days from the given date through the last day of that month,
        inclusive (e.g. 31 for January 1st, 1 for the last day of a month).
    """
    start = date(year, month, day)

    # December rolls over into January of the following year.
    if month == 12:
        first_of_next_month = date(year + 1, 1, 1)
    else:
        first_of_next_month = date(year, month + 1, 1)

    # The distance to the first day of the next month already includes the
    # starting day, so no extra "+ 1" is needed.
    return (first_of_next_month - start).days



"""
* Output the available days in a month
"""
def get_available_dates(calendar_month):
    """Return the day-of-month numbers that are available in one calendar month.

    A day counts as available when it is open for check-in or for check-out.

    Parameters:
        calendar_month (dict): Must contain a ``'days'`` list whose entries
            have the keys ``'calendarDate'`` (``"YYYY-MM-DD"``),
            ``'availableForCheckin'`` and ``'availableForCheckout'``.

    Returns:
        np.ndarray: Integer array of available day numbers, in input order.
    """
    available_days = []

    for day_info in calendar_month['days']:
        # Logical `or` (not bitwise `|`) so that a missing flag reported as
        # None is treated as "not available" instead of raising TypeError.
        if day_info['availableForCheckin'] or day_info['availableForCheckout']:
            day_number = datetime.strptime(day_info['calendarDate'], "%Y-%m-%d").day
            available_days.append(day_number)

    # Fixed integer dtype so an empty result does not become a float array.
    return np.array(available_days, dtype=int)



"""
* Analyze the reviews of a listing and find missing months of each year

Resolved issue: 
There are instances where the generalized review date (e.g. 2 weeks ago) shown on the listing does not match the actual review date.
Example: found on https://www.airbnb.com/rooms/8133168
Review date: 2025-11-30T22:29:47Z
Generalized review date: Il y a 2 semaines
Time when I debug this issue : 2025-12-21 2:00 AM

Reason :
3 weeks after the review date would be 2025-12-21T22:29:47Z. That time had not been reached yet when debugging, so the review was still shown as 2 weeks ago.

Conclusion: 
The current code is correct. No change is needed.
"""
def analyze_reviews(room_id):
    """Fetch a listing's reviews and record which months of each year have reviews.

    Parameters:
        room_id: Airbnb room identifier, interpolated into the listing URL.

    Returns:
        dict: ``{year: {month, ...}}`` built from each review's ``'createdAt'``
        timestamp (ISO-8601, a trailing ``Z`` is read as UTC).
    """
    room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"
    proxy_url = ""  # Proxy URL (if needed)
    language = "fr"

    reviews_data = pyairbnb.get_reviews(room_url, language, proxy_url)

    review_analysis = {}
    for review in reviews_data:
        # datetime.fromisoformat does not accept a literal "Z" suffix on
        # older Python versions, so spell the UTC offset out explicitly.
        review_date = datetime.fromisoformat(review['createdAt'].replace('Z', '+00:00'))
        review_analysis.setdefault(review_date.year, set()).add(review_date.month)

    return review_analysis



# """
# * Check if the 75 rule is met
# """
# def check_75_rule(
#     curr_month_available_dates, 
#     next_month_available_dates,
#     leniency=2
# ):
#     next_30th_day = today + timedelta(days=30)
#     next_mask = next_month_available_dates < next_30th_day.day
#     available_days_next30days = len(curr_month_available_dates) + len(next_month_available_dates[next_mask])
#     days_booked = 30 - available_days_next30days
#     if days_booked > 22 - leniency:
#         return True, days_booked
#     return False, None


# """
# * Check if the 55 rule is met
# """
# def check_55_rule(
#     next_month_available_dates,
#     next_two_months_available_dates,
#     leniency=2
# ):
#     next_30th_day = today + timedelta(days=30)
#     next_next_30th_day = today + timedelta(days=60)
#     next_mask_remains = next_month_available_dates >= next_30th_day.day
#     next_next_mask = next_two_months_available_dates < next_next_30th_day.day
#     available_days_nextnext30days = len(next_month_available_dates[next_mask_remains]) + len(next_two_months_available_dates[next_next_mask]) 
#     days_booked = 30 - available_days_nextnext30days
#     if days_booked > 17 - leniency:
#         return True, days_booked
#     return False, None



"""
* Check if the reviews count is sufficient
"""
def check_reviews_count(review_months_this_year, review_months_last_year, leniency):
    """Flag listings whose reviews cover too few distinct months.

    Parameters:
        review_months_this_year: Sized collection of months with reviews this year.
        review_months_last_year: Sized collection of months with reviews last year.
        leniency (int): Number of missing months tolerated before warning.

    Returns:
        tuple: ``(warning_level, warning_type, warning_message)``; all three
        are ``None`` when neither year falls short.

    Note:
        Reads the module-level ``today`` to know how many months of the
        current year have started.
    """
    short_this_year = len(review_months_this_year) < today.month - leniency
    short_last_year = len(review_months_last_year) < (12 - leniency)

    # Nothing to report when both years have enough coverage.
    if not (short_this_year or short_last_year):
        return None, None, None

    if short_this_year and short_last_year:
        kind = "Insufficient months of reviews this year and last year"
        message = f"This listing has only {len(review_months_this_year)} months of reviews this year and {len(review_months_last_year)} months of reviews last year"
    elif short_this_year:
        kind = "Insufficient months of reviews this year"
        message = f"This listing has only {len(review_months_this_year)} months of reviews this year."
    else:
        kind = "Insufficient months of reviews last year"
        message = f"This listing has only {len(review_months_last_year)} months of reviews last year."

    return "High", kind, message




def find_grid_for_coordinate(lat, lon, coords_df=None):
    """
    Given a latitude and longitude, find which grid cell (by grid_id)
    the coordinate falls into, based on bounding box definitions.

    Parameters:
        lat (float): Latitude of the point.
        lon (float): Longitude of the point.
        coords_df (pd.DataFrame, optional): DataFrame containing columns
            ['grid_id', 'ne_lat', 'ne_long', 'sw_lat', 'sw_long'].
            Defaults to the module-level ``gird_coords_df`` when omitted.

    Returns:
        int or None: grid_id of the first matching grid (bounds are
        inclusive), or None if the point lies outside every grid.
    """
    grid_df = gird_coords_df if coords_df is None else coords_df

    # Boolean mask of rows whose bounding box contains the point.
    inside = (
        (grid_df['sw_lat'] <= lat) & (lat <= grid_df['ne_lat']) &
        (grid_df['sw_long'] <= lon) & (lon <= grid_df['ne_long'])
    )
    match = grid_df[inside]

    if match.empty:
        return None
    return int(match.iloc[0]['grid_id'])



"""
* Add a summary row to the summary DataFrame
"""
def add_summary_row(
    df: pd.DataFrame,
    *,
    room_id=None,
    listing_url=None,
    next_30_days_booked_days=None,
    next_30_to_60_days_booked_days=None,
    rule_75_met: bool = False,
    rule_55_met: bool = False,
    warning_level=None,
    warning_type=None,
    warning_message=None,
    rating=None,
    review_count=None,
    review_months_this_year=None,
    review_months_last_year=None,
    missing_review_months_this_year=None,
    missing_review_months_last_year=None,
    total_missing_review_months_this_year=None,
    total_missing_review_months_last_year=None,
    latitiude=None,
    longitude=None,
    grid_index=None,
) -> pd.DataFrame:
    """
    Append a single summary row to the provided DataFrame and return the new DataFrame.

    The input ``df`` is left untouched. List-like fields default to empty
    lists and the two rule flags are coerced to ``bool``. Columns that ``df``
    lacks are added; pre-existing rows get ``False`` for the rule flags and
    NaN for every other new column.

    Returns:
        pd.DataFrame: A new frame with a fresh 0..n-1 index containing the
        rows of ``df`` followed by the new row.
    """
    row = {
        'Room_id': room_id,
        'Listing_url': listing_url,

        'Next_30_days_booked_days': next_30_days_booked_days,
        'Next_30_to_60_days_booked_days': next_30_to_60_days_booked_days,

        '75_rule_met': bool(rule_75_met),
        '55_rule_met': bool(rule_55_met),

        'Warning_level': warning_level,
        'Warning_type': warning_type,
        'Warning_message': warning_message,

        'Rating': rating,
        'Review_count': review_count,

        'Review_months_this_year': review_months_this_year if review_months_this_year is not None else [],
        'Review_months_last_year': review_months_last_year if review_months_last_year is not None else [],
        'Missing_review_months_this_year': missing_review_months_this_year if missing_review_months_this_year is not None else [],
        'Missing_review_months_last_year': missing_review_months_last_year if missing_review_months_last_year is not None else [],
        'Total_missing_review_months_this_year': total_missing_review_months_this_year,
        'Total_missing_review_months_last_year': total_missing_review_months_last_year,

        'Latitiude': latitiude,
        'Longitude': longitude,
        'Grid_index': grid_index,
    }

    bool_columns = ('75_rule_met', '55_rule_met')

    # Work on a copy so that adding missing columns never mutates the
    # caller's DataFrame.
    base = df.copy()
    for col in row:
        if col not in base.columns:
            base[col] = False if col in bool_columns else np.nan

    row_df = pd.DataFrame([row], columns=base.columns)

    # Concatenating with an empty frame only distorts dtypes (and warns on
    # recent pandas); the one-row frame already is the full result.
    if base.empty:
        return row_df

    return pd.concat([base, row_df], ignore_index=True)




def get_configuration(configuration):
    """Extract guest/bedroom/bed/bath counts from listing configuration strings.

    Each entry is expected to look like ``"<count> <label>"`` (e.g.
    ``"4 guests"``); the text before the first space is taken as the count.

    Parameters:
        configuration: Iterable of configuration strings.

    Returns:
        tuple: ``(guest_count, bedroom_count, bed_count, bath_count)``. Found
        counts are returned as strings; a count that never appears stays at
        the integer ``0``.
    """
    guest_count = 0
    bedroom_count = 0
    bed_count = 0
    bath_count = 0

    for config in configuration:
        leading_token = config.split(" ")[0]
        if "guest" in config:
            # The original stored this in an unused variable, so the guest
            # count was always reported as 0.
            guest_count = leading_token
        elif "bedroom" in config:
            # Must be tested before "bed", which is a substring of "bedroom".
            bedroom_count = leading_token
        elif "bed" in config:
            bed_count = leading_token
        elif "bath" in config:
            bath_count = leading_token

    return guest_count, bedroom_count, bed_count, bath_count


def get_review_count_by_year_and_month(reviews):
    """Count reviews per calendar month, grouped by year.

    Parameters:
        reviews: Iterable of dicts with an ISO-8601 ``'createdAt'`` string
            (a trailing ``Z`` is read as UTC).

    Returns:
        dict: ``{year: [count_jan, ..., count_dec]}``, for example
        ``{2025: [3, 4, 7, 5, 6, 7, 9, 6, 7, 6, 6, 5],
           2022: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5]}``.
    """
    monthly_counts = {}
    for review in reviews:
        created = datetime.fromisoformat(review['createdAt'].replace('Z', '+00:00'))
        # A year seen for the first time starts with twelve zeroed months.
        year_row = monthly_counts.setdefault(created.year, [0] * 12)
        year_row[created.month - 1] += 1
    return monthly_counts


def get_available_dates_by_year_and_month(data):
    """Group available day numbers by year and month.

    Parameters:
        data (dict): Must contain a ``'calendar'`` list; each entry has
            ``'month'``, ``'year'`` and ``'days'``, and each day has
            ``'calendarDate'`` (``"YYYY-MM-DD"``), ``'availableForCheckin'``
            and ``'availableForCheckout'``.

    Returns:
        dict: ``{year: {month: [day, ...]}}``. Every month present in the
        input appears in the result, even when it has no available days.
    """
    availability = {}

    for month_entry in data['calendar']:
        month_key = month_entry['month']
        year_key = month_entry['year']
        day_entries = month_entry['days']

        # Create the year / month buckets on first sight.
        month_days = availability.setdefault(year_key, {}).setdefault(month_key, [])

        for day_entry in day_entries:
            iso_date = day_entry['calendarDate']
            can_check_in = day_entry['availableForCheckin']
            can_check_out = day_entry['availableForCheckout']

            if not (can_check_in or can_check_out):
                continue

            # "2025-01-15" -> 15
            month_days.append(int(iso_date.split('-')[2]))

    return availability



def count_available_days_in_range(available_dates_dict, start_date, end_date):
    """
    Count how many days of a date range are marked as available.

    Args:
        available_dates_dict: ``{year: {month: [day1, day2, ...]}}``; the
            lookup uses integer year and month keys.
        start_date: datetime.date object, first day of the range
        end_date: datetime.date object, last day of the range (inclusive)

    Returns:
        int: Number of available days in the range (0 when ``start_date`` is
        after ``end_date``), or None when ``available_dates_dict`` is not a
        non-empty dict.
    """
    if not (isinstance(available_dates_dict, dict) and available_dates_dict):
        return None

    one_day = timedelta(days=1)
    matches = 0
    cursor = start_date

    while cursor <= end_date:
        # Missing years or months simply contribute no available days.
        days_in_month = available_dates_dict.get(int(cursor.year), {}).get(int(cursor.month), ())
        if cursor.day in days_in_month:
            matches += 1
        cursor += one_day

    return matches


def get_available_dates_75Rule(available_dates_dict):
    """Count available days in the 30-day window starting today (days 0-29).

    Parameters:
        available_dates_dict: Availability mapping passed straight through to
            ``count_available_days_in_range``.

    Returns:
        Whatever ``count_available_days_in_range`` returns for that window.
    """
    # Read the clock once so both ends of the window derive from the same
    # day even if the call straddles midnight.
    today = date.today()
    start_date = today
    end_date = today + timedelta(days=29)  # Days 0-29 (30 days total)
    return count_available_days_in_range(available_dates_dict, start_date, end_date)


def get_available_dates_55Rule(available_dates_dict):
    """Count available days in the 30-day window that starts 30 days from today.

    Parameters:
        available_dates_dict: Availability mapping passed straight through to
            ``count_available_days_in_range``.

    Returns:
        Whatever ``count_available_days_in_range`` returns for days 30-59.
    """
    window_start = date.today() + timedelta(days=30)   # Day 30
    window_end = window_start + timedelta(days=29)     # Day 59 (30 days total)
    return count_available_days_in_range(available_dates_dict, window_start, window_end)


# Step 0: Extract the grid coordinates

In [11]:
region_name = "Above-San-Franciso"

# Each (start, end) pair names one Excel file holding that inclusive range of grids.
grid_start_index_list = [1, 31, 61, 91, 121, 151, 181, 211, 241, 271]
grid_end_index_list = [30, 60, 90, 120, 150, 180, 210, 240, 270, 273]



################################################################################################################################


# Read every file first and concatenate once: concatenating inside the loop
# re-copies all previous rows each iteration, and seeding the result with an
# empty DataFrame can distort dtypes on recent pandas versions.
grid_frames = []
for grid_start_index, grid_end_index in zip(grid_start_index_list, grid_end_index_list):
    grid_filename = f"{region_name}_full_safe_grid{grid_start_index}_to_grid{grid_end_index}_rows.xlsx"

    each_gird_coords_df = pd.read_excel(f"../Grid-Coords-Files/{region_name}/{grid_filename}")
    grid_frames.append(each_gird_coords_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
gird_coords_df = pd.concat(grid_frames, ignore_index=True)


gird_coords_df.shape

(273, 5)

# Step 1: Discovery Search

In [18]:
# Grid ids to leave out of the discovery search; this list is handed to
# discover_all_grids(..., skip_grids=skip_grids) in the discovery cell.
skip_grids = [
    4, 5, 6,
    7, 8, 9, 10, 
    17, 18, 19, 20, 21,
    28, 29,
    38, 39, 40, 41, 42,
    48, 49, 50, 51, 
    58, 59, 60, 61, 62, 63,
    69, 70, 71, 72, 73, 79, 80, 81, 82, 83, 84,
    90, 91, 92, 93,
    112,
    174,
    217, 218,
    232, 237, 238, 239, 240, 
    253, 254, 258, 259, 260, 261
]

# Number of grid rows minus the number of skipped ids, i.e. grids left to search.
gird_coords_df.shape[0] - len(skip_grids)

214

In [None]:

from discovery import discover_all_grids, DiscoveryConfig, BBox, AirbnbDiscoveryEngine

# Configure discovery parameters
discovery_config = DiscoveryConfig(
    # Rate limiting (conservative to avoid bans)
    requests_per_minute=10,

    # Subdivision strategy
    max_results_before_subdivide=280,
    min_bbox_size_degrees=0.001,
    max_subdivision_depth=4,

    # Multi-pass strategy
    num_discovery_passes=30,
    use_blank_dates=False,

    alternate_checkin_offsets=[None, 3, 7, 14, 30, 60, 90],
    alternate_stay_nights=[1, 2, 3],
    alternate_zoom_values=[14, 15, 16],

    # User-agent rotation
    rotate_user_agents=True,

    # Search parameters
    price_min=250,
    price_max=10000,
    currency="USD",

    # Caching
    cache_dir=f"Data/{region_name}/discovery_cache",
    enable_cache=False,

    # Logging (to prevent notebook lag)
    log_to_file=True,
    log_dir="Discovery-Search-Logs",
    log_level="INFO",

    # Stats
    stats_file=f"Data/{region_name}/discovery_stats.json",
)

# Status banner. The emoji below replace mis-encoded (mojibake) byte
# sequences that previously appeared in these messages.
print("🔍 Starting improved discovery system...")
print(f"Region: {region_name}")
print(f"Grid cells to search: {len(gird_coords_df)}")
if skip_grids:
    print(f"⏭️  Grids to skip: {skip_grids}")
    print(f"Grids to process: {len(gird_coords_df) - len(skip_grids)}")
print(f"Discovery passes per cell: {discovery_config.num_discovery_passes}")
print(f"📝 Detailed logs will be saved to: {discovery_config.log_dir}/")
print("=" * 70)

# Run discovery with skip list
discovered_listings, engine = discover_all_grids(
    gird_coords_df,
    region_name,
    discovery_config,
    skip_grids=skip_grids  # ⭐ NEW: Pass the skip list
)

print("\n✅ Discovery complete!")
print(f"📊 Total unique listings: {len(discovered_listings)}")
print(f"📁 Saved to: Data/{region_name}/discovered_listings.json")
print(f"📈 Stats saved to: {discovery_config.stats_file}")

# Convert for enrichment: keep only the raw payload of each discovered listing.
all_results = [listing.raw_data for listing in discovered_listings.values()]
print(f"\n✓ Ready for enrichment pipeline with {len(all_results)} listings")

# # ============================================================================
# # OPTION A: USE IMPROVED DISCOVERY SYSTEM (RECOMMENDED)
# # ============================================================================

# from discovery import discover_all_grids, DiscoveryConfig, BBox, AirbnbDiscoveryEngine

# # Configure discovery parameters
# discovery_config = DiscoveryConfig(
#     # Rate limiting (conservative to avoid bans)
#     requests_per_minute=10,
    
#     # Subdivision strategy
#     max_results_before_subdivide=280,  # Subdivide if we hit ~280+ results (likely capped)
#     min_bbox_size_degrees=0.001,  # Stop subdividing below ~100m
#     max_subdivision_depth=4,
    
#     # Multi-pass strategy (catches rotated listings)
#     num_discovery_passes = 30,
#     use_blank_dates=False,
    
#     alternate_checkin_offsets=[None, 3, 7, 14, 30, 60, 90],  # Try +14, +21, +30 days ahead
#     alternate_stay_nights=[1, 2, 3],  # Stay lengths 
#     alternate_zoom_values=[14, 15, 16],  # Zoom levels

#     # User-agent rotation (anti-detection)
#     rotate_user_agents=True,
    
#     # Search parameters
#     price_min=250,
#     price_max=10000,
#     currency="USD",
    
#     # Caching for resume capability
#     cache_dir=f"Data/{region_name}/discovery_cache",
#     enable_cache=False,
    
#     # Stats output
#     stats_file=f"Data/{region_name}/discovery_stats.json",
# )

# print("üîç Starting improved discovery system...")
# print(f"Region: {region_name}")
# print(f"Grid cells to search: {len(gird_coords_df)}")
# print(f"Discovery passes per cell: {discovery_config.num_discovery_passes}")
# print("=" * 70)

# # Run discovery across all grids
# discovered_listings, engine = discover_all_grids(gird_coords_df, region_name, discovery_config)

# print(f"\n‚úÖ Discovery complete!")
# print(f"üìä Total unique listings: {len(discovered_listings)}")
# print(f"üìÅ Saved to: Data/{region_name}/discovered_listings.json")
# print(f"üìà Stats saved to: {discovery_config.stats_file}")

# # Convert discoveries to format compatible with enrichment pipeline
# all_results = [listing.raw_data for listing in discovered_listings.values()]
# print(f"\n‚úì Ready for enrichment pipeline with {len(all_results)} listings")




üîç Starting improved discovery system...
Region: Above-San-Franciso
Grid cells to search: 273
‚è≠Ô∏è  Grids to skip: [4, 5, 6, 7, 8, 9, 10, 17, 18, 19, 20, 21, 28, 29, 38, 39, 40, 41, 42, 48, 49, 50, 51, 58, 59, 60, 61, 62, 63, 69, 70, 71, 72, 73, 79, 80, 81, 82, 83, 84, 90, 91, 92, 93, 112, 174, 217, 218, 232, 237, 238, 239, 240, 253, 254, 258, 259, 260, 261]
Grids to process: 214
Discovery passes per cell: 30
üìù Detailed logs will be saved to: Discovery-Search-Logs/
‚è≠Ô∏è  Skipping 59 grids: [4, 5, 6, 7, 8, 9, 10, 17, 18, 19, 20, 21, 28, 29, 38, 39, 40, 41, 42, 48, 49, 50, 51, 58, 59, 60, 61, 62, 63, 69, 70, 71, 72, 73, 79, 80, 81, 82, 83, 84, 90, 91, 92, 93, 112, 174, 217, 218, 232, 237, 238, 239, 240, 253, 254, 258, 259, 260, 261]

üìç Processing grid 1 of 273...

üìç Processing grid 2 of 273...

üìç Processing grid 3 of 273...
‚è≠Ô∏è  Skipping grid 4 (in skip list)
‚è≠Ô∏è  Skipping grid 5 (in skip list)
‚è≠Ô∏è  Skipping grid 6 (in skip list)
‚è≠Ô∏è  Skipping grid 7 (in ski

# Step 2: Load Segmented Data for Enrichment

**Flexible Grid Selection:**  
- Specify which grids to process from the segmented data  
- Leave `grids_to_process = None` to process **all available grids**  
- Or set `grids_to_process = [1, 10, 15]` to process **only specific grids**


In [6]:
# start_index = grid_start_index
# end_index = grid_end_index

# Inclusive range of grid indices to load for enrichment.
start_index, end_index = 1, 110


# np.arange excludes its stop value, hence the "+ 1".
grid_index_array = np.arange(start_index, end_index + 1)
grid_index_array

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110])

In [7]:
# ============================================================================
# LOAD SEGMENTED DATA FOR ENRICHMENT
# ============================================================================

import glob
from pathlib import Path

# CONFIGURE WHICH GRIDS TO PROCESS
# Option 1: Process ALL grids (set to None)
# Option 2: Process specific grids (e.g., [1, 10, 15])
grids_to_process = grid_index_array  # Change this to [1, 10] to process only grids 1 and 10

# Normalise the selection to a set of plain ints: membership tests become
# O(1) and the summary below prints clean numbers even for a numpy array.
selected_grids = None if grids_to_process is None else {int(g) for g in grids_to_process}

# Find all segmented listing files
segmented_data_dir = Path(f"Data/{region_name}/Segmented-Data")
# NOTE: the lexicographic file order is kept on purpose; it determines the
# order of all_results, which later cells index into.
listing_files = sorted(segmented_data_dir.glob("discovered_listings_grid_*.json"))

# The emoji in the messages below replace mis-encoded (mojibake) sequences.
print(f"📂 Found {len(listing_files)} segmented data files in {segmented_data_dir}")

# Load listings from selected grids
all_results = []
processed_grids = []

for file_path in listing_files:
    # Extract grid number from filename
    # (e.g., "discovered_listings_grid_11_20251221_040349.json" -> 11)
    try:
        grid_num = int(file_path.stem.split('_')[3])
    except (IndexError, ValueError):
        print(f"⚠️ Skipping file with unexpected name format: {file_path.name}")
        continue

    # Check if this grid should be processed
    if selected_grids is not None and grid_num not in selected_grids:
        continue

    # Load the JSON file. Unreadable files, invalid JSON (ValueError) and
    # unexpected top-level shapes (AttributeError) are reported and skipped.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        listings = data.get('listings', {})

        # Convert to list format compatible with enrichment code
        all_results.extend(listings.values())
    except (OSError, ValueError, AttributeError) as e:
        print(f"  ⚠️ Error loading {file_path.name}: {e}")
        continue

    processed_grids.append(grid_num)
    print(f"  ✓ Loaded grid {grid_num}: {len(listings)} listings")

print("\n" + "=" * 70)
print("SEGMENTED DATA LOADED")
print("=" * 70)
if selected_grids is None:
    print(f"Selected grids: ALL ({len(processed_grids)} grids)")
else:
    print(f"Selected grids: {sorted(selected_grids)}")
print(f"Actually processed: {sorted(processed_grids)}")
print(f"Total listings loaded: {len(all_results)}")
print("=" * 70)


üìÇ Found 110 segmented data files in Data/Oakland/Segmented-Data
  ‚úì Loaded grid 100: 2 listings
  ‚úì Loaded grid 101: 1 listings
  ‚úì Loaded grid 102: 0 listings
  ‚úì Loaded grid 103: 0 listings
  ‚úì Loaded grid 104: 0 listings
  ‚úì Loaded grid 105: 0 listings
  ‚úì Loaded grid 106: 0 listings
  ‚úì Loaded grid 107: 0 listings
  ‚úì Loaded grid 108: 11 listings
  ‚úì Loaded grid 109: 11 listings
  ‚úì Loaded grid 10: 10 listings
  ‚úì Loaded grid 110: 13 listings
  ‚úì Loaded grid 11: 2 listings
  ‚úì Loaded grid 12: 185 listings
  ‚úì Loaded grid 13: 171 listings
  ‚úì Loaded grid 14: 46 listings
  ‚úì Loaded grid 15: 27 listings
  ‚úì Loaded grid 16: 3 listings
  ‚úì Loaded grid 17: 0 listings
  ‚úì Loaded grid 18: 2 listings
  ‚úì Loaded grid 19: 1 listings
  ‚úì Loaded grid 1: 44 listings
  ‚úì Loaded grid 20: 8 listings
  ‚úì Loaded grid 21: 2 listings
  ‚úì Loaded grid 22: 147 listings
  ‚úì Loaded grid 23: 177 listings
  ‚úì Loaded grid 24: 84 listings
  ‚úì Loaded gri

In [8]:
# Collect each listing's room id as a string and drop duplicates.
uni_room_ids = list({str(listing['room_id']) for listing in all_results})

len(uni_room_ids)

2169

# 75, 55 Rules

In [None]:
# ============================================================================
# PHASE 2: CHUNKED ENRICHMENT - Copy these cells to your notebook
# ============================================================================

# CELL 1: Configuration and Helper Functions
# ============================================================================
# Copy this entire cell to replace your current enrichment cell
# ============================================================================

import time
import random
import json
from datetime import datetime

# ============================================================================
# CONFIGURATION
# ============================================================================
CHUNK_SIZE = 50  # Process 50 listings at a time
OUTPUT_DIR = f"Data/{region_name}/Enrichment_Chunks"
PROGRESS_FILE = f"{OUTPUT_DIR}/progress.json"
FINAL_OUTPUT = f"Data/{region_name}/summary_all_enriched.xlsx"

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 🔧 CONFIGURE PROCESSING RANGE HERE
# Set to None to process all, or specify start/end indices
START_INDEX = None  # Example: 0, 1000, 2000, etc.
END_INDEX = None    # Example: 1000, 2000, 6000, etc.


# Set today's date for calculations
today = date.today()
# Months 1..current month of this year (the current month is included).
COMPLETE_MONTHS_THIS_YEAR = set(range(1, today.month + 1))
COMPLETE_MONTHS_LAST_YEAR = set(range(1, 13))

# The emoji below replace mis-encoded (mojibake) sequences in these messages.
print("🔬 Starting CHUNKED enrichment phase...")
print(f"📁 Output directory: {OUTPUT_DIR}")
print(f"📦 Chunk size: {CHUNK_SIZE} listings per file")
print(f"📊 Total discovered listings: {len(all_results)}")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def load_progress():
    """Load existing progress from disk.

    Returns:
        dict: The parsed content of ``PROGRESS_FILE``; a fresh
        ``{"completed_chunks": [], "last_updated": None}`` record when the
        file is missing, unreadable, or does not contain valid JSON.
    """
    try:
        with open(PROGRESS_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        # No progress recorded yet: start from scratch.
        pass
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError. Catching only I/O and
        # decoding problems keeps KeyboardInterrupt and real bugs visible,
        # unlike the previous bare `except:`.
        print(f"Could not read progress file {PROGRESS_FILE}: {exc}")
    return {"completed_chunks": [], "last_updated": None}

def save_progress(completed_chunks):
    """Save progress to disk.

    Parameters:
        completed_chunks: Iterable of completed chunk identifiers. It is
            copied into a list first because the notebook keeps this value
            as a ``set``, which ``json`` cannot serialize.

    Writes ``PROGRESS_FILE`` with the completed chunks, the current
    timestamp, the number of entries in ``all_results`` and ``CHUNK_SIZE``.
    """
    progress_data = {
        "completed_chunks": list(completed_chunks),
        "last_updated": datetime.now().isoformat(),
        "total_listings": len(all_results),
        "chunk_size": CHUNK_SIZE
    }
    with open(PROGRESS_FILE, 'w') as f:
        json.dump(progress_data, f, indent=2)

def save_chunk(chunk_id, chunk_start, chunk_end, chunk_data):
    """Save a chunk of enriched listings to disk.

    Parameters:
        chunk_id: Identifier stored in the metadata and shown in the log line.
        chunk_start (int): Start index, zero-padded to 4 digits in the file name.
        chunk_end (int): End index, zero-padded to 4 digits in the file name.
        chunk_data: Sized, JSON-serializable collection of enriched listings.

    Returns:
        str: Path of the JSON file written inside ``OUTPUT_DIR``.
    """
    filename = f"enrichment_chunk_{chunk_start:04d}_{chunk_end:04d}.json"
    filepath = os.path.join(OUTPUT_DIR, filename)

    chunk_output = {
        "metadata": {
            "chunk_id": chunk_id,
            "start_index": chunk_start,
            "end_index": chunk_end,
            "total_listings": len(chunk_data),
            "timestamp": datetime.now().isoformat()
        },
        "listings": chunk_data
    }

    # ensure_ascii=False keeps non-ASCII listing text readable in the file.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(chunk_output, f, indent=2, ensure_ascii=False)

    # The emoji replaces a mis-encoded (mojibake) sequence in this message.
    print(f"  💾 Saved chunk {chunk_id}: {filename} ({len(chunk_data)} listings)")
    return filepath

def chunk_exists(chunk_start, chunk_end):
    """Report whether the chunk file for this index range is already on disk."""
    expected_path = os.path.join(
        OUTPUT_DIR,
        f"enrichment_chunk_{chunk_start:04d}_{chunk_end:04d}.json",
    )
    return os.path.exists(expected_path)

# ============================================================================
# DETERMINE PROCESSING RANGE
# ============================================================================
# Fall back to the whole result list when no explicit bounds are configured.
start_idx = 0 if START_INDEX is None else START_INDEX
end_idx = len(all_results) if END_INDEX is None else END_INDEX
total_to_process = end_idx - start_idx

print(f"üéØ Processing range: {start_idx} to {end_idx} ({total_to_process} listings)")
print("=" * 70)

# Number of chunks needed to cover the range (integer ceiling division).
total_chunks = (total_to_process + CHUNK_SIZE - 1) // CHUNK_SIZE

# Pick up whatever an earlier run recorded as finished.
progress = load_progress()
completed_chunks = set(progress.get("completed_chunks", []))
if len(completed_chunks) > 0:
    print(f"üìã Found {len(completed_chunks)} already completed chunks (will skip)")


# ============================================================================
# CELL 2: Main Processing Loop
# ============================================================================
# Copy this as a SEPARATE cell
# ============================================================================

# Run-wide counters, all starting from zero.
total_passed_75, total_failed_75, total_errors, chunks_processed = 0, 0, 0, 0

# Room ids of listings whose enrichment raised an exception.
roomIDs_with_errors = list()


# Process each chunk.
# chunk_end is exclusive; chunk file names and log messages use the inclusive
# last index (chunk_end - 1).
for chunk_id in range(total_chunks):
    chunk_start = start_idx + (chunk_id * CHUNK_SIZE)
    chunk_end = min(chunk_start + CHUNK_SIZE, end_idx)

    # Skip if chunk already exists. Resume is decided by the presence of the
    # chunk file on disk, not by the completed_chunks set loaded earlier.
    if chunk_exists(chunk_start, chunk_end - 1):
        print(f"‚è≠Ô∏è  Chunk {chunk_id + 1}/{total_chunks}: {chunk_start}-{chunk_end-1} already processed, skipping...")
        continue

    print(f"\nüîÑ Processing Chunk {chunk_id + 1}/{total_chunks}: Listings {chunk_start}-{chunk_end-1}")

    # Per-chunk accumulators: successful records, rule counters, and errors.
    chunk_data = []
    chunk_passed_75 = 0
    chunk_failed_75 = 0
    chunk_errors = []

    # Process listings in this chunk
    for idx in range(chunk_start, chunk_end):
        result = all_results[idx]

        # Any failure for a single listing is recorded and the loop moves on
        # to the next listing.
        try:
            room_id = result['room_id']

            room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"

            # Progress indicator: every 10th listing of the chunk also pauses
            # for a random 2-5 seconds.
            if (idx - chunk_start + 1) % 10 == 0:
                print(f"  {idx - chunk_start + 1}/{chunk_end - chunk_start} in this chunk...")
                # Rate limiting
                time.sleep(random.uniform(2, 5))

            # Get listing details
            # NOTE(review): a later cell checks this call's result for None;
            # here a None would surface as a TypeError handled by the except
            # below.
            listing_details = pyairbnb.get_details(room_url=room_url, currency="USD", language="en")

            # Extract all data (a missing key raises KeyError and sends the
            # listing to the error list)
            accuracy_rating = listing_details['rating']['accuracy']
            checking_rating = listing_details['rating']['checking']
            cleanliness_rating = listing_details['rating']['cleanliness']
            communication_rating = listing_details['rating']['communication']
            location_rating = listing_details['rating']['location']
            value_rating = listing_details['rating']['value']
            overall_rating = listing_details['rating']['guest_satisfaction']

            review_count = listing_details['rating']['review_count']
            latitude = listing_details['coordinates']['latitude']
            longitude = listing_details['coordinates']['longitude']

            # Superhost: the flag is looked up under two key spellings; the
            # string "ERROR" is stored when neither key is present.
            if 'is_superhost' in listing_details.keys():
                is_superhost = listing_details['is_superhost']
            elif 'is_super_host' in listing_details.keys():
                is_superhost = listing_details['is_super_host']
            else:
                is_superhost = "ERROR"

            # Guest / bedroom / bed / bath counts parsed by get_configuration.
            configuration = listing_details['sub_description']['items']
            guest_count, bedroom_count, bed_count, bath_count = get_configuration(configuration)

            amenities = listing_details['amenities']
            co_hosts = listing_details['co_hosts']
            highlights = listing_details['highlights']
            is_guest_favorite = listing_details['is_guest_favorite']
            title = listing_details['title']
            grid_index = result.get('grid_index', None)

            available_dates_by_year_and_month = get_available_dates_by_year_and_month(listing_details)

            # Booked days = 30 minus the value returned by the rule helper.
            # NOTE(review): presumably each helper returns the number of
            # available days in its 30-day window - confirm against the
            # helper definitions.
            days_booked_next30days = 30 - get_available_dates_75Rule(available_dates_by_year_and_month)
            days_booked_nextnext30days = 30 - get_available_dates_55Rule(available_dates_by_year_and_month)

            # Thresholds: at least 22 booked days for the 75% rule and at
            # least 16 for the 55% rule.
            rule75_met = days_booked_next30days >= 22 
            rule55_met = days_booked_nextnext30days >= 16

            if rule75_met:
                chunk_passed_75 += 1
            else:
                chunk_failed_75 += 1

            # Legacy calendar-based rule check, kept for reference only.
            # Calendar analysis
            # calendar_info = listing_details['calendar']
            # curr_month_available_dates = get_available_dates(calendar_info[0])
            # next_month_available_dates = get_available_dates(calendar_info[1])
            # next_two_months_available_dates = get_available_dates(calendar_info[2])

            # rule75_met, days_booked_next30days = check_75_rule(
            #     curr_month_available_dates=curr_month_available_dates,
            #     next_month_available_dates=next_month_available_dates,
            #     leniency=1
            # )

            # rule55_met, days_booked_nextnext30days = check_55_rule(
            #     next_month_available_dates=next_month_available_dates,
            #     next_two_months_available_dates=next_two_months_available_dates,
            #     leniency=2
            # )

            # Reviews analysis: review_months is read as a mapping from year
            # to the set of months for that year (missing years default to an
            # empty set).
            review_months = analyze_reviews(room_id)
            review_months_this_year = review_months.get(today.year, set())
            review_months_last_year = review_months.get(today.year - 1, set())

            warning_level, warning_type, warning_message = check_reviews_count(
                review_months_this_year=review_months_this_year,
                review_months_last_year=review_months_last_year,
                leniency=3
            )

            # Months expected to have reviews but absent from the review sets.
            missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year)
            missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)

            ##### New Approach
            review_count_by_year_and_month = get_review_count_by_year_and_month(listing_details['reviews'])

            # Store as dictionary (not DataFrame row - much faster!)
            # Sets are converted to lists so the record can be written as JSON.
            listing_record = {
                'Room_id': room_id,
                'Listing_url': room_url,
                'Next_30_days_booked_days': days_booked_next30days,
                'Next_30_to_60_days_booked_days': days_booked_nextnext30days,
                '75_rule_met': rule75_met,
                '55_rule_met': rule55_met,
                'Available_dates_by_year_and_month': available_dates_by_year_and_month,
                'Review_count_by_year_and_month': review_count_by_year_and_month,
                'Warning_level': warning_level,
                'Warning_type': warning_type,
                'Warning_message': warning_message,

                'Rating': overall_rating,
                'Accuracy_rating': accuracy_rating,
                'Checking_rating': checking_rating,
                'Cleanliness_rating': cleanliness_rating,
                'Communication_rating': communication_rating,
                'Location_rating': location_rating,
                'Value_rating': value_rating,

                'Review_count': review_count,
                'Review_months_this_year': list(review_months_this_year),
                'Review_months_last_year': list(review_months_last_year),
                'Missing_review_months_this_year': missing_review_months_this_year,
                'Missing_review_months_last_year': missing_review_months_last_year,
                'Total_missing_review_months_this_year': len(missing_review_months_this_year),
                'Total_missing_review_months_last_year': len(missing_review_months_last_year),
                'Is_superhost': is_superhost,
                'Guest_count': guest_count,
                'Bedroom_count': bedroom_count,
                'Bed_count': bed_count,
                'Bath_count': bath_count,
                'Amenities': amenities,
                'Co_hosts': co_hosts,
                'Highlights': highlights,
                'Is_guest_favorite': is_guest_favorite,
                'Title': title,
                'Latitude': latitude,
                'Longitude': longitude,
                'Grid_index': grid_index,
            }

            chunk_data.append(listing_record)


        except Exception as e:
            # Record the failure and continue with the next listing.
            error_msg = f"Error processing room_id {result.get('room_id', 'unknown')}: {str(e)}"
            print(f"  ‚ö†Ô∏è {error_msg}")
            roomIDs_with_errors.append(result.get('room_id'))
            chunk_errors.append({
                "room_id": result.get('room_id'),
                "error": str(e),
                "index": idx
            })

    # Save this chunk.
    # NOTE(review): chunk_errors is only counted below and is not written to
    # the chunk file, so listings that raised are absent from the saved chunk.
    # Because the file then exists, a re-run skips the whole chunk without
    # retrying them.
    save_chunk(chunk_id, chunk_start, chunk_end - 1, chunk_data)

    # Update progress
    completed_chunks.add(chunk_id)
    save_progress(list(completed_chunks))

    # Update statistics
    total_passed_75 += chunk_passed_75
    total_failed_75 += chunk_failed_75
    total_errors += len(chunk_errors)
    chunks_processed += 1

    print(f"  ‚úÖ Chunk complete: {chunk_passed_75} passed 75%, {chunk_failed_75} failed, {len(chunk_errors)} errors")

# Final report for the chunked enrichment run.
separator_line = "=" * 70

print("\n" + separator_line)
print("‚úÖ CHUNKED ENRICHMENT COMPLETE")
print(separator_line)
print(f"Total chunks processed: {chunks_processed}/{total_chunks}")
print(f"Total passed 75% rule: {total_passed_75}")
print(f"Total failed 75% rule: {total_failed_75}")
print(f"Total errors: {total_errors}")
print(f"üìÅ Chunk files saved in: {OUTPUT_DIR}")
print(separator_line)
print("\nüí° Next step: Run the consolidation cell below to create final Excel file")


# ============================================================================
# CELL 3: Consolidation Function
# ============================================================================
# Copy this as a SEPARATE cell
# Run this AFTER all chunks are processed to create the final Excel file
# ============================================================================

def consolidate_chunks_to_excel():
    """Combine every saved chunk file into a single Excel workbook.

    Reads each ``enrichment_chunk_*.json`` file in ``OUTPUT_DIR`` in sorted
    file-name order, gathers the records under its ``listings`` key, and
    writes the combined table to ``FINAL_OUTPUT``. A file that cannot be
    loaded is reported and skipped.

    Returns:
        pandas.DataFrame: One row per listing record that was loaded.
    """
    print("üîÑ Consolidating all chunks into final Excel...")

    chunk_files = sorted(
        name
        for name in os.listdir(OUTPUT_DIR)
        if name.startswith('enrichment_chunk_') and name.endswith('.json')
    )
    print(f"üìÅ Found {len(chunk_files)} chunk files")

    all_data = []
    for chunk_file in chunk_files:
        try:
            with open(os.path.join(OUTPUT_DIR, chunk_file), 'r', encoding='utf-8') as f:
                listings = json.load(f)['listings']
                all_data.extend(listings)
            print(f"  ‚úÖ Loaded {chunk_file}: {len(listings)} listings")
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"  ‚ùå Error loading {chunk_file}: {e}")

    # Build the table and write it out without the row index.
    merged_df = pd.DataFrame(all_data)
    merged_df.to_excel(FINAL_OUTPUT, index=False)

    print("\n‚úÖ Consolidation complete!")
    print(f"üìä Total listings: {len(merged_df)}")
    print(f"üìÅ Saved to: {FINAL_OUTPUT}")

    return merged_df


In [None]:
# Run consolidation: merge all chunk files into one DataFrame (also written
# to FINAL_OUTPUT by the function) and preview the first rows.
summary_df = consolidate_chunks_to_excel()
summary_df.head()

##### 

https://www.airbnb.com/rooms/1506163384607639418?check_in=2026-01-02&check_out=2026-01-04

In [None]:
# Show the column names of the consolidated summary.
summary_df.columns

In [None]:
# summary_df = pd.concat([summary_df, missing_summary_df], axis=0)

In [None]:
# CSV destination for the consolidated summary; the grid range 1-120 is a
# fixed part of the file name.
output_summary_path = f"Output-Summary-Data/{region_name}_listings_results_grid_1_to_120.csv"
output_summary_path

In [None]:
# Write the consolidated summary to output_summary_path, omitting the row index.
summary_df.to_csv(output_summary_path, index=False)

# Search Room IDs

In [None]:
# Listing URLs used to spot-check the summary.
# NOTE: rooms 1506163384607639418 and 1515200652264910676 each appear twice
# in this list.
test_links = [
    "https://www.airbnb.co.uk/rooms/1506163384607639418?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765407790_P3-0aYGmiG-QRrvX&previous_page_section_name=1001&federated_search_id=1f38d13e-d757-483e-a64d-62a424e4b5f1&guests=1",
    "https://www.airbnb.co.uk/rooms/1269151490151009015?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765408749_P3q2kzaN2fS-Xtfn&previous_page_section_name=1001&federated_search_id=c9fa4f15-85d3-491d-85e9-1c8e82ad7d51&guests=1",
    "https://www.airbnb.co.uk/rooms/613718467814096374?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765409074_P3Ft3C2-JnHomL-q&previous_page_section_name=1001&federated_search_id=462ee2f7-00e0-4af9-825b-086bf9e349e5&guests=1",
    "https://www.airbnb.co.uk/rooms/3208254?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765409076_P3xcYx0o1UWIeJld&previous_page_section_name=1001&federated_search_id=462ee2f7-00e0-4af9-825b-086bf9e349e5&guests=1&check_in=2026-01-05&check_out=2026-01-07",
    "https://www.airbnb.co.uk/rooms/702505738423530993?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765409081_P31wzND1HZQx6--D&previous_page_section_name=1001&federated_search_id=462ee2f7-00e0-4af9-825b-086bf9e349e5&guests=1#availability-calendar",
    "https://www.airbnb.co.uk/rooms/31618189?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765419141_P3A7FAnASZ9vnHOZ&previous_page_section_name=1001&federated_search_id=395191b1-28a2-4ca5-b874-fd9e5e9806ea&guests=1",
    "https://www.airbnb.co.uk/rooms/32017392?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765419142_P3V78M9jeeZTCqzS&previous_page_section_name=1001&federated_search_id=395191b1-28a2-4ca5-b874-fd9e5e9806ea&guests=1&check_in=2025-12-13&check_out=2025-12-15",
    "https://www.airbnb.co.uk/rooms/1014152739375310984?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765420517_P3vHJ6mN1WMqMLDO&previous_page_section_name=1001&federated_search_id=cd9b34b8-7553-47ef-87d4-15e375750603&guests=1",
    "https://www.airbnb.co.uk/rooms/1413329775040945628?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765420850_P3mW5Q6SW84GvZio&previous_page_section_name=1001&federated_search_id=942c6549-8761-4ae3-b927-2c0e33ef169f&guests=1&check_in=2026-01-07&check_out=2026-01-09",
    "https://www.airbnb.co.uk/rooms/1279294796042952467?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765420908_P3S730efQJLvmgwB&previous_page_section_name=1001&federated_search_id=948a60e6-f98a-4b99-8568-c7e8a363f6d9&guests=1#availability-calendar",
    "https://www.airbnb.co.uk/rooms/34838894?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765422549_P3R6UyXWwQAYWEy8&previous_page_section_name=1001&federated_search_id=426f9609-5f1c-461d-91f2-0672fd233efa&guests=1&check_in=2026-01-19&check_out=2026-01-21",
    "https://www.airbnb.co.uk/rooms/1515200652264910676?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765423726_P3aIgNz8v7Du9YKB&previous_page_section_name=1001&federated_search_id=01f47499-8e09-445c-b131-b6f76de501b1&guests=1",
    "https://www.airbnb.co.uk/rooms/1364847844572148487?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765409068_P3ml-kaNRiXKslyO&previous_page_section_name=1001&federated_search_id=15f4969e-18be-4545-99a7-94a76e9866c1&guests=1&check_in=2026-01-02&check_out=2026-01-04",
    "https://www.airbnb.co.uk/rooms/1506163384607639418?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765407790_P3-0aYGmiG-QRrvX&previous_page_section_name=1001&federated_search_id=1f38d13e-d757-483e-a64d-62a424e4b5f1&guests=1",
    "https://www.airbnb.co.uk/rooms/1515200652264910676?location=Napa%2C%20CA&search_mode=regular_search&adults=1&source_impression_id=p3_1765423726_P3aIgNz8v7Du9YKB&previous_page_section_name=1001&federated_search_id=01f47499-8e09-445c-b131-b6f76de501b1&guests=1",
]

# Reduce each link to its numeric listing id: drop everything from the first
# "?" onward, then keep what follows "rooms/".
test_room_ID_list = [
    int(url.split("?")[0].split("rooms/")[1])
    for url in test_links
]

# Print every requested id that is absent from the consolidated summary.
for room_id in test_room_ID_list:
    if room_id in summary_df['Room_id'].values:
        continue
    print(room_id)

# Missing and Debug 

In [None]:
# Listings to enrich one by one in this cell.
test_room_ID_list = [
    935168255357797927,
    1185926597599979261,
    25254687,
]


# When True, checkpoint markers and intermediate values are printed.
debug_mode = False


def _debug(*args):
    """Print the given values only when ``debug_mode`` is enabled."""
    if debug_mode:
        print(*args)


# Set today's date for calculations
today = date.today()
COMPLETE_MONTHS_THIS_YEAR = set(range(1, today.month + 1))
COMPLETE_MONTHS_LAST_YEAR = set(range(1, 13))

print("üî¨ Starting enrichment phase...")
# Report the size of the list this cell actually iterates over.
print(f"Total discovered listings to process: {len(test_room_ID_list)}")
print(f"Strategy: Only enrich listings that pass 75% rule")
print("=" * 70)

# Column order of the resulting DataFrame: the original base columns first,
# followed by the additional per-listing fields in the order they are stored.
summary_columns = [
    'Room_id', 'Listing_url',
    'Next_30_days_booked_days', 'Next_30_to_60_days_booked_days',
    '75_rule_met', '55_rule_met',
    'Warning_level', 'Warning_type', 'Warning_message',
    'Rating', 'Review_count',
    'Review_months_this_year', 'Review_months_last_year',
    'Missing_review_months_this_year', 'Missing_review_months_last_year',
    'Total_missing_review_months_this_year', 'Total_missing_review_months_last_year',
    'Latitude', 'Longitude', 'Grid_index',
    'Available_dates_by_year_and_month', 'Review_count_by_year_and_month',
    'Accuracy_rating', 'Checking_rating', 'Cleanliness_rating',
    'Communication_rating', 'Location_rating', 'Value_rating', 'Overall_rating',
    'Is_superhost', 'Guest_count', 'Bedroom_count', 'Bed_count', 'Bath_count',
    'Amenities', 'Co_hosts', 'Highlights', 'Is_guest_favorite', 'Title',
]

# Rows are collected in a list and turned into a DataFrame once, after the
# loop, instead of concatenating a one-row DataFrame per listing.
summary_rows = []

# Process each listing
listings_passed_75_rule = 0
listings_failed_75_rule = 0
enrichment_errors = []


for idx, room_id in enumerate(test_room_ID_list):
    room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"

    try:
        _debug(f"room_url: {room_url}")

        if (idx + 1) % 10 == 0:
            print(f"  Processing {idx + 1}/{len(test_room_ID_list)}... (75% pass: {listings_passed_75_rule}, skip: {listings_failed_75_rule})")
            # Add random delay every 10 listings to avoid rate limiting
            delay = random.uniform(2, 5)  # Random delay between 2-5 seconds
            time.sleep(delay)

        listing_details = pyairbnb.get_details(room_url=room_url, currency="USD", language="en")

        # Raw dumps of the response are debug output only.
        _debug(type(listing_details))
        _debug("listing_details: ", listing_details)

        if listing_details is None:
            # One missing listing must not abort the remaining ids: record it
            # and move on to the next one.
            print(f"room_url: {room_url} is not found")
            enrichment_errors.append(f"room_id {room_id}: listing details not found")
            continue

        _debug("CP - 0")
        _debug(listing_details.keys())

        # Category Ratings
        rating_info = listing_details['rating']
        accuracy_rating = rating_info['accuracy']
        checking_rating = rating_info['checking']
        cleanliness_rating = rating_info['cleanliness']
        communication_rating = rating_info['communication']
        location_rating = rating_info['location']
        value_rating = rating_info['value']
        overall_rating = rating_info['guest_satisfaction']

        # Review Count
        review_count = rating_info['review_count']

        # Coordinates
        latitude = listing_details['coordinates']['latitude']
        longitude = listing_details['coordinates']['longitude']

        # Superhost: the flag is looked up under two key spellings; None when
        # neither key is present.
        is_superhost = listing_details.get(
            'is_superhost', listing_details.get('is_super_host')
        )

        _debug("CP - 5")
        _debug("sub_description : ", listing_details['sub_description'])

        # Configuration (guest, bedroom, bed, bath counts)
        configuration = listing_details['sub_description']['items']
        _debug("CP - 6")

        guest_count, bedroom_count, bed_count, bath_count = get_configuration(configuration)

        _debug("configuration: ", configuration)
        _debug("guest_count, bedroom_count, bed_count, bath_count: ", guest_count, bedroom_count, bed_count, bath_count)
        _debug("CP - 7")

        # Amenities
        amenities = listing_details['amenities']
        _debug("CP - 8")

        # Cohost
        co_hosts = listing_details['co_hosts']
        _debug("CP - 9")

        # Highlights
        highlights = listing_details['highlights']
        _debug("CP - 10")

        # Is Guest Favorite
        is_guest_favorite = listing_details['is_guest_favorite']
        _debug("CP - 11")

        # Title
        title = listing_details['title']
        _debug("CP - 12")

        # Grid Index: this cell starts from bare room ids, so there is no
        # search result to read a grid index from.
        grid_index = None

        available_dates_by_year_and_month = get_available_dates_by_year_and_month(listing_details)

        days_booked_next30days = 30 - get_available_dates_75Rule(available_dates_by_year_and_month)
        days_booked_nextnext30days = 30 - get_available_dates_55Rule(available_dates_by_year_and_month)

        rule75_met = days_booked_next30days >= 22
        rule55_met = days_booked_nextnext30days >= 16

        # Update the counters that the final summary reports.
        if rule75_met:
            listings_passed_75_rule += 1
        else:
            listings_failed_75_rule += 1

        # Reviews analysis
        review_months = analyze_reviews(room_id)
        review_months_this_year = review_months.get(today.year, set())
        review_months_last_year = review_months.get(today.year - 1, set())

        warning_level, warning_type, warning_message = check_reviews_count(
            review_months_this_year=review_months_this_year,
            review_months_last_year=review_months_last_year,
            leniency=3
        )

        missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year)
        missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)

        ##### New Approach
        review_count_by_year_and_month = get_review_count_by_year_and_month(listing_details['reviews'])

        # Add to the collected rows
        summary_rows.append({
            'Room_id': room_id,
            'Listing_url': room_url,

            'Next_30_days_booked_days': days_booked_next30days,
            'Next_30_to_60_days_booked_days': days_booked_nextnext30days,
            '75_rule_met': rule75_met,
            '55_rule_met': rule55_met,

            'Available_dates_by_year_and_month': available_dates_by_year_and_month,
            'Review_count_by_year_and_month': review_count_by_year_and_month,

            'Warning_level': warning_level,
            'Warning_type': warning_type,
            'Warning_message': warning_message,

            'Rating': overall_rating,
            'Accuracy_rating': accuracy_rating,
            'Checking_rating': checking_rating,
            'Cleanliness_rating': cleanliness_rating,
            'Communication_rating': communication_rating,
            'Location_rating': location_rating,
            'Value_rating': value_rating,
            'Overall_rating': overall_rating,

            'Review_count': review_count,
            'Review_months_this_year': list(review_months_this_year),
            'Review_months_last_year': list(review_months_last_year),
            'Missing_review_months_this_year': missing_review_months_this_year,
            'Missing_review_months_last_year': missing_review_months_last_year,
            'Total_missing_review_months_this_year': len(missing_review_months_this_year),
            'Total_missing_review_months_last_year': len(missing_review_months_last_year),

            'Is_superhost': is_superhost,
            'Guest_count': guest_count,
            'Bedroom_count': bedroom_count,
            'Bed_count': bed_count,
            'Bath_count': bath_count,

            'Amenities': amenities,
            'Co_hosts': co_hosts,
            'Highlights': highlights,
            'Is_guest_favorite': is_guest_favorite,
            'Title': title,

            'Latitude': latitude,
            'Longitude': longitude,
            'Grid_index': grid_index,
        })

    except Exception as e:
        # Report the id being processed in this loop, then continue with the
        # next listing.
        error_msg = f"Error processing room_id {room_id}: {str(e)}"
        print(f"  ‚ö†Ô∏è {error_msg}")
        enrichment_errors.append(error_msg)

# Build the summary once; the explicit column list keeps the columns present
# even when no listing was enriched.
missing_summary_df = pd.DataFrame(summary_rows, columns=summary_columns)

print("\n" + "=" * 70)
print("‚úÖ ENRICHMENT COMPLETE")
print("=" * 70)
print(f"Total discovered listings: {len(test_room_ID_list)}")
print(f"Passed 75% rule (enriched): {listings_passed_75_rule}")
print(f"Failed 75% rule (skipped): {listings_failed_75_rule}")
print(f"Errors during enrichment: {len(enrichment_errors)}")
print(f"Final qualified listings: {len(missing_summary_df)}")
print("=" * 70)


In [None]:
# Display the rows enriched by the "Missing and Debug" cell.
missing_summary_df

In [None]:
# Ad-hoc lookup: fetch the details of a single listing by its id.
room_id = "805725215071886985"
room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"
listing_details = pyairbnb.get_details(room_url=room_url, currency="USD", language="en")

In [None]:
# Print a listing URL for every requested id that did not make it into
# missing_summary_df. The id column is materialised once as a set, and the
# loop variable no longer shadows the builtin ``id``.
enriched_room_ids = set(missing_summary_df['Room_id'].tolist())
for missing_room_id in test_room_ID_list:
    if missing_room_id not in enriched_room_ids:
        print(f"www.airbnb.com/rooms/{missing_room_id}")

In [None]:
# Show the column names of the individually enriched listings.
missing_summary_df.columns

In [None]:
# Stack the chunked summary and the individually enriched listings into one
# table with a fresh 0..n-1 row index.
final_summary_df = pd.concat(
    [summary_df, missing_summary_df],
    axis=0,
    ignore_index=True,
)
print(final_summary_df.columns)
final_summary_df.shape

In [None]:
# ============================================================================
# Recalculate 75% and 55% Rules from Available_dates_by_year_and_month
# ============================================================================
#
# Recomputes the booked-day counts for two consecutive 30-day windows from the
# calendar data stored in final_summary_df, re-derives the two rule flags, and
# finally displays every listing whose stored counts disagree with the
# recomputed ones.

from datetime import date, timedelta
import calendar

# Get today's date
today = date.today()

# Calculate date ranges (both bounds are inclusive)
next_30_days_start = today
next_30_days_end = today + timedelta(days=29)  # Days 0-29 (30 days total)

next_31_to_60_start = today + timedelta(days=30)  # Day 30
next_31_to_60_end = today + timedelta(days=59)    # Day 59 (30 days total)

print("üìÖ Calculating available days from stored calendar data...")
print(f"Today: {today}")
print(f"Next 30 days: {next_30_days_start} to {next_30_days_end}")
print(f"Days 31-60: {next_31_to_60_start} to {next_31_to_60_end}")
print("=" * 70)

# Count the available days per listing in each window
final_summary_df['Next_30_days_available_days_NEW'] = final_summary_df['Available_dates_by_year_and_month'].apply(
    lambda x: count_available_days_in_range(x, next_30_days_start, next_30_days_end)
)

final_summary_df['Next_30_to_60_days_available_days_NEW'] = final_summary_df['Available_dates_by_year_and_month'].apply(
    lambda x: count_available_days_in_range(x, next_31_to_60_start, next_31_to_60_end)
)

# Calculate booked days (30 - available = booked); 30 is the window length above
final_summary_df['Next_30_days_booked_days_NEW'] = 30 - final_summary_df['Next_30_days_available_days_NEW']
final_summary_df['Next_30_to_60_days_booked_days_NEW'] = 30 - final_summary_df['Next_30_to_60_days_available_days_NEW']

# Recalculate 75% and 55% rules with NEW data
# 75% rule: 75% of 30 days is 22.5, i.e. >22 booked days; with leniency=1 that is >21.
final_summary_df['75_rule_met_NEW'] = final_summary_df['Next_30_days_booked_days_NEW'] > 21

# 55% rule: cutoff is >15 booked days (stated as ">17 with leniency=2").
# NOTE(review): 55% of 30 days is 16.5, so the strict threshold would be >16;
# confirm that >15 matches what check_55_rule applies.
final_summary_df['55_rule_met_NEW'] = final_summary_df['Next_30_to_60_days_booked_days_NEW'] > 15

# Columns of interest when eyeballing old vs. new values (not used below).
preview_cols = [
    'Room_id',

    'Next_30_days_available_days_NEW',
    'Next_30_days_booked_days_NEW',

    'Next_30_to_60_days_available_days_NEW',
    'Next_30_to_60_days_booked_days_NEW',

    '75_rule_met_NEW',
    '55_rule_met_NEW',

    'Next_30_days_booked_days',
    'Next_30_to_60_days_booked_days',
]

# Difference between the stored and the recomputed booked-day counts
final_summary_df['CHECK_NEXT_30'] = final_summary_df['Next_30_days_booked_days'] - final_summary_df['Next_30_days_booked_days_NEW']
final_summary_df['CHECK_NEXT_30_TO_60'] = final_summary_df['Next_30_to_60_days_booked_days'] - final_summary_df['Next_30_to_60_days_booked_days_NEW']

check_df = final_summary_df[[
    'Room_id',
    'CHECK_NEXT_30',
    'CHECK_NEXT_30_TO_60'
]]

# A listing is a discrepancy when EITHER window disagrees. Requiring both
# windows to be non-zero would hide listings that differ in only one of them.
# NaN differences (no stored value to compare against) are not counted.
has_diff_next_30 = check_df['CHECK_NEXT_30'].notna() & (check_df['CHECK_NEXT_30'] != 0)
has_diff_next_30_to_60 = check_df['CHECK_NEXT_30_TO_60'].notna() & (check_df['CHECK_NEXT_30_TO_60'] != 0)

check_df[has_diff_next_30 | has_diff_next_30_to_60]


In [None]:
# Two back-to-back 30-day windows measured from `today`:
# day offsets 0-29, then day offsets 30-59 (bounds inclusive).
next_30_days_start, next_30_days_end = today, today + timedelta(days=29)
next_31_to_60_start, next_31_to_60_end = (
    today + timedelta(days=30),
    today + timedelta(days=59),
)

In [None]:
# Take the stored Available_dates_by_year_and_month value of one listing and
# count how many of its available days fall inside the first window.
room_mask = final_summary_df['Room_id'] == 805725215071886985
dates_dict = final_summary_df.loc[room_mask, 'Available_dates_by_year_and_month'].values[0]

print("next_30_days_start: ", next_30_days_start)
print("next_30_days_end: ", next_30_days_end)

count_available_days_in_range(dates_dict, next_30_days_start, next_30_days_end)

In [None]:
# Same spot check for the second window; reuses `dates_dict` assigned in the
# previous cell.
print("next_31_to_60_start: ", next_31_to_60_start)
print("next_31_to_60_end: ", next_31_to_60_end)

count_available_days_in_range(dates_dict, next_31_to_60_start, next_31_to_60_end)

In [None]:
# Show every summary row recorded for one specific listing.
dd = final_summary_df.loc[final_summary_df['Room_id'] == 1400836150848230066]

dd

In [None]:
# Export the combined summary to CSV.
# NOTE(review): the grid range 1-100 is hard-coded into the file name.
output_summary_path = f"Output-Summary-Data/{region_name}_results_grid_{1}_to_{100}.csv"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(os.path.dirname(output_summary_path), exist_ok=True)
final_summary_df.to_csv(output_summary_path, index=False)

In [None]:
# # ============================================================================
# # PHASE 2: ENRICHMENT - Apply 75%/55% Rules & Review Analysis
# # ============================================================================

# # Set today's date for calculations
# today = date.today()
# COMPLETE_MONTHS_THIS_YEAR = set([m for m in range(1, today.month+1)])
# COMPLETE_MONTHS_LAST_YEAR = set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

# print("üî¨ Starting enrichment phase...")
# print(f"Total discovered listings to process: {len(all_results)}")
# print(f"Strategy: Only enrich listings that pass 75% rule")
# print("=" * 70)

# # Initialize summary DataFrame
# summary_df = pd.DataFrame(columns=[
#     'Room_id', 'Listing_url', 
#     'Next_30_days_booked_days', 'Next_30_to_60_days_booked_days',
#     '75_rule_met', '55_rule_met',
#     'Warning_level', 'Warning_type', 'Warning_message',
#     'Rating', 'Review_count',
#     'Review_months_this_year', 'Review_months_last_year',
#     'Missing_review_months_this_year', 'Missing_review_months_last_year',
#     'Total_missing_review_months_this_year', 'Total_missing_review_months_last_year',
#     'Latitude', 'Longitude', 'Grid_index'
# ])

# # Process each discovered listing
# listings_passed_75_rule = 0
# listings_failed_75_rule = 0
# enrichment_errors = []

# for idx, result in enumerate(all_results):
#     try:
#         room_id = result['room_id']
#         room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"
        
#         if (idx + 1) % 10 == 0:
#             print(f"  Processing {idx + 1}/{len(all_results)}... (75% pass: {listings_passed_75_rule}, skip: {listings_failed_75_rule})")
#             # Add random delay every 10 listings to avoid rate limiting
#             delay = random.uniform(2, 5)  # Random delay between 2-5 seconds
#             # print(f"  ‚è∏Ô∏è  Pausing for {delay:.1f}s to avoid rate limits...")
#             time.sleep(delay)
        
#         # Extract basic info
#         # This is for Chicago
#         if 'rating' not in result.keys():
#             rating = result['raw_data']['rating']['value']
#             review_count = result['raw_data']['rating']['reviewCount']
#             latitude = result['raw_data']['coordinates']['latitude']
#             longitude = result['raw_data']['coordinates']['longitud']
#         else:
#             rating = result['rating']['value']
#             review_count = result['rating']['reviewCount']
#             latitude = result['coordinates']['latitude']
#             longitude = result['coordinates']['longitud']
            
#         grid_index = result.get('grid_index', None)
        
#         # Get calendar data (3 months)
#         calendar_info = get_calendar(room_id=room_id)
#         current_month_calendar = calendar_info[0]
#         next_month_calendar = calendar_info[1]
#         next_two_months_calendar = calendar_info[2]
        
#         curr_month_available_dates = get_available_dates(current_month_calendar)
#         next_month_available_dates = get_available_dates(next_month_calendar)
#         next_two_months_available_dates = get_available_dates(next_two_months_calendar)
        
#         # Check 75% rule (next 30 days, leniency=2)
#         rule75_met, days_booked_next30days = check_75_rule(
#             curr_month_available_dates=curr_month_available_dates,
#             next_month_available_dates=next_month_available_dates,
#             leniency=2
#         )
        
#         # Process every listing; the 75% rule only updates the pass counter
#         if True:  # Get all info regardless of 75% rule
#             if rule75_met:
#                 listings_passed_75_rule += 1
            
#             # Check 55% rule (days 31-60, leniency=2)
#             rule55_met, days_booked_nextnext30days = check_55_rule(
#                 next_month_available_dates=next_month_available_dates,
#                 next_two_months_available_dates=next_two_months_available_dates,
#                 leniency=2
#             )
            
#             # Get reviews data
#             review_months = analyze_reviews(room_id)
            
#             if today.year in review_months:
#                 review_months_this_year = review_months[today.year]
#             else:
#                 review_months_this_year = set()
            
#             if today.year - 1 in review_months:
#                 review_months_last_year = review_months[today.year - 1]
#             else:
#                 review_months_last_year = set()
            
#             # Check review frequency (leniency=3)
#             warning_level, warning_type, warning_message = check_reviews_count(
#                 review_months_this_year=review_months_this_year,
#                 review_months_last_year=review_months_last_year,
#                 leniency=3
#             )
            
#             missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year)
#             missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)
            
#             # Add to summary DataFrame
#             new_row = pd.DataFrame([{
#                 'Room_id': room_id,
#                 'Listing_url': room_url,
#                 'Next_30_days_booked_days': days_booked_next30days,
#                 'Next_30_to_60_days_booked_days': days_booked_nextnext30days,
#                 '75_rule_met': rule75_met,
#                 '55_rule_met': rule55_met,
#                 'Warning_level': warning_level,
#                 'Warning_type': warning_type,
#                 'Warning_message': warning_message,
#                 'Rating': rating,
#                 'Review_count': review_count,
#                 'Review_months_this_year': list(review_months_this_year),
#                 'Review_months_last_year': list(review_months_last_year),
#                 'Missing_review_months_this_year': missing_review_months_this_year,
#                 'Missing_review_months_last_year': missing_review_months_last_year,
#                 'Total_missing_review_months_this_year': len(missing_review_months_this_year),
#                 'Total_missing_review_months_last_year': len(missing_review_months_last_year),
#                 'Latitude': latitude,
#                 'Longitude': longitude,
#                 'Grid_index': grid_index,
#             }])
            
#             summary_df = pd.concat([summary_df, new_row], ignore_index=True)
            
#         else:
#             # 75% rule not met - skip this listing
#             listings_failed_75_rule += 1
            
#     except Exception as e:
#         error_msg = f"Error processing room_id {result.get('room_id', 'unknown')}: {str(e)}"
#         print(f"  ‚ö†Ô∏è {error_msg}")
#         enrichment_errors.append(error_msg)

# print("\n" + "=" * 70)
# print("‚úÖ ENRICHMENT COMPLETE")
# print("=" * 70)
# print(f"Total discovered listings: {len(all_results)}")
# print(f"Passed 75% rule (enriched): {listings_passed_75_rule}")
# print(f"Failed 75% rule (skipped): {listings_failed_75_rule}")
# print(f"Errors during enrichment: {len(enrichment_errors)}")
# print(f"Final qualified listings: {len(summary_df)}")
# print("=" * 70)


In [None]:
# save_filename = f"./Output-Summary-Data/{region_name}_results_grid_{grid_start_index}_to_{grid_end_index}.csv"
# summary_df.to_csv(save_filename, index=False)
# print(f"Saved to: {save_filename}")

# Step 1: discover all listings

In [None]:
# # ============================================================================
# # OPTION A: USE IMPROVED DISCOVERY SYSTEM (RECOMMENDED)
# # ============================================================================

# from discovery import discover_all_grids, DiscoveryConfig, BBox, AirbnbDiscoveryEngine

# # Configure discovery parameters
# discovery_config = DiscoveryConfig(
#     # Rate limiting (conservative to avoid bans)
#     requests_per_minute=12,
    
#     # Subdivision strategy
#     max_results_before_subdivide=280,  # Subdivide if we hit ~280+ results (likely capped)
#     min_bbox_size_degrees=0.001,  # Stop subdividing below ~100m
#     max_subdivision_depth=4,
    
#     # Multi-pass strategy (catches rotated listings)
#     num_discovery_passes=2,  # Run 2 passes with different date windows
#     alternate_checkin_offsets=[14, 30],  # Try +14 and +30 days ahead

#     # User-agent rotation (anti-detection)
#     rotate_user_agents=True,
    
#     # Search parameters
#     price_min=300,
#     price_max=10000,
#     currency="USD",
    
#     # Caching for resume capability
#     cache_dir=f"Data/{region_name}/discovery_cache",
#     enable_cache=False,
    
#     # Stats output
#     stats_file=f"Data/{region_name}/discovery_stats.json",
# )

# print("üîç Starting improved discovery system...")
# print(f"Region: {region_name}")
# print(f"Grid cells to search: {len(gird_coords_df)}")
# print(f"Discovery passes per cell: {discovery_config.num_discovery_passes}")
# print("=" * 70)

# # Run discovery across all grids
# discovered_listings, engine = discover_all_grids(gird_coords_df, region_name, discovery_config)

# print(f"\n‚úÖ Discovery complete!")
# print(f"üìä Total unique listings: {len(discovered_listings)}")
# print(f"üìÅ Saved to: Data/{region_name}/discovered_listings.json")
# print(f"üìà Stats saved to: {discovery_config.stats_file}")

# # Convert discoveries to format compatible with enrichment pipeline
# all_results = [listing.raw_data for listing in discovered_listings.values()]
# print(f"\n‚úì Ready for enrichment pipeline with {len(all_results)} listings")


# Step 2: run analysis based on the discovered listings

In [None]:
# # ============================================================================
# # PHASE 2: ENRICHMENT - Apply 75%/55% Rules & Review Analysis
# # ============================================================================

# # Set today's date for calculations
# today = date.today()
# COMPLETE_MONTHS_THIS_YEAR = set([m for m in range(1, today.month+1)])
# COMPLETE_MONTHS_LAST_YEAR = set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

# print("üî¨ Starting enrichment phase...")
# print(f"Total discovered listings to process: {len(all_results)}")
# print(f"Strategy: Only enrich listings that pass 75% rule")
# print("=" * 70)

# # Initialize summary DataFrame
# summary_df = pd.DataFrame(columns=[
#     'Room_id', 'Listing_url', 
#     'Next_30_days_booked_days', 'Next_30_to_60_days_booked_days',
#     '75_rule_met', '55_rule_met',
#     'Warning_level', 'Warning_type', 'Warning_message',
#     'Rating', 'Review_count',
#     'Review_months_this_year', 'Review_months_last_year',
#     'Missing_review_months_this_year', 'Missing_review_months_last_year',
#     'Total_missing_review_months_this_year', 'Total_missing_review_months_last_year',
#     'Latitude', 'Longitude', 'Grid_index'
# ])

# # Process each discovered listing
# listings_passed_75_rule = 0
# listings_failed_75_rule = 0
# enrichment_errors = []

# for idx, result in enumerate(all_results):
#     try:
#         room_id = result['room_id']
#         room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"
        
#         if (idx + 1) % 10 == 0:
#             print(f"  Processing {idx + 1}/{len(all_results)}... (75% pass: {listings_passed_75_rule}, skip: {listings_failed_75_rule})")
#             # Add random delay every 10 listings to avoid rate limiting
#             delay = random.uniform(2, 5)  # Random delay between 2-5 seconds
#             # print(f"  ‚è∏Ô∏è  Pausing for {delay:.1f}s to avoid rate limits...")
#             time.sleep(delay)
        
#         # Extract basic info
#         rating = result['rating']['value']
#         review_count = result['rating']['reviewCount']
#         latitude = result['coordinates']['latitude']
#         longitude = result['coordinates']['longitud']
#         grid_index = result.get('grid_index', None)
        
#         # Get calendar data (3 months)
#         calendar_info = get_calendar(room_id=room_id)
#         current_month_calendar = calendar_info[0]
#         next_month_calendar = calendar_info[1]
#         next_two_months_calendar = calendar_info[2]
        
#         curr_month_available_dates = get_available_dates(current_month_calendar)
#         next_month_available_dates = get_available_dates(next_month_calendar)
#         next_two_months_available_dates = get_available_dates(next_two_months_calendar)
        
#         # Check 75% rule (next 30 days, leniency=2)
#         rule75_met, days_booked_next30days = check_75_rule(
#             curr_month_available_dates=curr_month_available_dates,
#             next_month_available_dates=next_month_available_dates,
#             leniency=2
#         )
        
#         # ONLY PROCESS IF 75% RULE IS MET
#         if rule75_met:
#             listings_passed_75_rule += 1
            
#             # Check 55% rule (days 31-60, leniency=2)
#             rule55_met, days_booked_nextnext30days = check_55_rule(
#                 next_month_available_dates=next_month_available_dates,
#                 next_two_months_available_dates=next_two_months_available_dates,
#                 leniency=2
#             )
            
#             # Get reviews data
#             review_months = analyze_reviews(room_id)
            
#             if today.year in review_months:
#                 review_months_this_year = review_months[today.year]
#             else:
#                 review_months_this_year = set()
            
#             if today.year - 1 in review_months:
#                 review_months_last_year = review_months[today.year - 1]
#             else:
#                 review_months_last_year = set()
            
#             # Check review frequency (leniency=3)
#             warning_level, warning_type, warning_message = check_reviews_count(
#                 review_months_this_year=review_months_this_year,
#                 review_months_last_year=review_months_last_year,
#                 leniency=3
#             )
            
#             missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year)
#             missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)
            
#             # Add to summary DataFrame
#             new_row = pd.DataFrame([{
#                 'Room_id': room_id,
#                 'Listing_url': room_url,
#                 'Next_30_days_booked_days': days_booked_next30days,
#                 'Next_30_to_60_days_booked_days': days_booked_nextnext30days,
#                 '75_rule_met': rule75_met,
#                 '55_rule_met': rule55_met,
#                 'Warning_level': warning_level,
#                 'Warning_type': warning_type,
#                 'Warning_message': warning_message,
#                 'Rating': rating,
#                 'Review_count': review_count,
#                 'Review_months_this_year': list(review_months_this_year),
#                 'Review_months_last_year': list(review_months_last_year),
#                 'Missing_review_months_this_year': missing_review_months_this_year,
#                 'Missing_review_months_last_year': missing_review_months_last_year,
#                 'Total_missing_review_months_this_year': len(missing_review_months_this_year),
#                 'Total_missing_review_months_last_year': len(missing_review_months_last_year),
#                 'Latitude': latitude,
#                 'Longitude': longitude,
#                 'Grid_index': grid_index,
#             }])
            
#             summary_df = pd.concat([summary_df, new_row], ignore_index=True)
            
#         else:
#             # 75% rule not met - skip this listing
#             listings_failed_75_rule += 1
            
#     except Exception as e:
#         error_msg = f"Error processing room_id {result.get('room_id', 'unknown')}: {str(e)}"
#         print(f"  ‚ö†Ô∏è {error_msg}")
#         enrichment_errors.append(error_msg)

# print("\n" + "=" * 70)
# print("‚úÖ ENRICHMENT COMPLETE")
# print("=" * 70)
# print(f"Total discovered listings: {len(all_results)}")
# print(f"Passed 75% rule (enriched): {listings_passed_75_rule}")
# print(f"Failed 75% rule (skipped): {listings_failed_75_rule}")
# print(f"Errors during enrichment: {len(enrichment_errors)}")
# print(f"Final qualified listings: {len(summary_df)}")
# print("=" * 70)


In [None]:
# # ============================================================================
# # SAVE RESULTS TO EXCEL
# # ============================================================================

# output_filename = f"Data/{region_name}_enriched_listings_75_55_rule.xlsx"

# print(f"üíæ Saving results to Excel...")
# print(f"   File: {output_filename}")
# print(f"   Rows: {len(summary_df)}")

# summary_df.to_excel(output_filename, index=False)

# print(f"‚úÖ Saved successfully!")
# print()
# print("üìä Summary DataFrame preview:")
# display(summary_df.head())


In [None]:
# # ============================================================================
# # ANALYSIS: View Statistics on Enriched Listings
# # ============================================================================

# print("=" * 70)
# print("üìà ENRICHED LISTINGS ANALYSIS")
# print("=" * 70)

# if len(summary_df) > 0:
#     print(f"\nüéØ Occupancy Rules:")
#     print(f"   Listings meeting 75% rule: {summary_df['75_rule_met'].sum()} (100% - by design)")
#     print(f"   Listings meeting 55% rule: {summary_df['55_rule_met'].sum()} ({100*summary_df['55_rule_met'].sum()/len(summary_df):.1f}%)")
    
#     print(f"\nüìä Booking Statistics:")
#     print(f"   Avg days booked (next 30 days): {summary_df['Next_30_days_booked_days'].mean():.1f}")
#     print(f"   Avg days booked (days 31-60): {summary_df['Next_30_to_60_days_booked_days'].mean():.1f}")
    
#     print(f"\n‚≠ê Rating Statistics:")
#     print(f"   Avg rating: {summary_df['Rating'].mean():.2f}")
#     print(f"   Avg review count: {summary_df['Review_count'].mean():.1f}")
    
#     print(f"\n‚ö†Ô∏è Review Warnings:")
#     high_warnings = summary_df['Warning_level'].value_counts().get('High', 0)
#     print(f"   Listings with High warnings: {high_warnings} ({100*high_warnings/len(summary_df):.1f}%)")
    
#     if high_warnings > 0:
#         print(f"\n   Warning breakdown:")
#         warning_types = summary_df[summary_df['Warning_level'] == 'High']['Warning_type'].value_counts()
#         for warning_type, count in warning_types.items():
#             print(f"     - {warning_type}: {count}")
    
#     print(f"\nüóìÔ∏è Review Coverage:")
#     print(f"   Avg months with reviews (this year): {summary_df['Review_months_this_year'].apply(len).mean():.1f}")
#     print(f"   Avg months with reviews (last year): {summary_df['Review_months_last_year'].apply(len).mean():.1f}")
    
# else:
#     print("\n‚ö†Ô∏è No listings passed the 75% rule!")

# print("\n" + "=" * 70)


In [None]:
# today = date.today()
# COMPLETE_MONTHS_THIS_YEAR = set([m for m in range(1, today.month+1)])
# COMPLETE_MONTHS_LAST_YEAR = set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

In [None]:
# search_results_dir = f"Data/{region_name}_search_all_results"
# warnings_dir = f"Data/{region_name}_warnings"

# os.makedirs(search_results_dir, exist_ok=True)
# os.makedirs(warnings_dir, exist_ok=True)

# # Initialize warnings file
# warnings_file = os.path.join(warnings_dir, "search_all_warnings.txt")
# with open(warnings_file, 'w', encoding='utf-8') as f:
#     f.write("=== Grid Index Mismatch Warnings ===\n")
#     f.write(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")


# all_results = []

# for index, (ne_lat, ne_long, sw_lat, sw_long) in enumerate(bounding_boxes):
#     grid_index = index + 1

#     results = search_all(
#         check_in="",
#         check_out= "",
#         ne_lat=ne_lat, ne_long=ne_long,
#         sw_lat=sw_lat, sw_long=sw_long,
#         zoom_value=0, # zoom_value has no effect on the results
#         price_min=300,
#         price_max=10000,
#         currency="USD"
#     )
#     print(f"Grid index {index} : Found {len(results)} results.")
    
#     # Save each result as individual JSON files
#     for result_idx, result in enumerate(results):

#         room_id = result['room_id']
#         res_lat = result['coordinates']['latitude']
#         res_lon = result['coordinates']['longitud']
#         grid_verify = find_grid_for_coordinate(res_lat, res_lon)

#         if not grid_verify:
#             warning = (f"‚ö†Ô∏è Custom Error: Room ID ({room_id}) do not belong to any grid in gird_coords_df!!!!! latitude: {res_lat} and longitude: {res_lon}")
#             print(f"\n{warning}\n")

#         if grid_verify != grid_index:
#             warning = f"‚ö†Ô∏è Custom Warning: Grid Number Mistmatch in room ID ({room_id}) : grid_index ({grid_index}) != grid_verify ({grid_verify}). !!!!!\n grid_index is corrected to be {grid_verify}."
#             print(f"\n{warning}\n")

#             # Log warning to file
#             with open(warnings_file, 'a', encoding='utf-8') as f:
#                 f.write(f"Grid {grid_index} -> {grid_verify}: Room ID {room_id} at ({res_lat}, {res_lon})\n")

#             grid_index = grid_verify # Correct grid index

#         result["grid_index"] = grid_index
#         filename = f"grid_{grid_index}_listing_{room_id}.json"
#         filepath = os.path.join(search_results_dir, filename)
        
#         # try:
#         #     with open(filepath, 'w', encoding='utf-8') as f:
#         #         json.dump(result, f, indent=2, ensure_ascii=False)
#         # except Exception as e:
#         #     print(f"  Error saving {filename}: {e}")

#         all_results.append(result)
    
#     # all_results.extend(results)


# # Add summary to warnings file
# with open(warnings_file, 'a', encoding='utf-8') as f:
#     f.write(f"\n=== Summary ===\n")
#     f.write(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
#     f.write(f"Total results processed: {len(all_results)}\n")


# print("="*50)
# print(f"Total number of results: {len(all_results)}")
# print(f"Results saved to: {search_results_dir}")

In [None]:
# folder_path = search_results_dir
# all_results = load_grid_json_files(folder_path)
# print(f"Loaded {len(all_results)} files.")

In [None]:
# # Chunked Processing Configuration
# CHUNK_SIZE = 20
# PROCESSED_DIR = "Data/listing_analysis"
# PROGRESS_FILE = os.path.join(PROCESSED_DIR, "progress.json")

# # Create directory for processed chunks
# os.makedirs(PROCESSED_DIR, exist_ok=True)



# def save_chunk(chunk_id, listings_data, failed_listings, start_idx, end_idx):
#     """Save a chunk of processed listings to JSON file"""
#     chunk_data = {
#         "chunk_info": {
#             "chunk_id": chunk_id,
#             "start_index": start_idx,
#             "end_index": end_idx,
#             "processed_at": datetime.now().isoformat(),
#             "total_listings": len(listings_data) + len(failed_listings),
#             "successful_listings": len(listings_data),
#             "failed_listings": len(failed_listings)
#         },
#         "listings": listings_data,
#         "failed_listings": failed_listings
#     }
    
#     chunk_filename = f"chunk_{chunk_id:03d}.json"
#     chunk_filepath = os.path.join(PROCESSED_DIR, chunk_filename)
    
#     try:
#         with open(chunk_filepath, 'w', encoding='utf-8') as f:
#             json.dump(chunk_data, f, indent=2, ensure_ascii=False)
#         return True
#     except Exception as e:
#         print(f"‚ùå Error saving chunk {chunk_id}: {e}")
#         return False



# def update_progress(chunk_id, total_chunks):
#     """Update progress tracking file"""
#     progress_data = {
#         "total_listings": len(all_results),
#         "chunk_size": CHUNK_SIZE,
#         "total_chunks": total_chunks,
#         "completed_chunks": list(range(chunk_id + 1)),
#         "current_chunk": chunk_id,
#         "last_updated": datetime.now().isoformat()
#     }
    
#     try:
#         with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
#             json.dump(progress_data, f, indent=2, ensure_ascii=False)
#     except Exception as e:
#         print(f"‚ö†Ô∏è Warning: Could not update progress file: {e}")



# def load_progress():
#     """Load existing progress if available"""
#     if os.path.exists(PROGRESS_FILE):
#         try:
#             with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
#                 return json.load(f)
#         except Exception as e:
#             print(f"‚ö†Ô∏è Warning: Could not load progress file: {e}")
#     return None



# def process_listing(result, idx):
#     """Process a single listing and return the result or error"""
#     try:
#         room_id = result['room_id']
#         room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"

#         # print(f"  Processing Index: {idx}, Room ID: {room_id}")

#         rating = result['rating']['value']
#         review_count = result['rating']['reviewCount']
#         latitude = result['coordinates']['latitude']
#         longitude = result['coordinates']['longitud']
#         grid_index = result['grid_index']
        
#         calendar_info = get_calendar(room_id=room_id)
#         current_month_calendar = calendar_info[0]
#         next_month_calendar = calendar_info[1]
#         next_two_months_calendar = calendar_info[2]

#         curr_month_available_dates = get_available_dates(current_month_calendar)
#         next_month_available_dates = get_available_dates(next_month_calendar)
#         next_two_months_available_dates = get_available_dates(next_two_months_calendar)

#         rule75_met, days_booked_next30days = check_75_rule(
#             curr_month_available_dates = curr_month_available_dates, 
#             next_month_available_dates = next_month_available_dates,
#             leniency=2
#         )

#         rule55_met, days_booked_nextnext30days = check_55_rule(
#             next_month_available_dates = next_month_available_dates,
#             next_two_months_available_dates = next_two_months_available_dates,
#             leniency=2
#         )

#         # Only process if 75 rule is met
#         if rule75_met:
#             review_months = analyze_reviews(room_id)

#             if today.year in review_months:
#                 review_months_this_year = review_months[today.year]
#             else:
#                 review_months_this_year = set()

#             if today.year - 1 in review_months:
#                 review_months_last_year = review_months[today.year - 1]
#             else:
#                 review_months_last_year = set()

#             # Review count check
#             warning_level, warning_type, warning_message = check_reviews_count(
#                 review_months_this_year = review_months_this_year,
#                 review_months_last_year = review_months_last_year, 
#                 leniency=3
#             )

#             missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year) 
#             missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)

#             listing_data = {
#                 "room_id": room_id,
#                 "listing_url": room_url,
#                 "next_30_days_booked_days": days_booked_next30days,
#                 "next_30_to_60_days_booked_days": days_booked_nextnext30days,
#                 "rule_75_met": rule75_met,
#                 "rule_55_met": rule55_met,
#                 "warning_level": warning_level,
#                 "warning_type": warning_type,
#                 "warning_message": warning_message,
#                 "rating": rating,
#                 "review_count": review_count,
#                 "review_months_this_year": list(review_months_this_year),
#                 "review_months_last_year": list(review_months_last_year),
#                 "missing_review_months_this_year": missing_review_months_this_year,
#                 "missing_review_months_last_year": missing_review_months_last_year,
#                 "total_missing_review_months_this_year": len(missing_review_months_this_year),
#                 "total_missing_review_months_last_year": len(missing_review_months_last_year),
#                 "latitude": latitude,
#                 "longitude": longitude,
#                 "grid_index": grid_index,
                
#                 "processing_status": "success"
#             }
            
#             return listing_data, None
#         else:
#             return None, None  # Skip listings that don't meet 75 rule
            
#     except Exception as e:
#         error_data = {
#             "room_id": result.get('room_id', f'unknown_{idx}'),
#             "error_message": str(e),
#             "processing_status": "failed"
#         }
#         return None, error_data

# print(f"📁 Processing directory: {PROCESSED_DIR}")
# print(f"📊 Total listings to process: {len(all_results)}")
# print(f"📦 Chunk size: {CHUNK_SIZE}")
# print("="*60)


In [None]:
# summary_df = pd.DataFrame()
# summary_df['Room_id'] = np.NaN
# summary_df['Listing_url'] = np.NaN

# summary_df['Next_30_days_booked_days'] = np.NaN
# summary_df['Next_30_to_60_days_booked_days'] = np.NaN
# summary_df['75_rule_met'] = False
# summary_df['55_rule_met'] = False

# summary_df['Warning_level'] = np.NaN
# summary_df['Warning_type'] = np.NaN
# summary_df['Warning_message'] = np.NaN

# summary_df['Rating'] = np.NaN
# summary_df['Review_count'] = np.NaN

# summary_df['Review_months_this_year'] = []
# summary_df['Review_months_last_year'] = []
# summary_df['Missing_review_months_this_year'] = []
# summary_df['Missing_review_months_last_year'] = []
# summary_df['Total_missing_review_months_this_year'] = np.NaN
# summary_df['Total_missing_review_months_last_year'] = np.NaN

# summary_df['Latitude'] = []
# summary_df['Longitude'] = []

# summary_df['Grid_index'] = np.NaN

# summary_df

In [None]:
# # Main Chunked Processing Loop

# total_listings = len(all_results)
# total_chunks = (total_listings + CHUNK_SIZE - 1) // CHUNK_SIZE  # Ceiling division

# print(f"Starting chunked processing...")
# print(f"Total listings: {total_listings}")
# print(f"Total chunks: {total_chunks}")
# print(f"Output directory: {PROCESSED_DIR}")
# print("="*60)

# # Load existing progress if available
# # progress = load_progress()

# start_chunk = 0

# # Process each chunk
# for chunk_id in range(start_chunk, total_chunks):
#     print(f"Processing Chunk {chunk_id + 1}/{total_chunks}")
    
#     start_idx = chunk_id * CHUNK_SIZE
#     end_idx = min(start_idx + CHUNK_SIZE, total_listings)
#     chunk_listings = all_results[start_idx:end_idx]
        
#     chunk_results = []
#     chunk_errors = []
    
#     # Process each listing in the chunk
#     for i, result in enumerate(chunk_listings):
#         listing_idx = start_idx + i
#         listing_data, error_data = process_listing(result, listing_idx)
        
#         if listing_data:
#             chunk_results.append(listing_data)
#         elif error_data:
#             chunk_errors.append(error_data)
    
#     # Save chunk results
#     success = save_chunk(chunk_id, chunk_results, chunk_errors, start_idx, end_idx-1)
    
#     if success:
#         update_progress(chunk_id, total_chunks)
#     else:
#         break
    
#     # Progress summary
#     total_processed = (chunk_id + 1) * CHUNK_SIZE
#     if total_processed > total_listings:
#         total_processed = total_listings
    
# print("\n" + "="*60)
# print("🎉 Chunked processing completed!")
# print(f"📁 Results saved in: {PROCESSED_DIR}")
# print(f"📋 Progress file: {PROGRESS_FILE}")


In [None]:
# # Consolidate all chunks into final Excel file
# def consolidate_chunks_to_excel():
#     """Combine all chunk JSON files into a single Excel file"""
#     print("🔄 Consolidating chunks into Excel file...")
    
#     all_consolidated_data = []
#     total_chunks_processed = 0
#     total_listings_consolidated = 0
    
#     # Get all chunk files
#     chunk_files = [f for f in os.listdir(PROCESSED_DIR) if f.startswith('chunk_') and f.endswith('.json')]
#     chunk_files.sort()  # Sort to process in order
    
#     print(f"📁 Found {len(chunk_files)} chunk files to consolidate")
    
#     for chunk_file in chunk_files:
#         chunk_filepath = os.path.join(PROCESSED_DIR, chunk_file)
        
#         try:
#             with open(chunk_filepath, 'r', encoding='utf-8') as f:
#                 chunk_data = json.load(f)
            
#             # Extract listings from chunk
#             listings = chunk_data.get('listings', [])
#             all_consolidated_data.extend(listings)
            
#             total_chunks_processed += 1
#             total_listings_consolidated += len(listings)
            
#             print(f"  ✅ Processed {chunk_file}: {len(listings)} listings")
            
#         except Exception as e:
#             print(f"  ❌ Error processing {chunk_file}: {e}")
    
#     if all_consolidated_data:
#         # Create DataFrame
#         consolidated_df = pd.DataFrame(all_consolidated_data)
        
#         # Save to Excel
#         output_filename = "airbnb_dataset_75_55_rule_met_consolidated.xlsx"
#         output_filepath = os.path.join(PROCESSED_DIR, output_filename)
        
#         consolidated_df.to_excel(output_filepath, index=False)
        
#         print(f"\n🎉 Consolidation completed!")
#         print(f"📊 Total chunks processed: {total_chunks_processed}")
#         print(f"📊 Total listings consolidated: {total_listings_consolidated}")
#         print(f"📁 Output file: {output_filepath}")
        
#         return consolidated_df
#     else:
#         print("❌ No data found to consolidate")
#         return None

# # Run consolidation
# final_df = consolidate_chunks_to_excel()


In [None]:
# summary_df.to_excel("airbnb_dataset_75_55_rule_met.xlsx", index=False)

# Target Specific Regions Only

In [None]:
# # grd_id = 372

# # target_coords = gird_coords_df[gird_coords_df['grid_id'] == grd_id]

# # gird_target = target_coords['grid_id'].values[0]
# # ne_lat_target = target_coords['ne_lat'].values[0]
# # ne_long_target = target_coords['ne_long'].values[0]
# # sw_lat_target = target_coords['sw_lat'].values[0]
# # sw_long_target = target_coords['sw_long'].values[0]



# ne_lat_target, ne_long_target = 41.95433, -87.646
# sw_lat_target, sw_long_target = 41.94067, -87.674

# print("Target Region Searching ...")
# target_results = search_all(
#     check_in="",
#     check_out= "",
#     ne_lat=ne_lat_target, ne_long=ne_long_target,
#     sw_lat=sw_lat_target, sw_long=sw_long_target,
#     zoom_value=10, # zoom_value has no effect on the results
#     price_min=300,
#     price_max=10000,
#     currency="USD"
# )
# print(f"Found {len(target_results)} results.")

In [None]:
# def process_target_listings(target_results):
#     for result in target_results:
#         room_id = result['room_id']
#         room_url = f"https://www.airbnb.com.sg/rooms/{room_id}"

#         print(f"  Processing Index: {idx}, Room ID: {room_id}")

#         rating = result['rating']['value']
#         review_count = result['rating']['reviewCount']
#         latitude = result['coordinates']['latitude']
#         longitude = result['coordinates']['longitud']
#         grid_index = result['grid_index']
        
#         calendar_info = get_calendar(room_id=room_id)
#         current_month_calendar = calendar_info[0]
#         next_month_calendar = calendar_info[1]
#         next_two_months_calendar = calendar_info[2]

#         curr_month_available_dates = get_available_dates(current_month_calendar)
#         next_month_available_dates = get_available_dates(next_month_calendar)
#         next_two_months_available_dates = get_available_dates(next_two_months_calendar)

#         rule75_met, days_booked_next30days = check_75_rule(
#             curr_month_available_dates = curr_month_available_dates, 
#             next_month_available_dates = next_month_available_dates,
#             leniency=2
#         )

#         rule55_met, days_booked_nextnext30days = check_55_rule(
#             next_month_available_dates = next_month_available_dates,
#             next_two_months_available_dates = next_two_months_available_dates,
#             leniency=2
#         )

#         # Only process if 75 rule is met
#         if rule75_met:
#             review_months = analyze_reviews(room_id)

#             if today.year in review_months:
#                 review_months_this_year = review_months[today.year]
#             else:
#                 review_months_this_year = set()

#             if today.year - 1 in review_months:
#                 review_months_last_year = review_months[today.year - 1]
#             else:
#                 review_months_last_year = set()

#             # Review count check
#             warning_level, warning_type, warning_message = check_reviews_count(
#                 review_months_this_year = review_months_this_year,
#                 review_months_last_year = review_months_last_year, 
#                 leniency=3
#             )

#             missing_review_months_this_year = list(COMPLETE_MONTHS_THIS_YEAR - review_months_this_year) 
#             missing_review_months_last_year = list(COMPLETE_MONTHS_LAST_YEAR - review_months_last_year)

#             listing_data = {
#                 "room_id": room_id,
#                 "listing_url": room_url,
#                 "next_30_days_booked_days": days_booked_next30days,
#                 "next_30_to_60_days_booked_days": days_booked_nextnext30days,
#                 "rule_75_met": rule75_met,
#                 "rule_55_met": rule55_met,
#                 "warning_level": warning_level,
#                 "warning_type": warning_type,
#                 "warning_message": warning_message,
#                 "rating": rating,
#                 "review_count": review_count,
#                 "review_months_this_year": list(review_months_this_year),
#                 "review_months_last_year": list(review_months_last_year),
#                 "missing_review_months_this_year": missing_review_months_this_year,
#                 "missing_review_months_last_year": missing_review_months_last_year,
#                 "total_missing_review_months_this_year": len(missing_review_months_this_year),
#                 "total_missing_review_months_last_year": len(missing_review_months_last_year),
#                 "latitude": latitude,
#                 "longitude": longitude,
#                 "grid_index": grid_index,
                
#                 "processing_status": "success"
#             }
            
#             return listing_data, None
#         else: return None, None