In [21]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fix_encoding(text):
    """Repair UTF-8 text that was mistakenly decoded as Latin-1/Windows-1252.

    Such "mojibake" shows up as sequences like ``Ã©`` where ``é`` was meant.
    The repair re-encodes the string with the single-byte codec that was
    wrongly applied and decodes the resulting bytes as UTF-8.  When that
    round trip is not possible the text is assumed to be correct already and
    is returned unchanged.

    Args:
        text: The string to repair.  Falsy values yield ``""``; non-string
            values are returned untouched.

    Returns:
        The repaired string, or the original text when no repair applies.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        # Nothing to repair on non-text values; hand them back untouched.
        return text

    # Latin-1 is tried first because it can represent every code point below
    # U+0100 (including the C1 controls cp1252 leaves undefined); cp1252
    # covers mojibake that contains characters such as '€' or '™'.
    # NOTE: 'raw_unicode_escape' must not be used here: it rewrites every
    # character above U+00FF as a literal "\uXXXX" escape, which then decodes
    # "successfully" and corrupts correct text such as "250 000 €".
    for codec in ('latin-1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except UnicodeError:
            # Either the text has characters outside this codec or the bytes
            # are not valid UTF-8, i.e. the text was not mojibake of this kind.
            continue

    # Return original if no repair was possible
    return text

def clean_text(text):
    """Fix encoding artefacts in *text* and normalise its whitespace.

    Steps, in order:
      1. ``fix_encoding`` attempts a whole-string mojibake repair.
      2. Any mojibake sequences still present (e.g. in strings that mix
         correct and mis-decoded characters) are replaced one by one.
      3. Runs of whitespace are collapsed to a single space and the result
         is stripped.

    The replacements run before the whitespace collapse so that the
    mis-decoded form of "à" (``Ã`` followed by a no-break space) is
    recognised without consuming the real space that follows it.

    Args:
        text: The string to clean.  Falsy values yield ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    # Apply encoding fixes first
    text = fix_encoding(text)

    # Leftover mojibake sequences.  Keys are written as escapes so that
    # invisible characters (C1 controls, no-break space) cannot be lost when
    # the file is edited; the comment shows how each key typically renders.
    replacements = (
        ('\u00e2\u0080\u0099', "'"),   # right single quote, Latin-1 misread
        ('\u00e2\u20ac\u2122', "'"),   # right single quote, cp1252 misread
        ('\u00c3\u00a9', 'é'),         # Ã©
        ('\u00c3\u00a8', 'è'),         # Ã¨
        ('\u00c3\u00b4', 'ô'),         # Ã´
        ('\u00c3\u00a7', 'ç'),         # Ã§
        ('\u00c3\u00a2', 'â'),         # Ã¢
        ('\u00c3\u00af', 'ï'),         # Ã¯
        ('\u00c3\u00aa', 'ê'),         # Ãª
        ('\u00c3\u00a0', 'à'),         # Ã + no-break space
        ('\u00c3 ', 'à'),              # Ã + plain space (NBSP already normalised)
        ('\u00c3\u00ae', 'î'),         # Ã®
        ('\u00c3\u00bb', 'û'),         # Ã»
        ('\u00c3\u00ab', 'ë'),         # Ã«
        ('\u00c3\u00b9', 'ù'),         # Ã¹
        ('\u00c3\u00a4', 'ä'),         # Ã¤
        ('\u00c3\u00b6', 'ö'),         # Ã¶
        ('\u00c3\u00bc', 'ü'),         # Ã¼
        ('\u00c3\u0080', 'À'),         # À, Latin-1 misread
        ('\u00c3\u20ac', 'À'),         # À, cp1252 misread
        ('\u00c3\u0089', 'É'),         # É, Latin-1 misread
        ('\u00c3\u2030', 'É'),         # É, cp1252 misread
        # A stray 'Â' is what a mis-decoded no-break space / '°' / '«' leaves
        # behind.  NOTE(review): this also drops a legitimate capital 'Â'.
        ('\u00c2', ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Then clean whitespace
    return re.sub(r'\s+', ' ', text).strip()

def extract_pieces(soup, html_content):
    """Extract the number of pieces/rooms from various possible HTML structures.

    Four strategies are tried in order and the first hit wins:
      1. a ``<span>`` whose text contains a known label, followed by ``<strong>``;
      2. regular expressions over the raw HTML;
      3. the first number in the text of a ``.block-over`` element that
         mentions a known label;
      4. the ``<strong>`` text of a ``.block-over`` element whose first
         ``<span>`` mentions the label.

    Args:
        soup: Parsed page (object offering ``find`` / ``select`` as used below).
        html_content: The raw HTML string the soup was built from.

    Returns:
        The extracted value as a string, or ``None`` when nothing matched.
    """
    # Label spellings seen on the site, including a mis-decoded variant
    piece_variants = [
        'PiÃ©ces (Totale)', 'Piéces (Totale)', 'Pieces (Totale)',
        'PiÃ©ces', 'Piéces', 'Pieces', 'Pièces', 'Pièce'
    ]

    # First try: labelled <span> followed by a <strong> holding the value
    for variant in piece_variants:
        label = soup.find('span', string=lambda s, v=variant: bool(s) and v in s)
        value_tag = label.find_next('strong') if label is not None else None
        if value_tag is not None:
            result = value_tag.get_text(strip=True)
            print(f"Found pieces with variant '{variant}': {result}")
            return result

    # Second try: regex on the raw HTML.  The middle letter is spelled out as
    # an alternation because the mis-decoded form "Ã©" is two characters and
    # can never be matched by a single character class such as [éÃ].
    piece_word = r'Pi(?:Ã[©¨]?|[èée])ces'
    patterns = [
        rf'<span>{piece_word} \(Totale\) : </span>\s*<strong>(\d+)',
        rf'<span>{piece_word} : </span>\s*<strong>(\d+)',
        rf'<span>[^<]*{piece_word}[^<]*</span>\s*<strong>(\d+)',
        rf'{piece_word}[^:]*:\s*</span>\s*<strong>(\d+)',
        rf'{piece_word}.*?</span>\s*<strong>(\d+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, html_content, re.IGNORECASE)
        if match:
            result = match.group(1)
            print(f"Found pieces with regex pattern: {result}")
            return result

    blocks = soup.select('.block-over')
    lowered_variants = [variant.lower() for variant in piece_variants]

    # Third try: first number inside a block whose text mentions the label
    for div in blocks:
        block_text = div.get_text(' ', strip=True)
        lowered_text = block_text.lower()
        if any(variant in lowered_text for variant in lowered_variants):
            match = re.search(r'(\d+)', block_text)
            if match:
                result = match.group(1)
                print(f"Found pieces from div text: {result}")
                return result

    # Fourth try: take the <strong> text as-is (covers non-numeric values)
    for div in blocks:
        span = div.select_one('span')
        if span is None:
            continue
        span_text = span.get_text().lower()
        if any(term in span_text for term in ('pièce', 'piece', 'piéce')):
            strong = div.select_one('strong')
            if strong is not None:
                result = strong.get_text(strip=True)
                print(f"Found pieces from direct div search: {result}")
                return result

    return None

def _find_labelled_value(soup, html_content, labels, patterns, block_terms,
                         flags=0, post=None):
    """Return the value shown next to a labelled ``<span>``, or ``None``.

    Tries, in order: a ``<span>`` whose string contains one of *labels*
    followed by a ``<strong>``; each regex in *patterns* (group 1) against
    *html_content*; and finally the ``<strong>`` of the first ``.block-over``
    element whose first ``<span>`` contains one of *block_terms* (lowercase).
    *post*, when given, is applied to the extracted string.
    """
    def finish(raw):
        return post(raw) if post is not None else raw

    label_span = soup.find(
        'span', string=lambda s: bool(s) and any(label in s for label in labels))
    value_tag = label_span.find_next('strong') if label_span is not None else None
    if value_tag is not None:
        return finish(value_tag.get_text(strip=True))

    for pattern in patterns:
        match = re.search(pattern, html_content, flags)
        if match:
            return finish(match.group(1))

    for div in soup.select('.block-over'):
        first_span = div.select_one('span')
        if first_span is None:
            continue
        span_text = first_span.get_text().lower()
        if any(term in span_text for term in block_terms):
            strong = div.select_one('strong')
            if strong is not None:
                return finish(strong.get_text(strip=True))

    return None


def scrape_property_details(url, title, max_retries=3):
    """Fetch one property page and extract its details.

    Args:
        url: Address of the property detail page.
        title: Listing title, used only in progress messages.
        max_retries: Number of fetch attempts made by this function; the same
            value is also given to the urllib3 ``Retry`` policy of the session.

    Returns:
        A dict with the keys ``property_type``, ``property_details``,
        ``options``, ``description`` and ``images``, or ``None`` when the page
        could not be fetched or an unexpected error occurred.  Fields that
        could not be extracted keep their initial ``None`` / empty value.
    """
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9',
        'Connection': 'keep-alive'
    }

    try:
        print(f"Scraping details for: {title}")

        # Add a small delay before requesting the details page
        time.sleep(random.uniform(0.5, 1.5))

        # Fetch the page with retry logic.  The session is closed as soon as
        # the body has been read so its connection pool is not leaked.
        response = None
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
            for attempt in range(max_retries):
                try:
                    response = session.get(url, headers=headers, timeout=30)
                    response.raise_for_status()
                    break
                except requests.exceptions.RequestException as e:
                    if attempt < max_retries - 1:
                        backoff_time = (2 ** attempt) * random.uniform(1, 2)
                        print(f"Failed to fetch details (attempt {attempt+1}): {e}. Retrying in {backoff_time:.2f}s...")
                        time.sleep(backoff_time)
                    else:
                        print(f"Failed to fetch property details after {max_retries} attempts: {e}")
                        return None

        if response is None:
            # Only reachable when max_retries <= 0: no attempt was made.
            print(f"Failed to fetch property details after {max_retries} attempts: no attempt made")
            return None

        # Decode the raw bytes strictly.  Response.text substitutes
        # undecodable bytes instead of raising, so a fallback loop over
        # response.encoding would never get past its first entry.
        # latin1 accepts any byte sequence and therefore goes last.
        raw_body = response.content
        html_content = None
        for encoding in ('utf-8', 'windows-1252', 'latin1'):
            try:
                html_content = raw_body.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        soup = BeautifulSoup(html_content, 'html.parser')

        # Initialize result dictionary
        details = {
            "chambres": None,
            "salle_de_bain": None,
            "pieces": None,
            "surface_habitable": None,
            "surface_terrain": None,
            "annee_construction": None
        }
        result = {
            "property_type": "Appartement",
            "property_details": details,
            "options": [],
            "description": None,
            "images": []
        }

        # Property Details
        try:
            # Print the raw label/value pairs for debugging
            for detail in soup.select('.detail-piece span'):
                detail_text = detail.get_text().strip()
                if detail_text and ':' not in detail_text:
                    continue
                value_tag = detail.find_next('strong')
                if value_tag is not None:
                    key = detail_text.replace(':', '').strip()
                    value = value_tag.get_text().strip()
                    print(f"Found detail: {key} = {value}")

            # Get pieces using the specialized function
            details["pieces"] = extract_pieces(soup, html_content)
            print(f"Extracted pieces: {details['pieces']}")

            # Chambres (Bedrooms)
            details["chambres"] = _find_labelled_value(
                soup, html_content,
                labels=('Chambres', 'chambres'),
                patterns=(r'Chambres\s*:\s*</span>\s*<strong>(\d+)',),
                block_terms=('chambre',),
            )

            # Salle de bain (Bathroom)
            details["salle_de_bain"] = _find_labelled_value(
                soup, html_content,
                labels=('Salle de bain', 'salle de bain'),
                patterns=(r'Salle de bain\s*:\s*</span>\s*<strong>(\d+)',),
                block_terms=('salle de bain',),
            )

            # Surface habitable (Living area)
            details["surface_habitable"] = _find_labelled_value(
                soup, html_content,
                labels=('Surf habitable', 'Surface habitable'),
                patterns=(r'Surf habitable\s*:\s*</span>\s*<strong>([^<]+)',),
                block_terms=('surf habitable', 'surface habitable'),
                post=clean_text,
            )

            # Surface terrain (Land area)
            details["surface_terrain"] = _find_labelled_value(
                soup, html_content,
                labels=('Surf terrain', 'Surface terrain'),
                patterns=(r'Surf terrain\s*:\s*</span>\s*<strong>([^<]+)',),
                block_terms=('surf terrain', 'surface terrain'),
                post=clean_text,
            )

            # Année construction (Construction year)
            details["annee_construction"] = _find_labelled_value(
                soup, html_content,
                labels=('Année construction', 'Annee construction'),
                patterns=(
                    r'Ann[ée]e construction\s*:\s*</span>\s*<strong>([^<]+)',
                    r'Ann[ée]e de construction\s*:\s*</span>\s*<strong>([^<]+)',
                ),
                block_terms=('année construction', 'annee construction'),
                flags=re.IGNORECASE,
                post=clean_text,
            )

            # Debug message to track extraction success
            print(f"Extracted details - Rooms: {result['property_details']['pieces']}, Year: {result['property_details']['annee_construction']}")

        except Exception as e:
            print(f"Error getting property details: {e}")

        # Options: each strategy runs only if the previous one found nothing
        try:
            options = []
            for icon in soup.select('span.col-md-4 strong i.fa-check-square'):
                parent = icon.parent
                if parent is not None and parent.get_text():
                    option_text = clean_text(parent.get_text().replace('✓', '').strip())
                    if option_text:
                        options.append(option_text)

            if not options:
                # Fallback to regex
                options_matches = re.findall(r'<span class="col-md-4[^>]+><strong>\s*<i class="fa fa-check-square"></i>\s*([^<]+)</strong>', html_content)
                options = [clean_text(opt) for opt in options_matches if opt.strip()]

            if not options:
                # Last resort: the span-opts markup
                options = [
                    clean_text(span.get_text().replace('✓', '').strip())
                    for span in soup.select('span.span-opts strong')
                    if span.get_text().strip()
                ]

            result["options"] = options

        except Exception as e:
            print(f"Error getting options: {e}")

        # Description
        try:
            desc = soup.find('p', itemprop='text')
            if desc:
                result["description"] = clean_text(desc.get_text())
            else:
                # Try finding the description in other ways
                desc_element = soup.find('div', class_='detail-desc')
                if desc_element:
                    p_elements = desc_element.find_all('p')
                    if p_elements:
                        result["description"] = clean_text(" ".join(p.get_text() for p in p_elements))
        except Exception as e:
            print(f"Error getting description: {e}")

        # Images: drop duplicates but keep the order they appear on the page
        try:
            images = soup.select('div.slider-product img[src]')
            if images:
                result["images"] = list(dict.fromkeys(img['src'] for img in images))
        except Exception as e:
            print(f"Error getting images: {e}")

        return result

    except Exception as e:
        print(f"Error scraping property details: {e}")
        return None

def scrape_property_listings(base_url, page_number, max_retries=3):
    """Scrape one results page and the detail page of every listing on it.

    Args:
        base_url: Listing index URL without query string.
        page_number: Page of the index to fetch.
        max_retries: Number of fetch attempts made by this function; the same
            value is also given to the urllib3 ``Retry`` policy of the session.

    Returns:
        A list with one dict per listing (title, premium, price, link,
        location, neighborhood, city, plus whatever
        ``scrape_property_details`` returned).  Empty when the page could not
        be fetched or an unexpected error occurred.

    Side effects:
        Sleeps between requests and, when a page contains no listings, writes
        the received HTML to ``debug_page_<page_number>.html``.
    """
    # Local import: urljoin is needed only here to resolve relative hrefs.
    from urllib.parse import urljoin

    # Construct URL with page number
    url = f"{base_url}?l=0&page={page_number}&tri=1"

    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9',
        'Connection': 'keep-alive'
    }

    try:
        # Fetch the page with retry logic; the session is closed once the
        # body has been read so its connection pool is not leaked.
        response = None
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
            for attempt in range(max_retries):
                try:
                    print(f"Scraping page: {url} (Attempt {attempt+1}/{max_retries})")
                    response = session.get(url, headers=headers, timeout=30)
                    response.raise_for_status()
                    break
                except requests.exceptions.RequestException as e:
                    if attempt < max_retries - 1:
                        backoff_time = (2 ** attempt) * random.uniform(1, 2)
                        print(f"Error fetching the page (attempt {attempt+1}): {e}")
                        print(f"Waiting {backoff_time:.2f} seconds before retrying...")
                        time.sleep(backoff_time)
                    else:
                        print(f"Failed to fetch page {page_number} after {max_retries} attempts")
                        return []

        if response is None:
            # Only reachable when max_retries <= 0: no attempt was made.
            print(f"Failed to fetch page {page_number} after {max_retries} attempts")
            return []

        # Decode the raw bytes strictly.  Response.text substitutes
        # undecodable bytes instead of raising, so a fallback loop over
        # response.encoding would never get past its first entry.
        # latin1 accepts any byte sequence and therefore goes last.
        raw_body = response.content
        html_content = None
        for encoding in ('utf-8', 'windows-1252', 'latin1'):
            try:
                html_content = raw_body.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all property listings
        listings = soup.find_all('div', class_='li-item-list')

        if not listings:
            print(f"Warning: No listings found on page {page_number}. Response length: {len(html_content)}")
            # Save the HTML for debugging
            with open(f'debug_page_{page_number}.html', 'w', encoding='utf-8') as f:
                f.write(html_content)

        properties = []

        for listing in listings:
            try:
                # The title anchor carries both the title text and the link
                title_element = listing.find('a', class_='li-item-list-title')
                title = title_element.get_text(strip=True) if title_element else "No title found"
                title = clean_text(title)

                # Premium tag might not exist for all listings
                premium_tag = listing.find('span', class_='strap-premium')
                premium = premium_tag.get_text(strip=True) if premium_tag else "Standard"

                price_element = listing.find('span', class_='item-box-price')
                price = price_element.get_text(strip=True) if price_element else "Price not specified"

                link = title_element.get('href') if title_element else None

                # Location: dedicated paragraph, else the map-marker's parent
                location_element = listing.find('p', class_='location')
                if location_element is None:
                    marker = listing.find('i', class_='fa-map-marker')
                    location_element = marker.find_parent() if marker is not None else None

                location = "Location not specified"
                if location_element:
                    location = clean_text(location_element.get_text(' ', strip=True))

                # If location contains comma, split into neighborhood and city
                neighborhood, city = None, None
                if location and ',' in location:
                    parts = [part.strip() for part in location.split(',')]
                    neighborhood, city = parts[0], parts[1]

                # Skip if we don't have a valid link
                if not link:
                    print(f"Skipping listing with no link: {title}")
                    continue

                # Resolve relative hrefs; absolute URLs pass through unchanged
                link = urljoin(url, link)

                # Scrape full property details
                full_property_details = scrape_property_details(link, title)

                # Create property dictionary
                property_data = {
                    "title": title,
                    "premium": premium,
                    "price": price,
                    "link": link,
                    "location": location,
                    "neighborhood": neighborhood,
                    "city": city
                }

                # Add property details if available
                if full_property_details:
                    property_data.update(full_property_details)

                properties.append(property_data)

                # Randomized delay between requests to property details
                time.sleep(random.uniform(1.5, 3.5))

            except Exception as e:
                print(f"Error processing a listing: {e}")
                continue

        return properties

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Function to save partial data periodically
def save_partial_data(properties, filename='menzili_properties_partial.json'):
    """Write *properties* to *filename* as indented UTF-8 JSON.

    Best effort: any failure is reported on stdout and swallowed, so a
    failed checkpoint never interrupts the caller.
    """
    try:
        # Serialise fully before touching the file so that a serialisation
        # error cannot leave a truncated file behind.
        serialized = json.dumps(properties, indent=2, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as out_file:
            out_file.write(serialized)
        count = len(properties)
        print(f"Saved {count} properties to {filename}")
    except Exception as err:
        print(f"Error saving partial data: {err}")

# Main execution
if __name__ == "__main__":
    # Listing index to crawl (new apartments for sale).  To crawl houses
    # instead, point base_url at
    # "https://www.menzili.tn/immo/vente-maison-tunisie".
    base_url = "https://www.menzili.tn/immo/vente-appartement-neuf-tunisie"

    all_properties = []
    save_interval = 10  # Checkpoint to disk every 10 pages

    # Inclusive page range to crawl.  Lower end_page for a quick trial run.
    start_page = 1
    end_page = 366

    try:
        for page in range(start_page, end_page + 1):
            try:
                # Collect every listing of this page into the global list
                page_properties = scrape_property_listings(base_url, page)
                all_properties += page_properties

                print(f"Completed page {page}. Total properties so far: {len(all_properties)}")

                # Periodic checkpoint
                if not page % save_interval:
                    save_partial_data(all_properties)

                # Randomized pause before the next page
                delay = random.uniform(3, 7)
                print(f"Waiting {delay:.2f} seconds before next page...")
                time.sleep(delay)

            except Exception as e:
                # Keep what was gathered so far, then move on to the next page
                print(f"Error scraping page {page}: {e}")
                save_partial_data(all_properties, f'menzili_properties_error_at_page_{page}.json')

    except KeyboardInterrupt:
        print("\nScraping interrupted by user. Saving collected data...")
        save_partial_data(all_properties, 'menzili_properties_interrupted.json')

    # Final output (also reached after a keyboard interrupt)
    if not all_properties:
        print("No properties were scraped. Check the website availability or your network connection.")
    else:
        json_output = json.dumps(all_properties, indent=2, ensure_ascii=False)

        with open('menzili_properties_data.json', 'w', encoding='utf-8') as f:
            f.write(json_output)

        print("Scraping completed. Data saved to menzili_properties_data.json")
        print(f"Total properties scraped: {len(all_properties)}")

Scraping page: https://www.menzili.tn/immo/vente-appartement-neuf-tunisie?l=0&page=1&tri=1 (Attempt 1/3)
Scraping details for: A vendre Appartement au lac 2
Found pieces with variant 'Piéces (Totale)': 4
Extracted pieces: 4
Extracted details - Rooms: 4, Year: None
Scraping details for: Appartement S3 à vendre à la Marsa
Found pieces with variant 'Piéces (Totale)': 4
Extracted pieces: 4
Extracted details - Rooms: 4, Year: None
Scraping details for: Appartement S+1 à La Marsa MAV1613
Extracted pieces: None
Extracted details - Rooms: None, Year: None
Scraping details for: À VENDRE appartement à la SOUKRA
Found pieces with variant 'Piéces (Totale)': 3
Extracted pieces: 3
Extracted details - Rooms: 3, Year: 2022
Scraping details for: Un appartement S+1 tout neuf à vendre à AFH Mrezga 51355351
Found pieces with variant 'Piéces (Totale)': 2
Extracted pieces: 2
Extracted details - Rooms: 2, Year: 2024
Scraping details for: A vendre appartement au centre urbain Nord Tunis
Found pieces with vari

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fix_encoding(text):
    """Repair UTF-8 text that was mistakenly decoded as Latin-1/Windows-1252.

    Such "mojibake" shows up as sequences like ``Ã©`` where ``é`` was meant.
    The repair re-encodes the string with the single-byte codec that was
    wrongly applied and decodes the resulting bytes as UTF-8.  When that
    round trip is not possible the text is assumed to be correct already and
    is returned unchanged.

    Args:
        text: The string to repair.  Falsy values yield ``""``; non-string
            values are returned untouched.

    Returns:
        The repaired string, or the original text when no repair applies.
    """
    if not text:
        return ""
    if not isinstance(text, str):
        # Nothing to repair on non-text values; hand them back untouched.
        return text

    # Latin-1 is tried first because it can represent every code point below
    # U+0100 (including the C1 controls cp1252 leaves undefined); cp1252
    # covers mojibake that contains characters such as '€' or '™'.
    # NOTE: 'raw_unicode_escape' must not be used here: it rewrites every
    # character above U+00FF as a literal "\uXXXX" escape, which then decodes
    # "successfully" and corrupts correct text such as "250 000 €".
    for codec in ('latin-1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except UnicodeError:
            # Either the text has characters outside this codec or the bytes
            # are not valid UTF-8, i.e. the text was not mojibake of this kind.
            continue

    # Return original if no repair was possible
    return text

def clean_text(text):
    """Fix encoding artefacts in *text* and normalise its whitespace.

    Steps, in order:
      1. ``fix_encoding`` attempts a whole-string mojibake repair.
      2. Any mojibake sequences still present (e.g. in strings that mix
         correct and mis-decoded characters) are replaced one by one.
      3. Runs of whitespace are collapsed to a single space and the result
         is stripped.

    The replacements run before the whitespace collapse so that the
    mis-decoded form of "à" (``Ã`` followed by a no-break space) is
    recognised without consuming the real space that follows it.

    Args:
        text: The string to clean.  Falsy values yield ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    # Apply encoding fixes first
    text = fix_encoding(text)

    # Leftover mojibake sequences.  Keys are written as escapes so that
    # invisible characters (C1 controls, no-break space) cannot be lost when
    # the file is edited; the comment shows how each key typically renders.
    replacements = (
        ('\u00e2\u0080\u0099', "'"),   # right single quote, Latin-1 misread
        ('\u00e2\u20ac\u2122', "'"),   # right single quote, cp1252 misread
        ('\u00c3\u00a9', 'é'),         # Ã©
        ('\u00c3\u00a8', 'è'),         # Ã¨
        ('\u00c3\u00b4', 'ô'),         # Ã´
        ('\u00c3\u00a7', 'ç'),         # Ã§
        ('\u00c3\u00a2', 'â'),         # Ã¢
        ('\u00c3\u00af', 'ï'),         # Ã¯
        ('\u00c3\u00aa', 'ê'),         # Ãª
        ('\u00c3\u00a0', 'à'),         # Ã + no-break space
        ('\u00c3 ', 'à'),              # Ã + plain space (NBSP already normalised)
        ('\u00c3\u00ae', 'î'),         # Ã®
        ('\u00c3\u00bb', 'û'),         # Ã»
        ('\u00c3\u00ab', 'ë'),         # Ã«
        ('\u00c3\u00b9', 'ù'),         # Ã¹
        ('\u00c3\u00a4', 'ä'),         # Ã¤
        ('\u00c3\u00b6', 'ö'),         # Ã¶
        ('\u00c3\u00bc', 'ü'),         # Ã¼
        ('\u00c3\u0080', 'À'),         # À, Latin-1 misread
        ('\u00c3\u20ac', 'À'),         # À, cp1252 misread
        ('\u00c3\u0089', 'É'),         # É, Latin-1 misread
        ('\u00c3\u2030', 'É'),         # É, cp1252 misread
        # A stray 'Â' is what a mis-decoded no-break space / '°' / '«' leaves
        # behind.  NOTE(review): this also drops a legitimate capital 'Â'.
        ('\u00c2', ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Then clean whitespace
    return re.sub(r'\s+', ' ', text).strip()

def extract_pieces(soup, html_content):
    """Extract the number of pieces/rooms from various possible HTML structures.

    Four strategies are tried in order and the first hit wins:
      1. a ``<span>`` whose text contains a known label, followed by ``<strong>``;
      2. regular expressions over the raw HTML;
      3. the first number in the text of a ``.block-over`` element that
         mentions a known label;
      4. the ``<strong>`` text of a ``.block-over`` element whose first
         ``<span>`` mentions the label.

    Args:
        soup: Parsed page (object offering ``find`` / ``select`` as used below).
        html_content: The raw HTML string the soup was built from.

    Returns:
        The extracted value as a string, or ``None`` when nothing matched.
    """
    # Label spellings seen on the site, including a mis-decoded variant
    piece_variants = [
        'PiÃ©ces (Totale)', 'Piéces (Totale)', 'Pieces (Totale)',
        'PiÃ©ces', 'Piéces', 'Pieces', 'Pièces', 'Pièce'
    ]

    # First try: labelled <span> followed by a <strong> holding the value
    for variant in piece_variants:
        label = soup.find('span', string=lambda s, v=variant: bool(s) and v in s)
        value_tag = label.find_next('strong') if label is not None else None
        if value_tag is not None:
            result = value_tag.get_text(strip=True)
            print(f"Found pieces with variant '{variant}': {result}")
            return result

    # Second try: regex on the raw HTML.  The middle letter is spelled out as
    # an alternation because the mis-decoded form "Ã©" is two characters and
    # can never be matched by a single character class such as [éÃ].
    piece_word = r'Pi(?:Ã[©¨]?|[èée])ces'
    patterns = [
        rf'<span>{piece_word} \(Totale\) : </span>\s*<strong>(\d+)',
        rf'<span>{piece_word} : </span>\s*<strong>(\d+)',
        rf'<span>[^<]*{piece_word}[^<]*</span>\s*<strong>(\d+)',
        rf'{piece_word}[^:]*:\s*</span>\s*<strong>(\d+)',
        rf'{piece_word}.*?</span>\s*<strong>(\d+)',
    ]
    for pattern in patterns:
        match = re.search(pattern, html_content, re.IGNORECASE)
        if match:
            result = match.group(1)
            print(f"Found pieces with regex pattern: {result}")
            return result

    blocks = soup.select('.block-over')
    lowered_variants = [variant.lower() for variant in piece_variants]

    # Third try: first number inside a block whose text mentions the label
    for div in blocks:
        block_text = div.get_text(' ', strip=True)
        lowered_text = block_text.lower()
        if any(variant in lowered_text for variant in lowered_variants):
            match = re.search(r'(\d+)', block_text)
            if match:
                result = match.group(1)
                print(f"Found pieces from div text: {result}")
                return result

    # Fourth try: take the <strong> text as-is (covers non-numeric values)
    for div in blocks:
        span = div.select_one('span')
        if span is None:
            continue
        span_text = span.get_text().lower()
        if any(term in span_text for term in ('pièce', 'piece', 'piéce')):
            strong = div.select_one('strong')
            if strong is not None:
                result = strong.get_text(strip=True)
                print(f"Found pieces from direct div search: {result}")
                return result

    return None

def _find_detail_value(soup, html_content, labels, patterns, div_terms, flags=0, clean=False):
    """Look up one labelled property detail, trying three strategies in order.

    1. A ``<span>`` whose string contains one of ``labels`` (case-sensitive);
       the value is the text of the next ``<strong>`` in the document.
    2. Each regex in ``patterns`` searched in ``html_content`` with ``flags``;
       the value is the first capture group.
    3. Each ``.block-over`` element whose first ``<span>`` contains one of
       ``div_terms`` (compared against the lower-cased span text); the value
       is the text of that element's first ``<strong>``.

    Args:
        soup: Parsed document.
        html_content: Raw HTML text the regexes are run against.
        labels: Label substrings for strategy 1.
        patterns: Regex strings for strategy 2.
        div_terms: Lower-case substrings for strategy 3.
        flags: ``re`` flags used for every pattern.
        clean: When true the value is passed through ``clean_text``.

    Returns:
        The first value found, or None when no strategy matches.
    """
    finish = clean_text if clean else (lambda value: value)

    # Strategy 1: label span followed by a <strong>.
    label_span = soup.find(
        'span',
        string=lambda text: bool(text) and any(label in text for label in labels)
    )
    if label_span:
        strong = label_span.find_next('strong')
        if strong:
            return finish(strong.get_text(strip=True))

    # Strategy 2: regex over the raw markup.
    for pattern in patterns:
        match = re.search(pattern, html_content, flags)
        if match:
            return finish(match.group(1))

    # Strategy 3: .block-over containers; keep scanning when a matching
    # container has no <strong>.
    for div in soup.select('.block-over'):
        span = div.select_one('span')
        if span and any(term in span.get_text().lower() for term in div_terms):
            strong = div.select_one('strong')
            if strong:
                return finish(strong.get_text(strip=True))

    return None


def scrape_property_details(url, title, max_retries=3):
    """Fetch one property page and extract its details.

    Args:
        url: Address of the property detail page.
        title: Listing title; used only in the progress message.
        max_retries: Number of attempts made by the manual retry loop. The
            same value is passed as ``total`` to the urllib3 ``Retry`` policy
            mounted on the session.

    Returns:
        A dict with the keys ``property_type``, ``property_details``
        (``chambres``, ``salle_de_bain``, ``pieces``, ``surface_habitable``,
        ``surface_terrain``, ``annee_construction``), ``options``,
        ``description`` and ``images``. Fields that could not be extracted keep
        their initial value (None or an empty list). Returns None when the
        page could not be fetched or an unexpected error occurred.
    """
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9',
        'Connection': 'keep-alive'
    }

    try:
        print(f"Scraping details for: {title}")

        # Add a small delay before requesting the details page
        time.sleep(random.uniform(0.5, 1.5))

        response = None
        # A new session is created per call, so close it when done; otherwise
        # its connection pool stays open until garbage collection.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("https://", adapter)
            session.mount("http://", adapter)  # same retry policy for plain-HTTP links

            # Manual retry loop on top of the adapter-level retries
            for attempt in range(max_retries):
                try:
                    candidate = session.get(url, headers=headers, timeout=30)
                    candidate.raise_for_status()
                    response = candidate
                    break
                except requests.exceptions.RequestException as e:
                    if attempt < max_retries - 1:
                        backoff_time = (2 ** attempt) * random.uniform(1, 2)
                        print(f"Failed to fetch details (attempt {attempt+1}): {e}. Retrying in {backoff_time:.2f}s...")
                        time.sleep(backoff_time)
                    else:
                        print(f"Failed to fetch property details after {max_retries} attempts: {e}")
                        return None

        if response is None:
            # Only reachable when max_retries <= 0, i.e. the loop never ran.
            print(f"Failed to fetch property details after {max_retries} attempts: no request was made")
            return None

        # Decode the body strictly so the fallbacks can actually trigger.
        # Assigning response.encoding and reading response.text never raises
        # UnicodeDecodeError, so utf-8 was always used. latin1 accepts every
        # byte, so it goes last and guarantees html_content is set.
        raw_bytes = response.content
        html_content = None
        for encoding in ('utf-8', 'windows-1252', 'latin1'):
            try:
                html_content = raw_bytes.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        soup = BeautifulSoup(html_content, 'html.parser')

        # Initialize result dictionary
        result = {
            "property_type": "Appartement",
            "property_details": {
                "chambres": None,
                "salle_de_bain": None,
                "pieces": None,
                "surface_habitable": None,
                "surface_terrain": None,
                "annee_construction": None
            },
            "options": [],
            "description": None,
            "images": []
        }
        details = result["property_details"]

        # Property Details
        try:
            # Print the raw label/value pairs for debugging
            for detail in soup.select('.detail-piece span'):
                detail_text = detail.get_text().strip()
                if detail_text and ':' not in detail_text:
                    continue

                strong = detail.find_next('strong')
                if strong:
                    key = detail_text.replace(':', '').strip()
                    value = strong.get_text().strip()
                    print(f"Found detail: {key} = {value}")

            # Get pieces using the specialized function
            details["pieces"] = extract_pieces(soup, html_content)
            print(f"Extracted pieces: {details['pieces']}")

            # Chambres (Bedrooms)
            details["chambres"] = _find_detail_value(
                soup, html_content,
                labels=('Chambres', 'chambres'),
                patterns=(r'Chambres\s*:\s*</span>\s*<strong>(\d+)',),
                div_terms=('chambre',)
            )

            # Salle de bain (Bathroom)
            details["salle_de_bain"] = _find_detail_value(
                soup, html_content,
                labels=('Salle de bain', 'salle de bain'),
                patterns=(r'Salle de bain\s*:\s*</span>\s*<strong>(\d+)',),
                div_terms=('salle de bain',)
            )

            # Surface habitable (Living area)
            details["surface_habitable"] = _find_detail_value(
                soup, html_content,
                labels=('Surf habitable', 'Surface habitable'),
                patterns=(r'Surf habitable\s*:\s*</span>\s*<strong>([^<]+)',),
                div_terms=('surf habitable', 'surface habitable'),
                clean=True
            )

            # Surface terrain (Land area)
            details["surface_terrain"] = _find_detail_value(
                soup, html_content,
                labels=('Surf terrain', 'Surface terrain'),
                patterns=(r'Surf terrain\s*:\s*</span>\s*<strong>([^<]+)',),
                div_terms=('surf terrain', 'surface terrain'),
                clean=True
            )

            # Année construction (Construction year)
            details["annee_construction"] = _find_detail_value(
                soup, html_content,
                labels=('Année construction', 'Annee construction'),
                patterns=(
                    r'Ann[ée]e construction\s*:\s*</span>\s*<strong>([^<]+)',
                    r'Ann[ée]e de construction\s*:\s*</span>\s*<strong>([^<]+)'
                ),
                div_terms=('année construction', 'annee construction'),
                flags=re.IGNORECASE,
                clean=True
            )

            # Debug message to track extraction success
            print(f"Extracted details - Rooms: {details['pieces']}, Year: {details['annee_construction']}")

        except Exception as e:
            print(f"Error getting property details: {e}")

        # Options
        try:
            # First: check-square icons inside the option spans
            options = []
            for icon in soup.select('span.col-md-4 strong i.fa-check-square'):
                parent = icon.parent
                if parent and parent.get_text():
                    option_text = clean_text(parent.get_text().replace('✓', '').strip())
                    if option_text:
                        options.append(option_text)

            # Second: regex on the raw markup. This now runs whenever the
            # first pass found nothing, including when the selector matched
            # no element at all.
            if not options:
                options_matches = re.findall(r'<span class="col-md-4[^>]+><strong>\s*<i class="fa fa-check-square"></i>\s*([^<]+)</strong>', html_content)
                options = [clean_text(opt) for opt in options_matches if opt.strip()]

            # Third: span-opts class
            if not options:
                options = [
                    clean_text(span.get_text().replace('✓', '').strip())
                    for span in soup.select('span.span-opts strong')
                    if span.get_text().strip()
                ]

            result["options"] = options

        except Exception as e:
            print(f"Error getting options: {e}")

        # Description
        try:
            desc = soup.find('p', itemprop='text')
            if desc:
                result["description"] = clean_text(desc.get_text())
            else:
                # Fall back to the paragraphs of the detail-desc container
                desc_element = soup.find('div', class_='detail-desc')
                if desc_element:
                    p_elements = desc_element.find_all('p')
                    if p_elements:
                        result["description"] = clean_text(" ".join([p.get_text() for p in p_elements]))
        except Exception as e:
            print(f"Error getting description: {e}")

        # Images
        try:
            images = soup.select('div.slider-product img[src]')
            # dict.fromkeys removes duplicates and keeps page order, unlike a
            # set whose iteration order is arbitrary.
            result["images"] = list(dict.fromkeys(img['src'] for img in images))
        except Exception as e:
            print(f"Error getting images: {e}")

        return result

    except Exception as e:
        print(f"Error scraping property details: {e}")
        return None

def scrape_property_listings(base_url, page_number, max_retries=3):
    """Scrape one page of listings and the detail page of every listing on it.

    Args:
        base_url: Listing index URL; ``?l=0&page=<page_number>&tri=1`` is
            appended to it.
        page_number: Page of the listing index to fetch.
        max_retries: Number of attempts made by the manual retry loop. The
            same value is passed as ``total`` to the urllib3 ``Retry`` policy
            mounted on the session.

    Returns:
        A list with one dict per listing that has a link. Each dict has the
        keys ``title``, ``premium``, ``price``, ``link``, ``location``,
        ``neighborhood`` and ``city``, merged with whatever
        ``scrape_property_details`` returns for that link. Returns an empty
        list when the page could not be fetched or an unexpected error
        occurred.
    """
    # Construct URL with page number
    url = f"{base_url}?l=0&page={page_number}&tri=1"

    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9',
        'Connection': 'keep-alive'
    }

    try:
        response = None
        # A new session is created per call, so close it when done; otherwise
        # its connection pool stays open until garbage collection.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("https://", adapter)
            session.mount("http://", adapter)  # same retry policy for plain-HTTP URLs

            # Manual retry loop on top of the adapter-level retries
            for attempt in range(max_retries):
                try:
                    print(f"Scraping page: {url} (Attempt {attempt+1}/{max_retries})")
                    candidate = session.get(url, headers=headers, timeout=30)
                    candidate.raise_for_status()
                    response = candidate
                    break
                except requests.exceptions.RequestException as e:
                    if attempt < max_retries - 1:
                        backoff_time = (2 ** attempt) * random.uniform(1, 2)
                        print(f"Error fetching the page (attempt {attempt+1}): {e}")
                        print(f"Waiting {backoff_time:.2f} seconds before retrying...")
                        time.sleep(backoff_time)
                    else:
                        print(f"Failed to fetch page {page_number} after {max_retries} attempts")
                        return []

        if response is None:
            # Only reachable when max_retries <= 0, i.e. the loop never ran.
            print(f"Failed to fetch page {page_number} after {max_retries} attempts")
            return []

        # Decode the body strictly so the fallbacks can actually trigger.
        # Assigning response.encoding and reading response.text never raises
        # UnicodeDecodeError, so utf-8 was always used. latin1 accepts every
        # byte, so it goes last and guarantees html_content is set.
        raw_bytes = response.content
        html_content = None
        for encoding in ('utf-8', 'windows-1252', 'latin1'):
            try:
                html_content = raw_bytes.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all property listings
        listings = soup.find_all('div', class_='li-item-list')

        if not listings:
            print(f"Warning: No listings found on page {page_number}. Response length: {len(html_content)}")
            # Save the HTML for debugging
            with open(f'debug_page_{page_number}.html', 'w', encoding='utf-8') as f:
                f.write(html_content)

        properties = []

        for listing in listings:
            try:
                # The title anchor carries both the title text and the link,
                # so look it up once.
                title_element = listing.find('a', class_='li-item-list-title')
                title = title_element.get_text(strip=True) if title_element else "No title found"
                title = clean_text(title)

                # Premium tag might not exist for all listings
                premium_tag = listing.find('span', class_='strap-premium')
                premium = premium_tag.get_text(strip=True) if premium_tag else "Standard"

                price_element = listing.find('span', class_='item-box-price')
                price = price_element.get_text(strip=True) if price_element else "Price not specified"

                link = title_element.get('href') if title_element else None

                # Location: dedicated paragraph first, otherwise the parent
                # of the map-marker icon.
                location_element = listing.find('p', class_='location')
                if not location_element:
                    marker = listing.find('i', class_='fa-map-marker')
                    location_element = marker.find_parent() if marker else None

                location = "Location not specified"
                if location_element:
                    location = clean_text(location_element.get_text(' ', strip=True))

                # If location contains comma, split into neighborhood and city
                neighborhood, city = None, None
                if location and ',' in location:
                    # A comma guarantees at least two parts.
                    parts = [part.strip() for part in location.split(',')]
                    neighborhood, city = parts[0], parts[1]

                # Skip if we don't have a valid link
                if not link:
                    print(f"Skipping listing with no link: {title}")
                    continue

                # Scrape full property details
                full_property_details = scrape_property_details(link, title)

                # Create property dictionary
                property_data = {
                    "title": title,
                    "premium": premium,
                    "price": price,
                    "link": link,
                    "location": location,
                    "neighborhood": neighborhood,
                    "city": city
                }

                # Add property details if available
                if full_property_details:
                    property_data.update(full_property_details)

                properties.append(property_data)

                # Randomized delay between requests to property details
                time.sleep(random.uniform(1.5, 3.5))

            except Exception as e:
                print(f"Error processing a listing: {e}")
                continue

        return properties

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Function to save partial data periodically
def save_partial_data(properties, filename='menzili_properties_partial_rest.json'):
    """Write ``properties`` to ``filename`` as indented UTF-8 JSON.

    Serialisation happens before the file is opened, so a value that cannot
    be serialised leaves an existing file untouched. Any exception is caught
    and reported on stdout; nothing is raised and nothing is returned.
    """
    try:
        serialized = json.dumps(properties, ensure_ascii=False, indent=2)
        with open(filename, mode='w', encoding='utf-8') as handle:
            handle.write(serialized)
        count = len(properties)
        print(f"Saved {count} properties to {filename}")
    except Exception as err:
        print(f"Error saving partial data: {err}")

# Main execution
if __name__ == "__main__":
    # Listing index to crawl. Swap in the commented URL below to scrape
    # maisons (houses) instead of apartments.
    base_url = "https://www.menzili.tn/immo/vente-appartement-neuf-tunisie"
    # base_url = "https://www.menzili.tn/immo/vente-maison-tunisie"

    all_properties = []
    save_interval = 5  # checkpoint every 5 pages

    # Inclusive page range handled by this run
    start_page = 241
    end_page = 280

    try:
        for page in range(start_page, end_page + 1):
            try:
                # Collect this page's listings and append them to the total
                page_properties = scrape_property_listings(base_url, page)
                all_properties += page_properties

                print(f"Completed page {page}. Total properties so far: {len(all_properties)}")

                # Checkpoint on the last page and on every save_interval-th page
                if page == end_page or page % save_interval == 0:
                    save_partial_data(all_properties)

                # Randomized pause before moving on to the next page
                delay = random.uniform(3, 7)
                print(f"Waiting {delay:.2f} seconds before next page...")
                time.sleep(delay)

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                # Keep what has been collected so far, then move on
                save_partial_data(all_properties, f'menzili_properties_error_at_page_{page}.json')
                continue

    except KeyboardInterrupt:
        print("\nScraping interrupted by user. Saving collected data...")
        save_partial_data(all_properties, 'menzili_properties_interrupted.json')

    # Final output (also reached after an interrupt)
    if not all_properties:
        print("No properties were scraped. Check the website availability or your network connection.")
    else:
        json_output = json.dumps(all_properties, ensure_ascii=False, indent=2)

        with open('menzili_properties_data_rest.json', mode='w', encoding='utf-8') as f:
            f.write(json_output)

        print("Scraping completed. Data saved to menzili_properties_data_rest.json")
        print(f"Total properties scraped: {len(all_properties)}")

Scraping page: https://www.menzili.tn/immo/vente-appartement-neuf-tunisie?l=0&page=241&tri=1 (Attempt 1/3)
Scraping details for: Appartement de type S2 bien situé
Found pieces with variant 'Piéces (Totale)': 3
Extracted pieces: 3
Extracted details - Rooms: 3, Year: None
Scraping details for: A vendre appartement haut standing a rades meliane
Found pieces with variant 'Piéces (Totale)': 4
Extracted pieces: 4
Extracted details - Rooms: 4, Year: 2014
Scraping details for: Appartement neuf S+3 aux Jinenes Ennasr
Found pieces with variant 'Piéces (Totale)': 4
Extracted pieces: 4
Extracted details - Rooms: 4, Year: 2020
Scraping details for: DUPLEXE MIRIAM (Réf: V1086)
Extracted pieces: None
Extracted details - Rooms: None, Year: None
Scraping details for: Appartement a vendre de 110 m²
Found pieces with variant 'Piéces (Totale)': 3
Extracted pieces: 3
Extracted details - Rooms: 3, Year: 2010
Scraping details for: Triplex SANA (Réf: V1201)
Extracted pieces: None
Extracted details - Rooms: No