In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import time
import random
import urllib.parse

# Startup banner: reaching this line means every import above succeeded.
print("🚀 Libraries imported successfully!")

def extract_detailed_characteristics(url, headers):
    """Fetch a listing's detail page and return its main characteristics.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the detail page.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A dict mapping each feature label to its value, taken from the text
        of the ``adMainFeatureContentLabel`` / ``adMainFeatureContentValue``
        paragraphs inside the characteristics block. If the same label
        appears more than once, the last value wins. An empty dict is
        returned when the URL is invalid, the characteristics block is
        missing, or any error occurs. This function is best effort: errors
        are printed and never raised to the caller.
    """
    try:
        # Only genuine http(s) URLs are fetched. A bare 'http' prefix check
        # would also let through strings such as 'httpfoo'.
        if not url or not url.startswith(('http://', 'https://')):
            print(f"❌ Invalid URL: {url}")
            return {}

        # The timeout keeps one slow detail page from stalling the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # find(class_='a b') only matches when the class attribute is exactly
        # that string in that order; the CSS selector matches any div that
        # carries both classes, regardless of order or extra classes.
        char_block = soup.select_one('div.blockProp.caractBlockProp')
        if char_block is None:
            return {}

        detailed_chars = {}
        for feature in char_block.find_all('div', class_='adMainFeature'):
            label_elem = feature.find('p', class_='adMainFeatureContentLabel')
            value_elem = feature.find('p', class_='adMainFeatureContentValue')

            # Skip features that lack either half of the label/value pair.
            if label_elem is None or value_elem is None:
                continue

            label = label_elem.get_text(strip=True)
            detailed_chars[label] = value_elem.get_text(strip=True)

        return detailed_chars

    except Exception as e:
        # Deliberately broad: a broken detail page must not abort the
        # listing it belongs to.
        print(f"❌ Error extracting characteristics: {e}")
        return {}

# Maps the icon CSS class found in an 'adDetailFeature' div to the key used
# for that feature in the scraped output.
_FEATURE_ICON_KEYS = {
    'icon-triangle': 'area',
    'icon-house-boxes': 'total_rooms',
    'icon-bed': 'bedrooms',
    'icon-bath': 'bathrooms',
}


def _extract_image_url(listing):
    """Return the slider image URL of *listing*, or None when there is none."""
    image_elem = listing.find('img', class_='sliderImage')
    if not image_elem:
        return None
    # Try the lazy-loading attribute first, then the plain ones.
    return (image_elem.get('data-lazy')
            or image_elem.get('src')
            or image_elem.get('data-src'))


def _resolve_listing_link(listing):
    """Return the absolute URL of the first link in *listing*, or ''."""
    link_elem = listing.find('a', href=True)
    if not link_elem:
        return ''
    href = link_elem.get('href', '').strip()
    if not href:
        return ''
    if href.startswith('http'):
        return href
    return urllib.parse.urljoin('https://www.mubawab.tn', href)


def _extract_property_features(listing):
    """Return the icon-tagged features of *listing* as a dict."""
    property_features = {}
    for feature in listing.find_all('div', class_='adDetailFeature'):
        icon = feature.find('i')
        text = feature.find('span')
        if not (icon and text):
            continue
        # Only the first recognised icon class decides the feature key.
        for icon_class in icon.get('class', []):
            if icon_class in _FEATURE_ICON_KEYS:
                feature_key = _FEATURE_ICON_KEYS[icon_class]
                property_features[feature_key] = text.get_text(strip=True)
                break
    return property_features


def _extract_amenities(listing):
    """Return the non-empty amenity texts of *listing*."""
    texts = (amenity.get_text(strip=True)
             for amenity in listing.find_all('div', class_='adFeature'))
    return [text for text in texts if text]


def _scrape_listing(listing, headers, position):
    """Build the output dict for one listing box (fetches its detail page)."""
    title_elem = listing.find('h2', class_='listingTit')
    price_elem = listing.find('span', class_='priceTag')
    location_elem = listing.find('span', class_='listingH3')

    title_text = title_elem.get_text(strip=True) if title_elem else 'No Title'
    print(f"📑 Scraping house #{position}: {title_text}")

    ad_id_elem = listing.find('input', class_='adId')
    image_url = _extract_image_url(listing)
    full_link = _resolve_listing_link(listing)

    detailed_chars = {}
    if full_link:
        print(f"  ↪ Getting details from: {full_link}")
        time.sleep(random.uniform(0.5, 1.5))  # Respectful scraping
        detailed_chars = extract_detailed_characteristics(full_link, headers)

    # Key order is kept stable because it determines the JSON layout.
    return {
        'title': title_text,
        'price': price_elem.get_text(strip=True) if price_elem else 'No Price',
        'location': location_elem.get_text(strip=True) if location_elem else 'No Location',
        'link': full_link,
        # .get() avoids a KeyError (and the loss of the whole listing) when
        # the input element exists but has no 'value' attribute.
        'ad_id': ad_id_elem.get('value', 'No Ad ID') if ad_id_elem else 'No Ad ID',
        'image_url': image_url or 'No Image',
        'detailed_characteristics': detailed_chars,
        'features': _extract_property_features(listing),
        'amenities': _extract_amenities(listing),
        'scraped_at': datetime.now().isoformat(),
    }


def scrape_mubawab_listings(url):
    """Scrape every listing found on one Mubawab results page.

    For each ``div.listingBox`` on the page, the basic fields are read from
    the results page, and the listing's own detail page is fetched (after a
    short random pause) to collect its detailed characteristics.

    Args:
        url: URL of the results page to scrape.

    Returns:
        A list with one dict per successfully processed listing (keys:
        title, price, location, link, ad_id, image_url,
        detailed_characteristics, features, amenities, scraped_at), or
        ``None`` when the page itself could not be fetched or parsed.
        A listing that raises while being processed is reported and skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        print(f"🌐 Fetching page: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        print("📋 Parsing HTML...")
        soup = BeautifulSoup(response.text, 'html.parser')

        listings = soup.find_all('div', class_='listingBox')
        print(f"🔍 Found {len(listings)} listings")

        data = []
        for position, listing in enumerate(listings, start=1):
            try:
                listing_data = _scrape_listing(listing, headers, position)
            except Exception as e:
                # Best effort: one malformed listing must not lose the page.
                print(f"❌ Error processing a listing: {e}")
                continue
            data.append(listing_data)
            print(f"  ✅ Successfully scraped: {listing_data['title']}")

        print(f"🏠 Completed page with {len(data)} houses successfully scraped")
        return data

    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching the page: {e}")
        return None
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        return None

def scrape_multiple_pages(total_pages=290):
    """Scrape result pages 1..total_pages and save all listings to one JSON file.

    Each page is scraped with ``scrape_mubawab_listings``; a random pause of
    2-5 seconds is inserted between pages. When at least one listing was
    collected, everything is written to
    ``mubawab_properties_<timestamp>.json`` in the current directory and the
    first listing is printed as a sample.

    Args:
        total_pages: Number of result pages to scrape, starting at page 1.
            Defaults to 290, the value that used to be hard-coded.

    Returns:
        None. Results are reported on stdout and written to the JSON file.
    """
    # The timestamp is taken once, up front, so the file name reflects the
    # start of the run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f'mubawab_properties_{timestamp}.json'

    all_listings = []

    for page in range(1, total_pages + 1):
        url = f"https://www.mubawab.tn/fr/cc/immobilier-a-vendre-all:sc:apartment-sale,farm-sale,house-sale,other-sale,villa-sale:p:{page}"

        print(f"\n{'='*50}")
        print(f"🔄 Starting to scrape page {page} of {total_pages}")
        print(f"{'='*50}\n")

        listings_data = scrape_mubawab_listings(url)

        # A falsy result covers both a failed fetch (None) and an empty page.
        if listings_data:
            all_listings.extend(listings_data)
            print(f"📊 Processed page {page}: Added {len(listings_data)} listings")
            print(f"📈 Total properties scraped so far: {len(all_listings)}")
        else:
            print(f"❌ Failed to scrape page {page}")

        # Pause between pages to be respectful; no wait after the last one.
        if page < total_pages:
            wait_time = random.uniform(2, 5)
            print(f"⏱️ Waiting {wait_time:.1f} seconds before the next page...")
            time.sleep(wait_time)

    if not all_listings:
        print("\n❌ Scraping failed. No data was saved.")
        return

    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(all_listings, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Successfully scraped {len(all_listings)} total listings")
    print(f"💾 Data saved to '{json_filename}'")

    print("\n📝 Sample listing:")
    print(json.dumps(all_listings[0], indent=2, ensure_ascii=False))

# Execute the scraping process
if __name__ == "__main__":
    # Script entry point: announce the run, scrape everything, report timing.
    print("🏘️ Starting Mubawab property scraper")
    print("⏰ Process started at:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    began_at = time.time()
    scrape_multiple_pages()
    duration = time.time() - began_at

    # Break the elapsed seconds down into hours, minutes and seconds.
    whole_hours, leftover = divmod(duration, 3600)
    whole_minutes, whole_seconds = divmod(leftover, 60)
    clock_parts = (int(whole_hours), int(whole_minutes), int(whole_seconds))

    print("\n⌛ Total execution time: {:02}:{:02}:{:02}".format(*clock_parts))
    print("🏁 Scraping process completed!")

🚀 Libraries imported successfully!
🏘️ Starting Mubawab property scraper
⏰ Process started at: 2025-03-30 16:47:14

🔄 Starting to scrape page 1 of 290

🌐 Fetching page: https://www.mubawab.tn/fr/cc/immobilier-a-vendre-all:sc:apartment-sale,farm-sale,house-sale,other-sale,villa-sale:p:1
📋 Parsing HTML...
🔍 Found 35 listings
📑 Scraping house #1: Appartement en vente à La Soukra
  ↪ Getting details from: https://www.mubawab.tn/fr/pa/8038583/appartement-en-vente-%C3%A0-la-soukra
  ✅ Successfully scraped: Appartement en vente à La Soukra
📑 Scraping house #2: Superbe appartement à vendre à Ain Zaghouan Nord. Superficie...
  ↪ Getting details from: https://www.mubawab.tn/fr/pa/8036651/superbe-appartement-%C3%A0-vendre-%C3%A0-ain-zaghouan-nord-superficie-132m%C2%B2
  ✅ Successfully scraped: Superbe appartement à vendre à Ain Zaghouan Nord. Superficie...
📑 Scraping house #3: Jolie Appartement à vendre
  ↪ Getting details from: https://www.mubawab.tn/fr/pa/7881341/jolie-appartement-%C3%A0-vendre
