In [1]:
import ast
import csv
import json
import os
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def _dict_or_empty(value):
    """Return ``value`` unchanged when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def extract_property_info(url, timeout=10):
    """Fetch one listing page and extract its property details.

    Data is read from three places in the page: the first JSON-LD
    ``<script>`` tag, the inline ``var digitalData = {...}`` script, and a
    handful of HTML elements selected by CSS class.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``price``, ``area``, ``bedrooms``, ``bathrooms``,
        ``parking``, ``agency``, ``nbn_type``, ``property_type``,
        ``nearby_schools`` and ``age_distribution``; ``geo`` and ``location``
        are added only when the matching JSON-LD item is present. Returns
        ``None`` when the page could not be fetched.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

    property_info = {}

    # --- JSON-LD data (first script tag only) ---
    json_data = []
    json_ld = soup.find('script', type='application/ld+json')
    # .string is None for an empty tag or one with several child nodes.
    json_ld_text = json_ld.string if json_ld else None
    if json_ld_text:
        try:
            json_data = json.loads(json_ld_text)
            if not isinstance(json_data, list):
                json_data = [json_data]
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON-LD data from {url}: {e}")
            print("JSON-LD content:", json_ld_text[:500])
            json_data = []
    else:
        print(f"No JSON-LD data found in {url}")

    # --- digitalData object embedded in an inline script ---
    digital_data = {}
    digital_data_script = soup.find('script', string=re.compile('var digitalData'))
    if digital_data_script:
        script_text = digital_data_script.string or ''
        assignment = re.search(r'var digitalData\s*=\s*', script_text)
        if assignment:
            try:
                # raw_decode parses exactly one JSON value and ignores whatever
                # follows it, so a ';' inside a string value (e.g. "&amp;")
                # no longer truncates the payload.
                digital_data, _ = json.JSONDecoder().raw_decode(script_text, assignment.end())
            except json.JSONDecodeError as e:
                print(f"Error decoding digitalData from {url}: {e}")
                print("digitalData content:", script_text[assignment.end():assignment.end() + 500])
                digital_data = {}
        else:
            print(f"digitalData pattern not found in {url}")
    else:
        print(f"digitalData script not found in {url}")

    if not isinstance(digital_data, dict):
        print(f"Unexpected digitalData structure: {type(digital_data)}")
        digital_data = {}

    # --- Property attributes, with a type check at every nesting level ---
    page_info = digital_data.get('page', {})
    if not isinstance(page_info, dict):
        print(f"Unexpected 'page' structure in digital_data: {type(page_info)}")
        page_info = {}

    page_details = _dict_or_empty(page_info.get('pageInfo'))

    property_data = page_details.get('property', {})
    if not isinstance(property_data, dict):
        print(f"Unexpected 'property' structure in digital_data: {type(property_data)}")
        property_data = {}

    property_info['price'] = property_data.get('price')
    property_info['area'] = 'Not available'
    property_info['bedrooms'] = property_data.get('bedrooms')
    property_info['bathrooms'] = property_data.get('bathrooms')
    property_info['parking'] = property_data.get('parking')
    property_info['agency'] = {
        'id': property_data.get('agencyId'),
        'name': property_data.get('agency')
    }
    property_info['nbn_type'] = page_details.get('nbnDetails')
    property_info['property_type'] = property_data.get('primaryPropertyType')

    # --- Geographic coordinates: first JSON-LD 'Event' item with a location ---
    for item in json_data:
        if isinstance(item, dict) and item.get('@type') == 'Event' and 'location' in item:
            geo = _dict_or_empty(_dict_or_empty(item['location']).get('geo'))
            property_info['geo'] = {
                'latitude': geo.get('latitude'),
                'longitude': geo.get('longitude')
            }
            break

    # --- Postal address: first JSON-LD 'Residence' item ---
    for item in json_data:
        if isinstance(item, dict) and item.get('@type') == 'Residence':
            address = _dict_or_empty(item.get('address'))
            property_info['location'] = {
                'streetAddress': address.get('streetAddress'),
                'addressLocality': address.get('addressLocality'),
                'addressRegion': address.get('addressRegion'),
                'postalCode': address.get('postalCode')
            }
            break

    # --- Nearby schools ---
    # NOTE(review): the css-* class names look auto-generated and may change
    # whenever the site is redeployed; confirm they still match.
    schools = []
    school_elements = soup.find_all(['div', 'label'], class_=['css-1eyghyo', 'domain-checkbox'])
    for school in school_elements:
        school_name = school.find(['h4', 'div'], class_=['css-5w5cop', 'domain-checkbox__label'])
        if school_name:
            schools.append(school_name.text.strip())
    # dict.fromkeys removes duplicates while keeping page order, so repeated
    # runs produce the same list.
    property_info['nearby_schools'] = list(dict.fromkeys(schools))

    # --- Neighbourhood age distribution ---
    age_distribution = {}
    age_rows = soup.find_all('tr', class_='css-1a43shy')
    for row in age_rows:
        age_range = row.find('td', class_='css-1srjr3j')
        percentage = row.find('div', class_='css-199ul8s')
        if age_range and percentage:
            age_distribution[age_range.text.strip()] = percentage.text.strip()
    property_info['age_distribution'] = age_distribution

    return property_info

# Smoke test: run the extractor against a single listing page and show the
# resulting dict (prints None if the page could not be fetched).
url = 'https://www.domain.com.au/9b-131-lonsdale-street-melbourne-vic-3000-17186660'
property_info = extract_property_info(url)
print(property_info)

In [3]:
def extract_property_urls(url, timeout=10):
    """Collect the listing URLs that appear on one search-results page.

    Candidates come from two sources: the ``url`` field of JSON-LD items whose
    ``@type`` is ``Event``, and ``<a href>`` links that end in ``-<digits>``.
    Root-relative links are resolved against ``https://www.domain.com.au``.

    Args:
        url: Address of the search-results page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of unique URLs, in order of first appearance, that end in
        ``-<digits>`` and do not contain ``suburb-profile``.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # Without a timeout a stalled connection would block the whole crawl.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    property_urls = []

    # Source 1: JSON-LD script tags (best effort; malformed tags are skipped).
    for script in soup.find_all('script', type='application/ld+json'):
        if not script.string:
            continue  # empty tag: nothing to parse
        try:
            data = json.loads(script.string)
        except json.JSONDecodeError:
            continue
        items = data if isinstance(data, list) else [data]
        for item in items:
            if (isinstance(item, dict)
                    and item.get('@type') == 'Event'
                    and isinstance(item.get('url'), str)):
                property_urls.append(item['url'])

    # Source 2: anchor tags.
    domain_pattern = re.compile(r'https?://www\.domain\.com\.au/.*-\d+$')
    for link in soup.find_all('a', href=True):
        href = link['href']
        if domain_pattern.match(href):
            property_urls.append(href)
        elif href.startswith('/') and '-' in href and href.split('-')[-1].isdigit():
            # Resolve root-relative links. No trailing slash is added: the
            # final filter requires the URL to end in digits, so a trailing
            # slash would cause every relative link to be discarded.
            property_urls.append(f"https://www.domain.com.au{href}")

    # De-duplicate while keeping first-seen order so results are reproducible.
    listing_pattern = re.compile(r'/[\w-]+-\d+$')
    return [
        candidate for candidate in dict.fromkeys(property_urls)
        if listing_pattern.search(candidate) and 'suburb-profile' not in candidate
    ]

# Smoke test: count the listing URLs found on the Victoria-wide rental search page.
base_url = "https://www.domain.com.au/rent/vic/"
urls = extract_property_urls(base_url)
#print(json.dumps(urls, indent=2))
print(len(urls))

In [4]:
def convert_to_csv(property_l, output_path):
    """Write a collection of property dicts to a CSV file.

    Args:
        property_l: An iterable of dicts, or a string holding the Python
            literal form of such a list. ``None`` entries are dropped
            silently; other non-dict entries are reported and dropped.
        output_path: Destination file path. Missing parent directories are
            created.

    Returns:
        None. Nothing is written when no valid property remains.
    """
    # A string is taken to be the literal repr of a list and parsed safely
    # (literal_eval only accepts Python literals, never arbitrary code).
    if isinstance(property_l, str):
        property_l = ast.literal_eval(property_l)

    # Keep only dict records; they are the only thing DictWriter can write.
    records = []
    for property_info in property_l or []:
        if property_info is None:
            continue
        if isinstance(property_info, dict):
            records.append(property_info)
        else:
            print(f"Skipping non-dict item: {property_info}")

    if not records:
        print("No valid properties to write to CSV.")
        return

    # Union of all keys in first-seen order, so the column order is stable
    # from run to run (a set would shuffle the columns).
    fieldnames = list(dict.fromkeys(key for record in records for key in record))

    # Create the output directory if needed; dirname is '' for a bare file
    # name, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Keys missing from a record are written as empty cells.
        writer.writerows(records)

    print(f"CSV file has been created at: {output_path}")


output_path = "../data/landing/properties.csv"

# No export is performed here: the previous call referenced a function
# (convert_property_list_to_csv) and a variable (a) that are not defined in
# this notebook, so it failed on a fresh kernel. The CSV is written by the
# later convert_to_csv(properties, output_path) call, once scrape_properties
# has produced `properties`.

In [5]:
def scrape_properties(base_url, page_l, output_path):
    """Crawl paginated search results and scrape every listing found.

    Args:
        base_url: One search-URL prefix, or an iterable of them. Each prefix
            must end where the page number is to be appended
            (e.g. ``".../?ssubs=0&page="``).
        page_l: Iterable of page numbers to request for each prefix. Paging
            for a prefix stops at the first page that fails or yields no
            listing URLs.
        output_path: Currently unused; kept so existing callers keep working.

    Returns:
        A list with one entry per unique listing URL: the dict returned by
        ``extract_property_info``, or ``None`` where that fetch failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    request_timeout = 10   # seconds per HTTP request
    progress_every = 200   # print a progress line after this many listings

    # Use the argument that was passed in rather than a notebook global, and
    # accept a single prefix as well as a collection of prefixes.
    search_prefixes = [base_url] if isinstance(base_url, str) else list(base_url)

    url_l = []
    for search_prefix in search_prefixes:
        try:
            probe = requests.get(search_prefix, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # One unreachable suburb must not abort the whole crawl.
            print(f"Error fetching URL {search_prefix}: {e}")
        else:
            if probe.status_code == 200:
                for page in page_l:
                    url = search_prefix + str(page)
                    try:
                        response = requests.get(url, headers=headers, timeout=request_timeout)
                        response.raise_for_status()
                        new_urls = extract_property_urls(url)
                    except requests.RequestException as e:
                        print(f"Error fetching URL {url}: {e}")
                        break
                    if not new_urls:
                        break  # ran past the last results page
                    url_l.extend(new_urls)

        print(f"number of url = {len(url_l)}")

    # De-duplicate while keeping discovery order so runs are reproducible.
    url_l = list(dict.fromkeys(url_l))

    property_l = []
    for count, a_url in enumerate(url_l, start=1):
        property_l.append(extract_property_info(a_url))
        if count % progress_every == 0:
            print(f"finish {count}")

    return property_l

In [6]:
# Build one paginated rental-search URL prefix per (suburb, postcode) row of
# the postcode CSV; the page number is appended to each prefix later.
base_url_l = []
with open('../data/landing/victoria_suburbs_postcodes.csv', 'r', newline='', encoding='utf-8') as suburb_file:
    suburb_rows = csv.reader(suburb_file)
    next(suburb_rows)  # skip the header row
    for record in suburb_rows:
        if len(record) < 2:
            continue  # row lacks a suburb or postcode column
        suburb, postcode = record[:2]
        suburb_url = suburb.lower().replace(' ', '-')
        base_url_l.append(
            f"https://www.domain.com.au/rent/{suburb_url}-vic-{postcode}/?ssubs=0&page="
        )
#print(base_url_l)

In [7]:
# Destination of the CSV export.
output_path = "../data/landing/properties.csv"
# Uncomment to limit the run to a single suburb while debugging:
#base_url_l = ['https://www.domain.com.au/rent/cranbourne-vic-3977/?ssubs=0&page=']
# Result pages 1 through 50 are requested for every search URL prefix.
page_l = range(1,51)
# Long-running: issues HTTP requests for every prefix/page and then for every
# listing URL discovered.
properties = scrape_properties(base_url_l, page_l, output_path)

number of url = 886
number of url = 886
number of url = 923
number of url = 1013
number of url = 1121
number of url = 1527
number of url = 1726
number of url = 1865
number of url = 1882
number of url = 1940
number of url = 1967
number of url = 1979
number of url = 2020
number of url = 2048
number of url = 2067
number of url = 2108
number of url = 2122
number of url = 2251
number of url = 2286
number of url = 2336
number of url = 2372
number of url = 2396
number of url = 2425
number of url = 2439
number of url = 2499
number of url = 2528
number of url = 2569
number of url = 2634
number of url = 2645
number of url = 2666
number of url = 2773
number of url = 2804
number of url = 2993
number of url = 3019
number of url = 3082
number of url = 3165
number of url = 3236
number of url = 3303
number of url = 3315
number of url = 3335
number of url = 3371
number of url = 3420
number of url = 3481
number of url = 3493
number of url = 3524
number of url = 3568
number of url = 3596
number of url = 

In [8]:
# Write the scraped property records to the CSV file at output_path.
convert_to_csv(properties, output_path)

CSV file has been created at: ../data/landing/properties.csv
