In [1]:
import ast
import csv
import json
import os
import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _parse_json_ld_items(soup):
    """Return the dict entries of the first JSON-LD script on the page (possibly empty)."""
    script = soup.find('script', type='application/ld+json')
    if script is None or not script.string:
        return []
    try:
        data = json.loads(script.string)
    except json.JSONDecodeError:
        # Malformed JSON-LD is treated like a missing block: the fields that
        # depend on it are simply left out of the result.
        return []
    # JSON-LD may be a single object or a list of objects; normalise to a list.
    if isinstance(data, dict):
        data = [data]
    elif not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]


def _parse_digital_data(soup):
    """Return the object assigned to ``var digitalData`` in an inline script, or {}."""
    script = soup.find('script', string=re.compile('var digitalData'))
    if script is None or not script.string:
        return {}
    assignment = re.search(r'var digitalData\s*=\s*', script.string)
    if assignment is None:
        return {}
    try:
        # raw_decode parses exactly one JSON value starting at the given offset,
        # so a ';' inside a JSON string can no longer truncate the payload the
        # way a non-greedy "(.+?);" regex does.
        data, _ = json.JSONDecoder().raw_decode(script.string, assignment.end())
    except json.JSONDecodeError:
        return {}
    return _as_dict(data)


def extract_property_info(url):
    """Fetch a single property page and extract its key attributes.

    Args:
        url: URL of one specific property listing.

    Returns:
        dict with the keys ``price``, ``area``, ``bedrooms``, ``bathrooms``,
        ``parking``, ``agency``, ``nbn_type``, ``property_type``,
        ``nearby_schools`` and ``age_distribution``. ``location`` and ``geo``
        are added only when the page's JSON-LD data provides them. Values that
        cannot be found are ``None`` (or empty containers).

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    json_data = _parse_json_ld_items(soup)
    digital_data = _parse_digital_data(soup)

    # Tolerate missing or null intermediate levels in the digitalData tree.
    page_info = _as_dict(_as_dict(digital_data.get('page')).get('pageInfo'))
    property_data = _as_dict(page_info.get('property'))

    property_info = {}

    # Location: taken from the first JSON-LD item typed "Residence".
    for item in json_data:
        if item.get('@type') == 'Residence':
            address = _as_dict(item.get('address'))
            property_info['location'] = {
                'streetAddress': address.get('streetAddress'),
                'addressLocality': address.get('addressLocality'),
                'addressRegion': address.get('addressRegion'),
                'postalCode': address.get('postalCode')
            }
            break

    # Price
    property_info['price'] = property_data.get('price')

    # Floor area is not read from any of the data sources parsed here.
    property_info['area'] = 'Not available'

    # Bedrooms, bathrooms and parking spaces
    property_info['bedrooms'] = property_data.get('bedrooms')
    property_info['bathrooms'] = property_data.get('bathrooms')
    property_info['parking'] = property_data.get('parking')

    # Agency id and name
    property_info['agency'] = {
        'id': property_data.get('agencyId'),
        'name': property_data.get('agency')
    }

    # NBN type
    property_info['nbn_type'] = page_info.get('nbnDetails')

    # Property type
    property_info['property_type'] = property_data.get('primaryPropertyType')

    # Coordinates: taken from the first JSON-LD "Event" item that has a location.
    for item in json_data:
        if item.get('@type') == 'Event' and 'location' in item:
            geo = _as_dict(_as_dict(item['location']).get('geo'))
            property_info['geo'] = {
                'latitude': geo.get('latitude'),
                'longitude': geo.get('longitude')
            }
            break

    # Nearby schools: scan every element that may carry a school name.
    schools = []
    school_elements = soup.find_all(['div', 'label'], class_=['css-1eyghyo', 'domain-checkbox'])
    for school in school_elements:
        school_name = school.find(['h4', 'div'], class_=['css-5w5cop', 'domain-checkbox__label'])
        if school_name:
            schools.append(school_name.text.strip())

    # Drop duplicate names while keeping first-seen order, so repeated runs
    # produce the same list (set() ordering is arbitrary).
    property_info['nearby_schools'] = list(dict.fromkeys(schools))

    # Neighbourhood age distribution: age range label -> percentage text.
    age_distribution = {}
    for row in soup.find_all('tr', class_='css-1a43shy'):
        age_range = row.find('td', class_='css-1srjr3j')
        percentage = row.find('div', class_='css-199ul8s')
        if age_range and percentage:
            age_distribution[age_range.text.strip()] = percentage.text.strip()
    property_info['age_distribution'] = age_distribution

    return property_info

# Try the extractor on one known listing and show the resulting record.
url = 'https://www.domain.com.au/9b-131-lonsdale-street-melbourne-vic-3000-17186660'
property_info = extract_property_info(url=url)
print(property_info)

In [8]:
def extract_property_urls(url):
    """Collect property listing URLs from a search-results page.

    URLs are taken from JSON-LD items typed "Event" that carry a ``url`` key.
    If none are found, anchor tags whose href starts with
    ``https://www.domain.com.au/`` and contains a hyphen are used instead.

    Args:
        url: URL of a search-results page.

    Returns:
        list of unique URLs, in the order they first appear on the page.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    property_urls = []

    # First choice: URLs embedded in the JSON-LD structured data.
    for script in soup.find_all('script', type='application/ld+json'):
        # .string is None for an empty script tag; json.loads(None) would raise
        # a TypeError that the JSONDecodeError handler below does not cover.
        if not script.string:
            continue
        try:
            data = json.loads(script.string)
        except json.JSONDecodeError:
            # Best effort: skip a malformed block and keep scanning the others.
            continue
        items = data if isinstance(data, list) else [data]
        for item in items:
            if isinstance(item, dict) and item.get('@type') == 'Event' and 'url' in item:
                property_urls.append(item['url'])

    # Fallback: scan anchor tags when the structured data yielded nothing.
    if not property_urls:
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('https://www.domain.com.au/') and '-' in href:
                property_urls.append(href)

    # De-duplicate while preserving page order (set() ordering is arbitrary).
    return list(dict.fromkeys(property_urls))

# Collect and show the listing URLs found on the Victorian rentals landing page.
base_url = "https://www.domain.com.au/rent/vic/"
urls = extract_property_urls(url=base_url)
print(urls)

In [9]:
def convert_to_csv(property_l, output_path):
    """Write a list of property dicts to a CSV file.

    The header is the sorted union of all keys found in the records; keys a
    record lacks are written as empty cells.

    Args:
        property_l: list of dicts, or a string holding the Python literal of
            such a list (parsed with ``ast.literal_eval``).
        output_path: destination CSV path. Missing parent directories are
            created.
    """
    # Accept the textual repr of a list as well as a real list.
    if isinstance(property_l, str):
        property_l = ast.literal_eval(property_l)

    # Gather every key that occurs in any record to use as the column names.
    fieldnames = set()
    for property_info in property_l:
        fieldnames.update(property_info.keys())
    fieldnames = sorted(fieldnames)

    # Create the output directory if needed. A bare filename has an empty
    # dirname, and os.makedirs('') raises, so only call it for a real directory.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(property_l)

    print(f"CSV file has been created at: {output_path}")


output_path = "../data/landing/properties.csv"

# Call the CSV writer defined above. The original line called
# convert_property_list_to_csv, which is not defined anywhere in this notebook.
# NOTE(review): the original argument `a` is not defined in any visible cell;
# the single record scraped earlier (`property_info`) is assumed to be the
# intended sample input - confirm.
convert_to_csv([property_info], output_path)

In [10]:
def scrape_properties(base_url, page_l, output_path, delay=1.0):
    """Scrape several result pages and save every listing found to a CSV file.

    Args:
        base_url: search URL prefix to which each page number is appended.
        page_l: iterable of page numbers to visit.
        output_path: destination CSV path, passed on to ``convert_to_csv``.
        delay: seconds to pause after each request, to avoid putting too much
            load on the server. Use 0 to disable.
    """
    url_l = []
    for page in page_l:
        url_l.extend(extract_property_urls(base_url + str(page)))
        if delay > 0:
            sleep(delay)

    # The same listing can show up on more than one result page; fetch it once.
    url_l = list(dict.fromkeys(url_l))

    property_l = []
    for a_url in url_l:
        try:
            property_l.append(extract_property_info(a_url))
        except (requests.RequestException, ValueError) as exc:
            # One unreachable or unparsable listing should not throw away the
            # records already collected; report it and carry on.
            print(f"Skipping {a_url}: {exc}")
        if delay > 0:
            sleep(delay)

    convert_to_csv(property_l, output_path)

In [11]:
# Scrape result pages 1-7 of the Victorian rentals search and store them as CSV.
base_url = "https://www.domain.com.au/rent/vic/?page="
output_path = "../data/landing/properties.csv"
page_l = range(1, 8)
scrape_properties(base_url=base_url, page_l=page_l, output_path=output_path)

CSV file has been created at: ../data/landing/properties.csv


In [10]:
# Load the scraped listings back from the landing CSV.
property_df = pd.read_csv(filepath_or_buffer="../data/landing/properties.csv")

In [11]:
# Number of rows (scraped listings) in the frame.
df_num = property_df.shape[0]
print(df_num)

58


In [13]:
# Column names present in the scraped data.
df_cols = list(property_df.columns)
print(df_cols)

['address', 'agency', 'agent_names', 'bathrooms', 'bedrooms', 'features', 'nbn_type', 'parking', 'price', 'property_type']


## STOP

In [5]:
def extract_property_info(url):
    """Extract information for a single property from the page's meta tags.

    Args:
        url: URL of one specific property listing (not a search-results page).

    Returns:
        dict with the keys ``address``, ``price``, ``bedrooms``, ``bathrooms``,
        ``property_type``, ``agency``, ``agent_names``, ``features``,
        ``parking`` and ``nbn_type``. Missing text fields are empty strings,
        missing numbers are ``None`` and missing features are an empty list.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Index every meta tag by its "property" attribute, or by "name" when it
    # has no "property" attribute.
    metadata = {}
    for tag in soup.find_all('meta'):
        if 'property' in tag.attrs:
            metadata[tag['property']] = tag.get('content', '')
        elif 'name' in tag.attrs:
            metadata[tag['name']] = tag.get('content', '')

    description = metadata.get('og:description', '')

    # Price: the text between the first '$' and the following '/'.
    price = description.split('$')[1].split('/')[0] if '$' in description else ''

    # Property type: the text between 'rental ' and ' at'. Test for the exact
    # separator used by the split so that a description containing "rental"
    # without the trailing space cannot raise IndexError.
    if 'rental ' in description:
        property_type = description.split('rental ')[1].split(' at')[0]
    else:
        property_type = ''

    # ''.split(', ') returns [''], so report "no features" as an empty list.
    features_text = metadata.get('property:propertyFeatures', '')
    features = features_text.split(', ') if features_text else []

    property_info = {
        'address': metadata.get('og:title', '').split(' - ')[0],
        'price': price,
        'bedrooms': extract_number(description, r'(\d+)\s*bedroom'),
        'bathrooms': extract_number(description, r'(\d+)\s*bathroom'),
        'property_type': property_type,
        'agency': metadata.get('property:agency', ''),
        'agent_names': metadata.get('property:agentNames', ''),
        'features': features,
        'parking': extract_number(metadata.get('property:parking', ''), r'(\d+)'),
        'nbn_type': metadata.get('property:nbnDetails', ''),
    }

    return property_info

def extract_number(text, pattern):
    """Return the first capture group of ``pattern`` in ``text`` as an int.

    Returns ``None`` when the pattern does not match.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    return int(found.group(1))

def extract_date(text):
    """Return whatever follows "Available from " up to the end of ``text``.

    Returns ``None`` when the phrase is not present.
    """
    found = re.search(r'Available from (.+)$', text)
    if found is None:
        return None
    return found.group(1)


#'images': [metadata.get(f'twitter:image{i}', '') for i in range(4) if f'twitter:image{i}' in metadata],
#

In [1]:
print('haha')

# Quick look at a parsed page. These statements were indented as if they still
# lived inside a function, which makes the cell an IndentationError; they now
# sit at top level.
# NOTE(review): `soup` is not defined in this cell - it is assumed to be a
# BeautifulSoup object left in the kernel by an earlier cell; confirm.
print("Soup Content (first 1000 characters):")
print(soup.prettify()[:1000])

print("\nSoup Tags (first 20):")
for i, tag in enumerate(soup.find_all()):
    if i >= 20:
        break
    print(f"{tag.name}: {tag.get('class', 'No class')} - {tag.get('id', 'No id')}")

print("\nAll unique tag names:")
print(set(tag.name for tag in soup.find_all()))

haha


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep

def view_soup_content(url):
    """Download ``url`` and print the whole parsed document."""
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    page = requests.get(url, headers={'User-Agent': user_agent})
    print(BeautifulSoup(page.content, 'html.parser'))

def get_html_structure(url):
    """Download ``url`` and print the tag tree under the body, two spaces per nesting level."""
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    page = requests.get(url, headers={'User-Agent': user_agent})
    document = BeautifulSoup(page.content, 'html.parser')

    def print_structure(element, indent=0):
        # Print this element's name, then recurse into the children that have
        # a name; children whose name is None are skipped.
        print(f"{' ' * indent}{element.name}")
        named_children = (node for node in element.children if node.name is not None)
        for node in named_children:
            print_structure(node, indent + 2)

    print_structure(document.body)

def scrape_apartments(url):
    """Download ``url`` and return one dict (title, price, location) per listing.

    The selectors used here must be adapted to the real structure of the site.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    page = requests.get(url, headers={'User-Agent': user_agent})
    parsed = BeautifulSoup(page.content, 'html.parser')

    def stripped_text(node, tag, css_class):
        # Text of the first matching descendant, without surrounding whitespace.
        return node.find(tag, class_=css_class).text.strip()

    return [
        {
            'title': stripped_text(listing, 'h2', 'title'),
            'price': stripped_text(listing, 'span', 'price'),
            'location': stripped_text(listing, 'div', 'location'),
        }
        for listing in parsed.find_all('div', class_='apartment-listing')
    ]

def save_to_csv(apartments, filename):
    """Write apartment dicts to ``filename`` as CSV with title, price and location columns."""
    columns = ['title', 'price', 'location']
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(apartments)

def main():
    """Preview the first results page, then scrape pages 1-5 into a CSV file.

    Execution pauses for an Enter key press between the preview and the scrape.
    """
    base_url = 'https://www.domain.com.au/rent/?state=vic&page='

    # Show the raw parsed page first so the selectors can be checked by eye.
    print("Viewing Soup Content:")
    view_soup_content(f"{base_url}1")

    # Wait for the user to confirm before starting the scrape.
    input("Press Enter to continue with scraping...")

    all_apartments = []

    # Only the first five result pages are scraped.
    for page_number in range(1, 6):
        found = scrape_apartments(f"{base_url}{page_number}")
        all_apartments += found
        print(f'Scraped page {page_number}, found {len(found)} apartments')
        # Pause two seconds between requests to avoid overloading the server.
        sleep(2)

    save_to_csv(all_apartments, 'victoria_apartments.csv')
    print(f'Scraped a total of {len(all_apartments)} apartments')


if __name__ == "__main__":
    main()

Viewing Soup Content:
<!DOCTYPE html>
<html data-build-git-hash="4135e2bc2e79557ea1debc6c6a2b2e220ba80dae" data-build-id="master-6261" data-build-time="Wed Aug 28 2024 14:18:38 GMT+1000 (Australian Eastern Standard Time)" data-version="9.126.0" lang="en-AU"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><title>4000+ Rental Properties in vic | Domain</title><meta content="Domain has 4000+ Rental Properties in vic &amp; surrounding suburbs. View our listings &amp; use our detailed filters to find your perfect home." name="description"/><link href="android-app://com.fairfax.domain/https/www.domain.com.au/" rel="alternate"/><link href="ios-app://319908646/domain.com.au/domainapp" rel="alternate"/><meta content="4000+ Rental Properties in vic | Domain" property="og:title"/><meta content="Domain" property="og:site_name"/><meta property="og:url"/><meta content="Domain has 4000+ Rental Properties in vic &amp; surrounding suburbs. View our list

In [2]:
def scrape(url):
    """Download ``url`` and return one dict (title, price, location) per listing.

    The selectors used here must be adapted to the real structure of the site.

    Args:
        url: URL of a results page.

    Returns:
        list of dicts with the keys ``title``, ``price`` and ``location``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    properties = []

    # The body previously referred to `apartment` and `apartments`, names that
    # do not exist in this function; use the loop variable and the list
    # actually defined here.
    for prop in soup.find_all('div', class_='apartment-listing'):
        title = prop.find('h2', class_='title').text.strip()
        price = prop.find('span', class_='price').text.strip()
        location = prop.find('div', class_='location').text.strip()

        properties.append({
            'title': title,
            'price': price,
            'location': location
        })

    return properties

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from time import sleep
import os

def view_soup_content(url):
    """Download ``url`` and print a short summary of the parsed page.

    Prints the first 1000 characters of the prettified document, the name,
    class and id of the first 20 tags, and the set of distinct tag names.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    page = requests.get(url, headers={'User-Agent': user_agent})
    parsed = BeautifulSoup(page.content, 'html.parser')

    print("Soup Content (first 1000 characters):")
    print(parsed.prettify()[:1000])

    print("\nSoup Tags (first 20):")
    for tag in parsed.find_all()[:20]:
        print(f"{tag.name}: {tag.get('class', 'No class')} - {tag.get('id', 'No id')}")

    print("\nAll unique tag names:")
    print({tag.name for tag in parsed.find_all()})
    
def get_apartment_links(url):
    """Download ``url`` and return the href of every anchor with class 'apartment-link'.

    The class name is an assumption about how the page marks listing links.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    page = requests.get(url, headers={'User-Agent': user_agent})
    parsed = BeautifulSoup(page.content, 'html.parser')

    return [anchor['href'] for anchor in parsed.find_all('a', class_='apartment-link')]

In [None]:
def print_structure(element, indent=0):
    """Print the tag tree rooted at ``element``, two extra spaces per nesting level.

    Args:
        element: node exposing ``name`` and ``children``.
        indent: number of spaces to print before this element's name.
    """
    print(' ' * indent + str(element.name))
    for child in element.children:
        # Children whose name is None are skipped.
        if child.name is not None:
            print_structure(child, indent + 2)


# The call was indented as if it were still nested inside another function,
# which made this cell an IndentationError; it now runs at top level.
# NOTE(review): `soup` is not defined in this cell - it is assumed to be a
# BeautifulSoup object left in the kernel by an earlier cell; confirm.
print_structure(soup.body)