In [6]:
import requests
from bs4 import BeautifulSoup

# 目标URL
url = "https://www.matthewproctor.com/full_australian_postcodes_vic"

# 获取网页内容
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# 找到表格
table = soup.find('table')

# 提取表格中的数据并存入列表，跳过标题行
data = []
rows = table.find_all('tr')[1:]  # 跳过表格的第一行标题
for row in rows:
    columns = row.find_all('td')
    if len(columns) > 2:  # 确保列数足够
        Postcode = columns[1].text.strip()  # 第二列是邮政编码
        Locality = columns[2].text.strip()  # 第三列是地区
        data.append([Locality, Postcode])

# 输出结果
postcode_locality_list = data



In [8]:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL
base_url = 'https://www.oldlistings.com.au/real-estate/VIC'

# Fake headers to simulate a browser visit

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Referer': 'https://www.oldlistings.com.au',
    'Accept-Language': 'en-US,en;q=0.9'
}

# List of Victoria cities and postcodes
victoria_cities_postcodes = postcode_locality_list  # Assuming postcode_data is the nested list constructed earlier

# Open CSV file to save the extracted data
with open('../data/landing/historical_data.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write headers for CSV
    writer.writerow(['Address', 'Latitude', 'Longitude', 'Bedrooms', 'Bathrooms', 'Property Type', 'Last Advertised Price', 'Price History'])

    # Loop through each city and postcode, generate URL and try scraping
    for city, postcode in victoria_cities_postcodes:
        # Replace spaces in city names with plus signs for the URL
        city_url = city.replace(' ', '+')

        # First, check the first page to see if there is any valid data
        url = f'{base_url}/{city_url}/{postcode}/rent/1'
        print(f'Checking first page: {url}')
        
        # Send request to get the first page content
        response = requests.get(url, headers=headers)
        
        # Check response status
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Assuming there is property information on the page, extract address, price, etc.
            property_div = soup.find('div', class_='property')

            # If the first page doesn't contain any data, skip the current postcode
            if not property_div:
                print(f'No valid property information, skipping postcode: {postcode}')
                continue  # Skip current postcode and move to the next one

        else:
            print(f'Page not found: {url}')
            continue  # If the first page cannot be accessed, skip this postcode

        # If there is data on the first page, continue scraping other pages for the postcode
        for listing_id in range(1, 1001):
            url = f'{base_url}/{city_url}/{postcode}/rent/{listing_id}'
            print(f'Scraping: {url}')
            
            # Send request to get page content
            response = requests.get(url, headers=headers)
            
            # Check response status code
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Assuming there is property information on the page, extract address, price, etc.
                property_div = soup.find('div', class_='property')
                
                if not property_div:
                    print(f'No valid property information, skipping: {url}')
                    continue  # If no data is found on the current page, skip this page

                # Extract latitude and longitude information
                latitude = property_div.get('data-lat', 'No latitude info')
                longitude = property_div.get('data-lng', 'No longitude info')
                
                # Extract address
                address = property_div.find('h2', class_='address').text.strip() if property_div.find('h2', class_='address') else 'No address'

                # Extract number of bedrooms
                bedrooms = property_div.find('p', class_='property-meta bed').text.strip() if property_div.find('p', class_='property-meta bed') else 'No bedroom info'

                # Extract number of bathrooms
                bathrooms = property_div.find('p', class_='property-meta bath').text.strip() if property_div.find('p', class_='property-meta bath') else 'No bathroom info'

                # Extract property type
                property_type = property_div.find('p', class_='property-meta type').text.strip() if property_div.find('p', class_='property-meta type') else 'No type info'

                # Extract last advertised price
                last_price = property_div.find('section', class_='price').find('h3').text.strip() if property_div.find('section', class_='price') else 'No price info'

                # Extract historical price information
                historical_prices_section = property_div.find('section', class_='historical-price')
                historical_prices = historical_prices_section.find_all('li') if historical_prices_section else []

                # Iterate through historical prices
                price_history = []
                for price in historical_prices:
                    date = price.find('span').text.strip()
                    amount = price.contents[1].strip()  # The price is usually the text after the span
                    price_history.append(f"{date}: {amount}")
                
                # Write data to CSV
                writer.writerow([address, latitude, longitude, bedrooms, bathrooms, property_type, last_price, '; '.join(price_history)])

                # Print property information (optional)
                print(f'Address: {address}')
                print(f'Latitude: {latitude}, Longitude: {longitude}')
                print(f'Bedrooms: {bedrooms}')
                print(f'Bathrooms: {bathrooms}')
                print(f'Property Type: {property_type}')
                print(f'Last Advertised Price: {last_price}')
                print("Price History:")
                for history in price_history:
                    print(history)
                print('-' * 40)

            else:
                print(f'Page not found: {url}')


Checking first page: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3000/rent/1
Page not found: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3000/rent/1
Checking first page: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3001/rent/1
Page not found: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3001/rent/1
Checking first page: https://www.oldlistings.com.au/real-estate/VIC/EAST+MELBOURNE/3002/rent/1
Page not found: https://www.oldlistings.com.au/real-estate/VIC/EAST+MELBOURNE/3002/rent/1
Checking first page: https://www.oldlistings.com.au/real-estate/VIC/WEST+MELBOURNE/3003/rent/1
Page not found: https://www.oldlistings.com.au/real-estate/VIC/WEST+MELBOURNE/3003/rent/1
Checking first page: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3004/rent/1
Page not found: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3004/rent/1
Checking first page: https://www.oldlistings.com.au/real-estate/VIC/ST+KILDA+ROAD+CENTRAL/3004/rent/1
Pag

KeyboardInterrupt: 

In [9]:
pip install selenium


Note: you may need to restart the kernel to use updated packages.
