In [1]:
import requests
from bs4 import BeautifulSoup

# Target URL: page holding the table of Victorian postcodes and localities
url = "https://www.matthewproctor.com/full_australian_postcodes_vic"

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the postcode table.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the first <table> on the page and fail with a clear message if the
# page layout has changed and no table is present.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Extract [Locality, Postcode] pairs from the table body
data = []
rows = table.find_all('tr')[1:]  # skip the first row (table header)
for row in rows:
    columns = row.find_all('td')
    if len(columns) > 2:  # ignore rows that do not have enough cells
        Postcode = columns[1].text.strip()  # second column: postcode
        Locality = columns[2].text.strip()  # third column: locality name
        data.append([Locality, Postcode])

# Result consumed by the scraping cell: list of [Locality, Postcode] pairs
postcode_locality_list = data



In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Base URL of the oldlistings.com.au listing pages for Victoria; the scraper
# appends /<locality>/<postcode>/rent/<page> to it.
base_url = 'https://www.oldlistings.com.au/real-estate/VIC'

# Pool of User-Agent strings. One is chosen at random for each request so the
# traffic looks like it comes from different browsers/devices.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15A372 Safari/604.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko'
]

# (locality, postcode) pairs to scrape. postcode_locality_list must already be
# defined when this line runs, otherwise it raises NameError.
victoria_cities_postcodes = postcode_locality_list  # alias of the same list, not a copy

# Function to get the maximum page number
def get_max_page(soup):
    """Return the highest page number advertised by a page's pagination bar.

    Args:
        soup: Parsed page exposing ``find``; the pagination is looked up as
            ``<ul class="pagination">`` and its ``<a>`` links are inspected.

    Returns:
        int: The largest numeric link label, or 1 when the page has no
        pagination element or the pagination holds no numeric links
        (e.g. only "Prev"/"Next").
    """
    pagination = soup.find('ul', class_='pagination')
    if pagination is None:
        return 1

    labels = (link.text.strip() for link in pagination.find_all('a'))
    # isdecimal() only accepts characters that int() can parse, unlike
    # isdigit(), which also matches e.g. superscript digits.
    # default=1 avoids a ValueError when no link label is numeric.
    return max((int(label) for label in labels if label.isdecimal()), default=1)

# Shared HTTP session: every request made through it goes via an adapter that
# retries up to 5 times (with exponential backoff) on the listed 5xx statuses.
session = requests.Session()
retry = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry)
# Register the same retrying adapter for both plain and TLS URLs
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

# (connect, read) timeout in seconds applied to every listing request so a
# stalled connection cannot hang the whole scrape.
REQUEST_TIMEOUT = (10, 30)


def _fetch_soup(page_url, request_headers):
    """Download `page_url` through the shared session and return it parsed.

    Raises requests.exceptions.RequestException on connection problems,
    timeouts, or a 4xx/5xx response.
    """
    page_response = session.get(page_url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    page_response.raise_for_status()
    return BeautifulSoup(page_response.text, 'html.parser')


def _text_or_default(parent, tag, css_class, default):
    """Return the stripped text of the first matching child tag, else `default`."""
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else default


def _parse_price_history(property_div):
    """Return the listing's historical prices as a list of 'date: amount' strings."""
    history_section = property_div.find('section', class_='historical-price')
    entries = history_section.find_all('li') if history_section else []

    price_history = []
    for entry in entries:
        date_span = entry.find('span')
        date = date_span.text.strip() if date_span is not None else 'No date'
        # The amount is expected as the bare text node right after the date <span>
        if len(entry.contents) > 1 and isinstance(entry.contents[1], str):
            amount = entry.contents[1].strip()
        else:
            amount = 'No price info'
        price_history.append(f"{date}: {amount}")
    return price_history


def _parse_property(property_div):
    """Turn one <div class="property"> into a CSV row matching the header order.

    NOTE: captured output of this scraper shows look-alike Unicode digits and
    letters and zero-width characters inside the text fields; values are
    written as-is, so normalise them downstream before analysis.
    """
    latitude = property_div.get('data-lat', 'No latitude info')
    longitude = property_div.get('data-lng', 'No longitude info')

    address = _text_or_default(property_div, 'h2', 'address', 'No address')
    bedrooms = _text_or_default(property_div, 'p', 'property-meta bed', 'No bedroom info')
    bathrooms = _text_or_default(property_div, 'p', 'property-meta bath', 'No bathroom info')
    car_parks = _text_or_default(property_div, 'p', 'property-meta car', 'No car park info')
    property_type = _text_or_default(property_div, 'p', 'property-meta type', 'No type info')

    # Last advertised price lives in the <h3> of the price section, if present
    price_section = property_div.find('section', class_='price')
    price_heading = price_section.find('h3') if price_section else None
    last_price = price_heading.text.strip() if price_heading is not None else 'No price info'

    return [
        address, latitude, longitude, bedrooms, bathrooms, car_parks,
        property_type, last_price, '; '.join(_parse_price_history(property_div)),
    ]


def _polite_sleep():
    """Pause for a random 1-5 seconds between requests to reduce the risk of being blocked."""
    delay = random.uniform(1, 5)
    print(f"Sleeping for {delay:.2f} seconds to prevent being blocked...")
    time.sleep(delay)


# Open CSV file to save the extracted data
with open('../data/landing/historical_data.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write headers for CSV
    writer.writerow(['Address', 'Latitude', 'Longitude', 'Bedrooms', 'Bathrooms', 'Car Parks', 'Property Type', 'Last Advertised Price', 'Price History'])

    # Loop through each locality/postcode pair, build its URL and scrape it
    for city, postcode in victoria_cities_postcodes:
        # Replace spaces in city names with plus signs for the URL
        city_url = city.replace(' ', '+')

        headers = {
            'User-Agent': random.choice(user_agents),  # Randomly choose a User-Agent
            'Referer': 'https://www.oldlistings.com.au',
            'Accept-Language': 'en-US,en;q=0.9'
        }

        # Fetch page 1 once: it tells us the page count and is also the first
        # page of results, so it is reused below instead of being re-downloaded.
        first_url = f'{base_url}/{city_url}/{postcode}/rent/1'
        try:
            first_soup = _fetch_soup(first_url, headers)
        except requests.exceptions.RequestException as e:
            print(f'Error fetching the first page for {city} ({postcode}): {e}')
            _polite_sleep()  # still back off before hitting the next locality
            continue

        max_page = get_max_page(first_soup)
        print(f'Maximum page for {city} ({postcode}): {max_page}')

        # Loop through each page
        for page in range(1, max_page + 1):
            url = f'{base_url}/{city_url}/{postcode}/rent/{page}'
            print(f'Scraping: {url}')

            if page == 1:
                soup = first_soup
            else:
                # Randomly choose a User-Agent for each request
                headers['User-Agent'] = random.choice(user_agents)
                try:
                    soup = _fetch_soup(url, headers)
                except requests.exceptions.RequestException as e:
                    print(f'Error fetching page {page} for {city} ({postcode}): {e}')
                    _polite_sleep()  # do not hammer the server after a failure
                    continue

            # Write and echo every property listing found on the page
            for property_div in soup.find_all('div', class_='property'):
                row = _parse_property(property_div)
                writer.writerow(row)

                (address, latitude, longitude, bedrooms, bathrooms,
                 car_parks, property_type, last_price, price_history_text) = row

                # Optional: Print property information to console
                print(f'Address: {address}')
                print(f'Latitude: {latitude}, Longitude: {longitude}')
                print(f'Bedrooms: {bedrooms}')
                print(f'Bathrooms: {bathrooms}')
                print(f'Car Parks: {car_parks}')
                print(f'Property Type: {property_type}')
                print(f'Last Advertised Price: {last_price}')
                print("Price History:", price_history_text)
                print('-' * 40)

            # Random delay between requests (between 1 and 5 seconds)
            _polite_sleep()


Maximum page for MELBOURNE (3000): 852
Scraping: https://www.oldlistings.com.au/real-estate/VIC/MELBOURNE/3000/rent/1
Address: 452 ST KILDA ROAD, MΕ𝙻ВОՍRNΕ
Latitude: , Longitude: 
Bedrooms: Bed : 𝟷
Bathrooms: Bath : 1
Car Parks: No car park info
Property Type: Category : Servicedapartment
Last Advertised Price: $450
Price History: September 2024: $450; June 2024: $550
----------------------------------------
Address: 807/333 EXHIBITION STREET, MΕ𝙻ВОՍRNΕ
Latitude: , Longitude: 
Bedrooms: Bed : 1
Bathrooms: Bath : 1
Car Parks: No car park info
Property Type: Category : Unit/apmt
Last Advertised Price: $550
Price History: September 2024: $550; April 2024: $𝟻𝟻O; October 2022: $4​40; January 2021: $380 per week; January 2021: $38​0 Weekly
----------------------------------------
Address: ４O𝟸 / ４４𝟸 ST KI𝙻DΑ RОΑD , MELBOU​RNE
Latitude: , Longitude: 
Bedrooms: Bed : 2​
Bathrooms: Bath : 𝟸
Car Parks: Car : 1
Property Type: No type info
Last Advertised Price: $680 per week
Price History: Septemb