# Data Scraping

In this notebook, we download all the necessary datasets and store them in their associated subdirectories within the `data` directory.

### Importing Libraries and Functions

In [1]:
import sys
sys.path.append('../')
import os
from json import dump
from scripts.scrape_sa2 import get_shapefile
from scripts.parallelised_scrape import generate_url_list, fetch_all_rental_data
import asyncio

### Defining Global Variables - changes pending

In [2]:
# Notebook-wide configuration
DATA_ROOT = "data"

# Root of the site handed to generate_url_list
BASE_URL = "https://www.domain.com.au"

# Data directories, written relative to the repository root; the save cell
# prefixes LANDING_PATH with "../" to resolve it from this notebook's folder.
LANDING_PATH = f"{DATA_ROOT}/landing"
RAW_PATH = f"{DATA_ROOT}/raw"  # not referenced by the cells visible in this notebook

### Fetching All The '*domain.com.au*' Rental Property Links

In [None]:
# Build the collection of rental property URLs to scrape, starting from BASE_URL.
# NOTE(review): presumably this crawls the site over the network and can take a
# while — confirm against scripts/parallelised_scrape.py.
url_links = generate_url_list(BASE_URL)

In [4]:
# Sanity check: report how many property URLs were collected
link_count = len(url_links)
print(link_count)

13725


### Fetching The Data From Each Of The URLs 

In [None]:
# Scrape the rental data for every URL collected in url_links.
# NOTE(review): the helper lives in scripts.parallelised_scrape and `asyncio` is
# imported at the top of this notebook but never used in the visible cells —
# confirm whether this call is synchronous or needs to be awaited.
rental_data = fetch_all_rental_data(url_links)

In [6]:
# Persist the scraped rental data as JSON in the landing directory.
landing_dir = f"../{LANDING_PATH}"

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without the check-then-create race of a separate os.path.exists() test.
os.makedirs(landing_dir, exist_ok=True)

# Save the data in the data/landing directory
with open(f"{landing_dir}/all_properties_metadata.json", "w") as f:
    dump(rental_data, f)

# Report success only after the `with` block has closed (and so flushed) the file.
print(f"File saved in the {LANDING_PATH} directory")

File saved in the data/landing directory


## Scraping Suburb Info

In [8]:
# Import the postcode scraping helper
from scripts.scrape_oldlistings import scrape_postcodes_from_file

In [11]:
# Build the suburb/postcode table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which usually means a
# saved index was read back as a data column — check scrape_postcodes_from_file.
postcode_df = scrape_postcodes_from_file()
postcode_df.head()

Unnamed: 0,suburb,postcode
0,Abbotsford,3067
1,Aberfeldie,3040
2,Aberfeldy,3825
3,Acheron,3714
4,Addington,3352


### Scraping Additional Data

In [3]:
# Scrape the SA2 shapefile archive and download it into the SA2 data folder
sa2_shapefile_url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/SA2_2021_AUST_SHP_GDA2020.zip"
sa2_output_dir = '../data/SA2/'

get_shapefile(url=sa2_shapefile_url, output_dir=sa2_output_dir)