In [1]:
import pandas as pd
import requests
import time


In [2]:
# Step 1: Read the raw property listings from their Parquet file into a DataFrame
raw_property_path = '../../data/raw/property_details.parquet'
property_df = pd.read_parquet(raw_property_path)


In [3]:
# Drop rows where price = None (Sometimes not given on website)
property_df = property_df[property_df['price'].notna()]

# Drop nbn_details, property_features columns (often left empty on website).
# errors='ignore' makes this cell safe to re-run: without it, a second run
# raises KeyError because the columns have already been removed.
property_df = property_df.drop(
    columns=['nbn_details', 'property_features'],
    errors='ignore',
)

# View rows that still contain at least one missing value
property_df[property_df.isna().any(axis=1)]

Unnamed: 0,title,description,street_address,suburb,postcode,price,bedrooms,bathrooms,parking,primary_property_type,structured_features,video_count,photo_count,date_listed,days_listed,floor_plans_count,virtual_tour,nearby_schools


In [4]:
# Step 2: Define the batch size (20 properties per batch)
batch_size = 20

# Ceiling division: number of batches needed to cover every row.
# (`len // batch_size + 1` produced one extra, empty batch whenever the row
# count was an exact multiple of batch_size, and 1 batch for an empty frame.)
num_batches = (len(property_df) + batch_size - 1) // batch_size


In [5]:
# Function to geocode addresses using Nominatim with User-Agent header
def geocode_address(address, timeout=10):
    """Look up an address with the Nominatim search endpoint.

    Parameters
    ----------
    address : str
        Free-form address, sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    tuple
        ``(lat, lon)`` -- the values of the ``'lat'`` and ``'lon'`` keys of
        the first search result, passed through unchanged. ``(None, None)``
        is returned when the address is missing/blank, the request fails,
        the status code is not 200, the body is not valid JSON, or the
        search returned no results.
    """
    # Nothing to look up: skip the HTTP round trip for missing/blank input
    if not isinstance(address, str) or not address.strip():
        return None, None

    base_url = 'https://nominatim.openstreetmap.org/search'
    params = {
        'q': address,  # Property address
        'format': 'json',
        'limit': 1
    }
    headers = {
        # Add your app name and contact email here
        'User-Agent': 'PropertyLocation/1.0'
    }

    # Send the API request with the User-Agent header. Connection problems
    # and timeouts are reported the same way as HTTP errors below, so a
    # single bad request does not abort a long geocoding run.
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: Request failed for '{address}': {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error: Failed to retrieve data for '{address}'. Status code: {response.status_code}")
        return None, None

    try:
        data = response.json()
    except ValueError:
        # The JSON decode errors raised by response.json() subclass ValueError
        print(f"Error: Invalid JSON received for '{address}'.")
        return None, None

    # An empty result list means the search found no match for the address
    if not data:
        return None, None

    return data[0].get('lat'), data[0].get('lon')

In [6]:
# Start from an empty DataFrame (no rows, no columns); the geocoded results
# are stored under the name `all_geocoded_df`
all_geocoded_df = pd.DataFrame()

In [7]:
# Step 3: Loop through the DataFrame in batches of `batch_size` properties.
# Completed batches are collected in a list and combined once, so re-running
# this cell rebuilds `all_geocoded_df` from scratch instead of appending
# duplicate rows to whatever a previous run left behind.
geocoded_batches = []
preview_columns = ['street_address', 'latitude', 'longitude']

try:
    for batch_num in range(num_batches):
        # Slice out the rows belonging to the current batch
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(property_df))
        property_batch = property_df.iloc[start_idx:end_idx].copy()  # copy so the source frame is untouched

        # The final slice is empty when the row count is an exact multiple of
        # batch_size (or the frame is empty); there is nothing to geocode then.
        if not property_batch.empty:
            latitudes = []
            longitudes = []

            # Step 4: Geocode addresses for the current batch
            for address in property_batch['street_address']:
                lat, lon = geocode_address(address)
                latitudes.append(lat)
                longitudes.append(lon)

                # Respect Nominatim's rate limit of 1 request per second
                time.sleep(1)

            # Add latitude and longitude columns to the current batch using .loc
            property_batch.loc[:, 'latitude'] = latitudes
            property_batch.loc[:, 'longitude'] = longitudes

            geocoded_batches.append(property_batch)
            print(f"Geocoding complete for batch {batch_num}.")

        # Step 5: Preview `street_address`, `latitude` and `longitude` after
        # every 20th batch and after the final batch
        is_checkpoint = (batch_num + 1) % 20 == 0 or batch_num == num_batches - 1
        if geocoded_batches and is_checkpoint:
            preview_df = pd.concat(geocoded_batches, ignore_index=True)
            print(f"DataFrame after {batch_num + 1} batches (address, latitude, longitude):")
            print(preview_df[preview_columns].head())
finally:
    # Runs on normal completion and when the loop is interrupted (e.g. by
    # KeyboardInterrupt), so every fully geocoded batch is kept either way.
    # A batch that was only partly geocoded when interrupted is discarded.
    if geocoded_batches:
        all_geocoded_df = pd.concat(geocoded_batches, ignore_index=True)
    else:
        all_geocoded_df = pd.DataFrame()

Geocoding complete for batch 0.
Geocoding complete for batch 1.
Geocoding complete for batch 2.
Geocoding complete for batch 3.
Geocoding complete for batch 4.
Geocoding complete for batch 5.
Geocoding complete for batch 6.
Geocoding complete for batch 7.
Geocoding complete for batch 8.
Geocoding complete for batch 9.
Geocoding complete for batch 10.
Geocoding complete for batch 11.
Geocoding complete for batch 12.
Geocoding complete for batch 13.
Geocoding complete for batch 14.
Geocoding complete for batch 15.
Geocoding complete for batch 16.
Geocoding complete for batch 17.
Geocoding complete for batch 18.
Geocoding complete for batch 19.
DataFrame after 20 batches (address, latitude, longitude):
                                  street_address             latitude  \
0  60 Little Windrock Lane, Craigieburn VIC 3064           -37.588897   
1              53 Were Street, Brighton VIC 3186            -37.92564   
2           43 Tackle Drive, Point Cook VIC 3030          -37.9062569   

KeyboardInterrupt: 

Approximate runtime: ~4 minutes per 100 properties (batch size of 20).

In [8]:
# Step 6: Write the geocoded rows to one Parquet file (row index is not stored)
curated_output_path = '../../data/curated/location_property_details.parquet'
all_geocoded_df.to_parquet(curated_output_path, index=False)

print("Results saved to 'location_property_details.parquet'.")

Results saved to 'location_property_details.parquet'.


In [9]:
# Load the saved Parquet file back into a DataFrame
curated_input_path = '../../data/curated/location_property_details.parquet'
df = pd.read_parquet(curated_input_path)

In [10]:
# Show the first five rows of the reloaded DataFrame
df.head(n=5)

Unnamed: 0,title,description,street_address,suburb,postcode,price,bedrooms,bathrooms,parking,primary_property_type,structured_features,video_count,photo_count,date_listed,days_listed,floor_plans_count,virtual_tour,nearby_schools,latitude,longitude
0,"60 Little Windrock Lane, Craigieburn VIC 3064 ...","View this 2 bedroom, 1 bathroom rental house a...","60 Little Windrock Lane, Craigieburn VIC 3064",Craigieburn,3064,$450 Per Week,2.0,1.0,1.0,House,"[{'category': 'Indoor', 'name': 'Built in ward...",0.0,21.0,2024-08-22T16:07:26.000,14.0,0.0,False,"[{'address': 'Craigieburn, VIC 3064', 'distanc...",-37.588897,144.9155161
1,"53 Were Street, Brighton VIC 3186 - House For ...","View this $1,500/week 4 bedroom, 2 bathroom re...","53 Were Street, Brighton VIC 3186",Brighton,3186,"$1,490.00",4.0,2.0,2.0,House,[],0.0,6.0,2024-06-02T18:11:41.000,95.0,2.0,True,"[{'address': 'Brighton, VIC 3186', 'distance':...",-37.92564,144.9999037
2,"43 Tackle Drive, Point Cook VIC 3030 - Townhou...","View this 3 bedroom, 2 bathroom rental townhou...","43 Tackle Drive, Point Cook VIC 3030",Point Cook,3030,$550 per Week,3.0,2.0,2.0,Townhouse/Villa,"[{'category': 'Outdoor', 'name': 'Secure Parki...",0.0,17.0,2024-09-03T12:01:18.000,2.0,0.0,True,"[{'address': 'Point Cook, VIC 3030', 'distance...",-37.9062569,144.7202541
3,"3 Rostrevor Parade, Mont Albert VIC 3127 - Hou...","View this 5 bedroom, 2 bathroom rental house a...","3 Rostrevor Parade, Mont Albert VIC 3127",Mont Albert,3127,$800 weekly,5.0,2.0,2.0,House,[],0.0,8.0,2024-07-01T12:53:48.000,66.0,0.0,False,"[{'address': 'Mont Albert, VIC 3127', 'distanc...",-37.8129176,145.1061095
4,"48 Roberts Street, Frankston VIC 3199 - Studio...","View this 9 bedroom, 3 bathroom rental studio ...","48 Roberts Street, Frankston VIC 3199",Frankston,3199,$299 per week,9.0,3.0,4.0,Apartment,"[{'category': 'Indoor', 'name': 'Furnished', '...",0.0,20.0,2024-07-02T11:24:10.000,65.0,1.0,False,"[{'address': 'Frankston, VIC 3199', 'distance'...",-38.154912800000005,145.14040905062515


In [11]:
# Number of rows in the reloaded DataFrame
df.shape[0]

520