In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from time import sleep

# Create the Nominatim geocoder client used by geocode_address below.
# timeout=10 is passed through to geopy as the per-request timeout.
# NOTE(review): "geocoder" is a very generic user agent; Nominatim's usage
# policy asks for an application-specific identifier - confirm and replace.
geolocator = Nominatim(user_agent="geocoder", timeout=10)

# Load the property records; the 'address' column is read by batch_geocode.
data = pd.read_csv('../data/curated/property/property 2.csv')

# Create a function to handle address geocoding and handle retries
def geocode_address(address, retries=3):
    """Geocode a single address with the module-level ``geolocator``.

    Args:
        address: Free-text address to look up. Missing values (``None``,
            NaN or any other non-string) and blank strings are never sent
            to the geocoder.
        retries: Maximum number of attempts when the lookup raises.

    Returns:
        A ``(latitude, longitude)`` tuple for the match, or ``(None, None)``
        when the address is missing/blank, the geocoder finds nothing, or
        every attempt raised.
    """
    # Empty CSV cells are loaded by pandas as float NaN. Such values (and
    # blank strings) cannot be geocoded meaningfully, so skip the request
    # and the retry delays entirely.
    if not isinstance(address, str) or not address.strip():
        return None, None

    for attempt in range(retries):
        try:
            location = geolocator.geocode(address)
            if location:
                return location.latitude, location.longitude
            # A clean "no match" answer is final; retrying would not help.
            return None, None
        except Exception as e:
            # Deliberate best-effort boundary: any failure counts as one
            # failed attempt and is reported rather than raised.
            print(f"Error geocoding {address}: {e}, retrying ({attempt+1}/{retries})...")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < retries:
                sleep(2)  # Wait a few seconds between each retry
    return None, None

# Batch handling function
def batch_geocode(data, batch_size=100, sleep_time=1):
    """Geocode the 'address' column of ``data`` in consecutive row batches.

    Args:
        data: Table whose 'address' column holds the addresses; it is
            sliced by row position into chunks of ``batch_size``.
        batch_size: Number of rows handled per batch.
        sleep_time: Seconds to pause after every geocoding request.

    Returns:
        Two lists ``(latitudes, longitudes)`` with one entry per row, in
        row order. Entries are whatever ``geocode_address`` returned.
    """
    lat_values = []
    lon_values = []
    row_count = len(data)

    # Batches are numbered from 1 purely for the progress messages.
    for batch_no, start in enumerate(range(0, row_count, batch_size), start=1):
        chunk = data[start:start + batch_size]
        print(f"Process data in batches {batch_no} ")

        for addr in chunk['address']:
            found_lat, found_lon = geocode_address(addr)
            lat_values.append(found_lat)
            lon_values.append(found_lon)
            # Throttle after every request to respect Nominatim's rate limit.
            sleep(sleep_time)

        print(f" {batch_no} batches processing ove.")

    return lat_values, lon_values

batch_size = 100  # Each batch processes 100 addresses
# Geocode every row, pausing 2 seconds after each request.
latitudes, longitudes = batch_geocode(data, batch_size=batch_size, sleep_time=2)

# Add geocoding results to the dataset (one value per row, in row order)
data['latitude'] = latitudes
data['longitude'] = longitudes

# Save the first-pass results; rows that could not be geocoded keep empty
# latitude/longitude values.
csv_output_path = '../data/curated/property/property_addresses_to_lat_lng_2.csv'
data.to_csv(csv_output_path, index=False)

print(f"All address data has been saved to {csv_output_path}")


In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
import re
from time import sleep

# Create the Nominatim geocoder client used by geocode_address below.
# timeout=10 is passed through to geopy as the per-request timeout.
# NOTE(review): "geocoder" is a very generic user agent; Nominatim's usage
# policy asks for an application-specific identifier - confirm and replace.
geolocator = Nominatim(user_agent="geocoder", timeout=10)

# Load the first-pass output, which already has latitude/longitude columns.
data = pd.read_csv('../data/curated/property/property_addresses_to_lat_lng_2.csv')

def geocode_address(address, retries=3):
    """Geocode a single address with the module-level ``geolocator``.

    Args:
        address: Free-text address to look up. Missing values (``None``,
            NaN or any other non-string) and blank strings are never sent
            to the geocoder.
        retries: Maximum number of attempts when the lookup raises.

    Returns:
        A ``(latitude, longitude)`` tuple for the match, or ``(None, None)``
        when the address is missing/blank, the geocoder finds nothing, or
        every attempt raised.
    """
    # An address can be NaN (empty CSV cell) or become an empty string once
    # digits and symbols are stripped from it. Neither can be geocoded
    # meaningfully, so skip the request and the retry delays entirely.
    if not isinstance(address, str) or not address.strip():
        return None, None

    for attempt in range(retries):
        try:
            location = geolocator.geocode(address)
            if location:
                return location.latitude, location.longitude
            # A clean "no match" answer is final; retrying would not help.
            return None, None
        except Exception as e:
            # Deliberate best-effort boundary: any failure counts as one
            # failed attempt and is reported rather than raised.
            print(f"Error geocoding {address}: {e}, retrying ({attempt+1}/{retries})...")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < retries:
                sleep(2)  # Each retry interval is 2 seconds
    return None, None

# Remove numbers and symbols from the address
def remove_numbers(address):
    """Strip everything except ASCII letters and whitespace from an address.

    Args:
        address: Address text. Non-string values (for example ``None`` or
            the float NaN pandas uses for empty CSV cells) are treated as
            "no address".

    Returns:
        The address with digits, punctuation and any other character that
        is not ``a-z``/``A-Z``/whitespace removed, and with leading and
        trailing whitespace trimmed. Returns ``''`` for non-string input.
    """
    # re.sub raises TypeError on non-strings, which would abort a whole
    # ``Series.apply`` over a column containing missing values.
    if not isinstance(address, str):
        return ''
    return re.sub(r'[^a-zA-Z\s]', '', address).strip()

# Filter out addresses without latitude and longitude
missing_coords = data['latitude'].isna() | data['longitude'].isna()
no_lat_lng = data[missing_coords].copy()

# Remove numbers and symbols from these addresses
no_lat_lng['cleaned_address'] = no_lat_lng['address'].apply(remove_numbers)

# The processed address is geocoded.
# An explicit loop is used instead of ``zip(*series.apply(...))`` for two
# reasons: unpacking an empty zip raises ValueError when no row is missing
# coordinates, and the loop lets us pause between requests.
retry_latitudes = []
retry_longitudes = []
for cleaned_address in no_lat_lng['cleaned_address']:
    lat, lon = geocode_address(cleaned_address)
    retry_latitudes.append(lat)
    retry_longitudes.append(lon)
    sleep(2)  # Same per-request pause as the first geocoding pass

no_lat_lng['latitude'] = retry_latitudes
no_lat_lng['longitude'] = retry_longitudes

# Copy the newly found coordinates back by index; DataFrame.update skips
# missing values, so rows that still failed stay empty in ``data``.
data.update(no_lat_lng[['latitude', 'longitude']])

csv_output_path = '../data/curated/property/property_addresses_to_lat_lng_2_improve.csv'
data.to_csv(csv_output_path, index=False)

print(f"All address data has been saved to {csv_output_path}")

In [None]:
import pandas as pd

# Load the output of the second geocoding pass.
file_path = '../data/curated/property/property_addresses_to_lat_lng_2_improve.csv'  
data = pd.read_csv(file_path)

# Drop every row where latitude or longitude is missing
# (dropna with a subset removes a row if any listed column is null).
cleaned_data = data.dropna(subset=['latitude', 'longitude'])

# Write the remaining rows to the final file, without the index column.
output_file_path = '../data/curated/property/property_addresses_to_lat_lng_2_final.csv'  
cleaned_data.to_csv(output_file_path, index=False)

print("Rows with empty latitude and longitude have been deleted, and the cleared data has been saved.")
