<font color=#1DA1F2>

- **Batch Geocoding:** Nominatim (OpenStreetMap) does not natively support batch geocoding through a single API request. Each geocoding request must be made individually. If you need to perform batch geocoding, you would typically need to loop through multiple requests programmatically, adhering to the service's rate limits.
- **Rate Limits:**
The public Nominatim service has a rate limit of 1 request per second (or 3600 requests per hour) per IP address. This limit can vary if you're using a self-hosted instance or a paid service that offers higher limits

In [1]:
# import libraries
import pandas as pd
from geopy.geocoders import Nominatim 
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time
from tqdm import tqdm

# Load the CSV file into a DataFrame
df = pd.read_csv('data/npidata_pfile.csv')

# Extract the first 5 digits of the zip code ,ensure that the zip code is an integer
df['Zip Code'] = df['Zip Code'].apply(lambda x: int(x / 10**(len(str(int(x))) - 5)) if not pd.isna(x) else x).astype('Int64')
# combine the names provider to create a full name column, ensure that the names are strings and it should be last name, first name middle name format 
df['Full Name'] = df['Provider Last Name'] + ', ' + df['Provider First Name'] + ' ' + df['Provider Middle Name']

df=df[['NPI','Full Name','Street','City','State','Zip Code']]
# drop rows with missing values of street 1,full name
df = df.dropna(subset=['Street','Full Name'])
# drop duplicates
df = df.drop_duplicates()
# print(df.shape)
# randomly sample 1000 rows
df = df.sample(1000)
df.sample(15)


Unnamed: 0,NPI,Full Name,Street,City,State,Zip Code
26181,1104559293,"SARRAZOLLA, SERRIA KATHLEEN",7210 W BARRISTER DR,BOISE,ID,83704
4349,1194216952,"MURPHY, CAMERON S",20041 RIVERSIDE COMMONS PLAZA,ASHBURN,VA,20147
26278,1073177341,"STEWART, NICOLE ANN",701 HEWITT BLVD,RED WING,MN,55066
7030,1699506436,"BOGLE, ELIZABETH SHELDRICK",4960 CORPORATE DR NW STE 135H,HUNTSVILLE,AL,35805
3235,1205135522,"ODELL, ELIZABETH PHAM",2778 TEAYS VALLEY RD,HURRICANE,WV,25526
50,1558192690,"HOLLENBACH, DENISE MICHELLE",400 TUSCARAWAS ST W,CANTON,OH,44702
28147,1639269848,"ALBERTSON, JENNIE H.",20 BRIMBAL AVE,BEVERLY,MA,19151
3649,1447089347,"LEFEBVRE, HEATHER L",301 MALLORY STATION RD STE 110,FRANKLIN,TN,37067
2268,1417788456,"TORRENTES, RANDY DAVID",1191 CENTRAL BLVD STE A,BRENTWOOD,CA,94513
28829,1902868789,"PARAS, RODERICK ROPHEO LAZO",44151 15TH ST W STE 101,LANCASTER,CA,93534


In [2]:
# Initialize the geolocator
geolocator = Nominatim(user_agent="my_app")

# Function to get latitude and longitude with error handling and retries
def get_lat_long(address, max_retries=3):
    for attempt in range(max_retries):
        try:
            location = geolocator.geocode(address)
            if location:
                return location.latitude, location.longitude
            else:
                return None, None
        except (GeocoderTimedOut, GeocoderServiceError):
            if attempt < max_retries - 1:
                time.sleep(1)  # Wait for 1 second before retrying
            else:
                return None, None
        except Exception:
            return None, None

# Create new column for Clean_address
df['Clean_address'] = df.apply(lambda row: f"{row['Street']}, {row['City']}, {row['State']} {row['Zip Code']} ", axis=1)

# Batch processing function
def process_batch(batch):
    results = []
    for address in batch:
        lat, lon = get_lat_long(address)
        results.append((lat, lon))
        time.sleep(1)  # 1 second delay between each request
    return results

# Set batch size and calculate number of batches
batch_size = 10
num_batches = len(df) // batch_size + (1 if len(df) % batch_size != 0 else 0)

# Process batches
all_results = []
for i in tqdm(range(num_batches), desc="Processing batches"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df))
    batch = df['Clean_address'][start_idx:end_idx]
    batch_results = process_batch(batch)
    all_results.extend(batch_results)
    time.sleep(2)  # 2 seconds delay between batches

# Add results to DataFrame
df['latitude'], df['longitude'] = zip(*all_results)

# Save the updated DataFrame to a new CSV file
df.to_csv('data/npidata_pfile_OpenStreetMap_with_coordinates.csv', index=False)
df.sample(25)

Processing batches:   3%|▎         | 3/100 [01:14<40:04, 24.78s/it]