In [1]:
#import lib and read the data
import numpy as npy
import pandas as pd
df = pd.read_csv('cleaned_meteorite_landing_from_1950.csv')
df.head()

Unnamed: 0,name,id,type,classification,mass,fall,year,latitude,longitude
0,Haven,11858,Valid,H6,6100.0,Found,1950.0,37.96417,-97.75583
1,St. Louis,23089,Valid,H4,1000.0,Fell,1950.0,38.7,-90.23333
2,Arroyo Aguiar,2340,Valid,H5,7450.0,Fell,1950.0,-31.41667,-60.66667
3,Plainview (1950),18842,Valid,H,2200.0,Found,1950.0,34.11667,-101.78333
4,Santa Rosalia,23168,Valid,"Pallasite, PMG",1631.0,Found,1950.0,27.33333,-112.33333


due to the api limits of OpenCage(2,500 requests one day), I choose to use another library
also to avoid rate limiting, the code takes a long time to run, but dont be worried since it will output the history


In [7]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import pandas as pd
import time

# ini Nominatim geolocator
geolocator = Nominatim(user_agent="my_geocoding_project")  # change to your own user agent

#   check if the dataset contains latitude and longitude columns
if 'latitude' not in df.columns or 'longitude' not in df.columns:
    raise ValueError("Dataset must contain 'latitude' and 'longitude' columns.")

# function to get country name from latitude and longitude
def get_country(lat, lon):
    try:
        location = geolocator.reverse((lat, lon), language='en', timeout=10)
        if location and 'country' in location.raw['address']:
            return location.raw['address']['country']  # substract country name
        else:
            return 'Unknown'
    except GeocoderTimedOut:
        return 'TimedOut'
    except GeocoderServiceError as e:
        return 'ServiceError'
    except Exception as e:
        return 'Error'

# initialize new column for country
total_rows = len(df)
df['country'] = None

# iterate over rows and get country
for index, row in enumerate(df.itertuples(), start=1):
    try:
        # get country name
        country = get_country(row.latitude, row.longitude)
        df.at[row.Index, 'country'] = country
        
        # print progress
        print(f"Processed {index}/{total_rows}: ({row.latitude}, {row.longitude}) -> {country}")
    except Exception as e:
        print(f"Error processing row {index}: {e}")
    
    # sleep for 1 second to avoid rate limiting
    time.sleep(1)

# save results to CSV
df.to_csv('classified_with_nominatim.csv', index=False)

print("Processing complete. Results saved to 'classified_with_nominatim.csv'.")


Processed 1/36459: (37.96417, -97.75583) -> United States
Processed 2/36459: (38.7, -90.23333) -> United States
Processed 3/36459: (-31.41667, -60.66667) -> Argentina
Processed 4/36459: (34.11667, -101.78333) -> United States
Processed 5/36459: (27.33333, -112.33333) -> Mexico
Processed 6/36459: (56.13333, 77.26667) -> Russia
Processed 7/36459: (38.01667, -8.25) -> Portugal
Processed 8/36459: (27.5, -99.5) -> Mexico
Processed 9/36459: (46.1, 12.35) -> Italy
Processed 10/36459: (35.83333, -97.93333) -> United States
Processed 11/36459: (33.75, -98.83333) -> United States
Processed 12/36459: (38.525, -95.78056) -> United States
Processed 13/36459: (29.5, -98.0) -> United States
Processed 14/36459: (34.1, 132.08333) -> Japan
Processed 15/36459: (-36.8, 143.55) -> Australia
Processed 16/36459: (25.91667, 86.36667) -> India
Processed 17/36459: (36.08333, -84.2) -> United States
Processed 18/36459: (-19.53333, -48.56667) -> Brazil
Processed 19/36459: (39.56583, -99.64722) -> United States
Pr

In [8]:
df.head()

Unnamed: 0,name,id,type,classification,mass,fall,year,latitude,longitude,country
0,Haven,11858,Valid,H6,6100.0,Found,1950.0,37.96417,-97.75583,United States
1,St. Louis,23089,Valid,H4,1000.0,Fell,1950.0,38.7,-90.23333,United States
2,Arroyo Aguiar,2340,Valid,H5,7450.0,Fell,1950.0,-31.41667,-60.66667,Argentina
3,Plainview (1950),18842,Valid,H,2200.0,Found,1950.0,34.11667,-101.78333,United States
4,Santa Rosalia,23168,Valid,"Pallasite, PMG",1631.0,Found,1950.0,27.33333,-112.33333,Mexico


In [None]:
#merge the two dataframes
population = pd.read_csv('population_by_country_2020.csv')
df = df.merge(population, on='country', how='left')
df.head()