In [1]:
import pandas as pd
import asyncio
import aiohttp
import nest_asyncio
from geopy.geocoders import Nominatim
import time
import random

# Patch asyncio so an event loop can be entered while another one is running
nest_asyncio.apply()

# Read the source dataset into the working DataFrame
df = pd.read_csv(filepath_or_buffer='global_air_pollution_data.csv')

In [3]:
# Geopy Nominatim client, identified by a custom user agent string
geolocator = Nominatim(user_agent="my_geocoder_app_v1")

# Memo of lookups that were already answered, keyed by city name
cache = dict()

# Gate that lets at most 5 requests be in flight at the same time.
# NOTE(review): the public Nominatim usage policy asks for no more than
# 1 request per second -- confirm this level of concurrency is acceptable.
semaphore = asyncio.Semaphore(value=5)

# Asynchronous function to check if the name is an actual city
async def is_actual_city(session, city_name, retries=3):
    """Ask the Nominatim search endpoint whether ``city_name`` is a city.

    Args:
        session: An object exposing an aiohttp-style ``get(url, params=...,
            headers=...)`` that returns an async context manager yielding a
            response with ``.status`` and an awaitable ``.json()``.
        city_name: The name to look up. It is converted with ``str()`` before
            being sent, so non-string values are queried by their string form.
        retries: Maximum number of attempts before giving up.

    Returns:
        ``True`` if any returned item has ``'city'`` as a substring of its
        ``'type'`` field; ``False`` if none does, if the result list is empty,
        or if every attempt failed.

    Side effects:
        Definitive answers (HTTP 200) are stored in the module-level ``cache``.
        A lookup that exhausts its retries is *not* cached, so a later call can
        try again instead of permanently reporting ``False`` after an outage.
        Each failed attempt is reported with ``print``.
    """
    if city_name in cache:
        return cache[city_name]

    for attempt in range(retries):
        try:
            async with semaphore:  # Limit the number of concurrent requests
                # Pass the query through ``params`` so the name is URL-encoded;
                # characters such as '&' or '#' would otherwise corrupt the query.
                async with session.get(
                    'https://nominatim.openstreetmap.org/search',
                    params={'q': str(city_name), 'format': 'json'},
                    # Nominatim's usage policy requires an identifying User-Agent.
                    headers={'User-Agent': 'my_geocoder_app_v1'},
                ) as response:
                    if response.status == 200:
                        location = await response.json()
                        # An empty result list is a definitive "not a city".
                        result = bool(location) and any(
                            'city' in item.get('type', '') for item in location
                        )
                        cache[city_name] = result
                        return result
                    # Non-200 (throttling, server error, ...) is a failed
                    # attempt: route it to the retry/backoff path below.
                    raise RuntimeError(f"HTTP {response.status}")
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {city_name}: {e}")
            if attempt < retries - 1:
                # Back off outside the semaphore so other lookups can proceed
                await asyncio.sleep(random.uniform(2, 5))

    # All attempts failed: report False but leave the cache untouched.
    return False

async def main():
    """Flag which rows of ``df`` name an actual city and save the matching rows.

    Every distinct, non-missing value of ``df['city_name']`` is checked once
    with ``is_actual_city``. The outcome is written to a boolean
    ``'Is_Actual_City'`` column on the module-level ``df``; rows whose flag is
    true are written (without that helper column) to ``'cleaned_file.csv'``.

    Rows with a missing city name are flagged ``False`` instead of being sent
    to the geocoder, so the mask is always strictly boolean.

    Returns:
        None.
    """
    # Compute the distinct names once; skip missing values so NaN is never
    # looked up as the literal string "nan".
    city_names = df['city_name'].dropna().unique()

    # Keep the HTTP session open only for the duration of the lookups
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *(is_actual_city(session, city) for city in city_names)
        )

    # gather() preserves input order, so names and results line up pairwise
    actual_cities = [name for name, ok in zip(city_names, results) if ok]

    # isin() yields a pure boolean column (missing names become False)
    df['Is_Actual_City'] = df['city_name'].isin(actual_cities)

    # Keep only confirmed cities and drop the helper column before saving
    df_cleaned = df[df['Is_Actual_City']].drop(columns=['Is_Actual_City'])
    df_cleaned.to_csv('cleaned_file.csv', index=False)

# Run the asynchronous main function to completion.
# NOTE(review): if the kernel already has a running event loop, this call
# presumably relies on the nest_asyncio.apply() patch made in the first cell.
asyncio.run(main())

Attempt 1 failed for Praskoveya: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Priolo Gargallo: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Presidente Dutra: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Punaauia: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Przasnysz: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Punta Gorda: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call failed ('184.104.226.109', 443)]
Attempt 1 failed for Puurs: Cannot connect to host nominatim.openstreetmap.org:443 ssl:default [Connect call 

KeyboardInterrupt: 

In [3]:
# unique_countries = df['country_name'].nunique()
# print(f"Number of unique countries: {unique_countries}")

# # Initialize the geocoder
# geolocator = Nominatim(user_agent="Hatem2")

# # Cache for already retrieved coordinates
# cache = {}

# # Function to get lat/long with caching and rate limiting
# def get_lat_long(country):
#     if country in cache:
#         return cache[country]
    
#     time.sleep(1)  # Rate limiting (1 request per second)
    
#     location = geolocator.geocode(country)
#     if location:
#         lat_long = (location.latitude, location.longitude)
#         cache[country] = lat_long  # Store in cache
#         return lat_long
#     else:
#         return None, None

# # Apply function to the country column
# df[['Latitude', 'Longitude']] = df['country_name'].apply(lambda x: pd.Series(get_lat_long(x)))

# # Save to a new CSV
# df.to_csv('updated_file.csv', index=False)