In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress

# Import the OpenWeatherMap API key
from api_keys import weather_api_key

# Import citipy to determine the cities based on latitude and longitude
from citipy import citipy

In [3]:
# Generate random latitude and longitude combinations.
# NOTE(review): no seed is set in this cell, so each run samples a different
# set of coordinates (and therefore cities) - consider seeding the generator
# if reproducible results are needed.
lats = np.random.uniform(-90, 90, size=1000)
lngs = np.random.uniform(-180, 180, size=1000)

# Use citipy to determine the nearest city for each lat-lng combination.
# dict.fromkeys() de-duplicates in a single pass while keeping first-seen
# order, replacing the quadratic `city not in cities` scan over a list.
cities = list(dict.fromkeys(
    citipy.nearest_city(lat, lng).city_name
    for lat, lng in zip(lats, lngs)
))

print(f"Generated {len(cities)} unique cities.")

Generated 448 unique cities.


In [4]:
# Base URLs (HTTPS so the API key in the query string is not sent in cleartext)
weather_url = "https://api.openweathermap.org/data/2.5/weather"
air_quality_url = "https://api.openweathermap.org/data/2.5/air_pollution"

# Dictionary to store valid city data with coordinates, keyed by city name
valid_cities = {}

for city in cities:
    try:
        # Look the city up via the weather endpoint. Passing `params` lets
        # requests URL-encode names containing spaces, apostrophes, '&', etc.,
        # and the timeout keeps one stalled connection from hanging the loop
        # (a timeout is reported by the RequestException handler below).
        response = requests.get(
            weather_url,
            params={"q": city, "appid": weather_api_key},
            timeout=10,
        )

        if response.status_code != 200:
            # No `continue` here: falling through guarantees the rate-limit
            # pause at the bottom of the loop also runs for failed lookups.
            print(f"City {city} not found (Status Code: {response.status_code}). Skipping...")
        else:
            # Parse JSON response for coordinates and country code
            data = response.json()
            lat = data['coord']['lat']
            lon = data['coord']['lon']
            country = data['sys']['country']

            # Store in the valid_cities dictionary
            valid_cities[city] = {'city': city, 'lat': lat, 'lon': lon, 'country': country}

            print(f"Retrieved coordinates for {city}: ({lat}, {lon})")

    except KeyError as key_err:
        print(f"KeyError: {key_err} for {city}. Skipping...")
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err} for {city}. Skipping...")

    # Pause to avoid rate limiting (runs after every request, successful or not)
    time.sleep(1)

print(f"Total valid cities with coordinates: {len(valid_cities)}")

Retrieved coordinates for margaret river: (-33.95, 115.0667)
Retrieved coordinates for punta arenas: (-53.15, -70.9167)
Retrieved coordinates for port-aux-francais: (-49.35, 70.2167)
Retrieved coordinates for puerto natales: (-51.7236, -72.4875)
Retrieved coordinates for adamstown: (-25.066, -130.1015)
Retrieved coordinates for arauco: (-37.2463, -73.3175)
Retrieved coordinates for puerto ayora: (-0.7393, -90.3518)
Retrieved coordinates for dalandzadgad: (43.5708, 104.425)
Retrieved coordinates for isafjordur: (66.0755, -23.124)
Retrieved coordinates for hermanus: (-34.4187, 19.2345)
Retrieved coordinates for qaqortoq: (60.7167, -46.0333)
Retrieved coordinates for gierloz: (54.0813, 21.4955)
City port glaud not found (Status Code: 404). Skipping...
Retrieved coordinates for ribeira grande: (38.5167, -28.7)
Retrieved coordinates for lhasa: (29.65, 91.1)
City abraham's bay not found (Status Code: 404). Skipping...
Retrieved coordinates for mont-dore: (-22.2833, 166.5833)
Retrieved coordi

In [5]:
# Show the collected records (city name -> city, lat, lon, country) for inspection
print(valid_cities)

{'margaret river': {'city': 'margaret river', 'lat': -33.95, 'lon': 115.0667, 'country': 'AU'}, 'punta arenas': {'city': 'punta arenas', 'lat': -53.15, 'lon': -70.9167, 'country': 'CL'}, 'port-aux-francais': {'city': 'port-aux-francais', 'lat': -49.35, 'lon': 70.2167, 'country': 'TF'}, 'puerto natales': {'city': 'puerto natales', 'lat': -51.7236, 'lon': -72.4875, 'country': 'CL'}, 'adamstown': {'city': 'adamstown', 'lat': -25.066, 'lon': -130.1015, 'country': 'PN'}, 'arauco': {'city': 'arauco', 'lat': -37.2463, 'lon': -73.3175, 'country': 'CL'}, 'puerto ayora': {'city': 'puerto ayora', 'lat': -0.7393, 'lon': -90.3518, 'country': 'EC'}, 'dalandzadgad': {'city': 'dalandzadgad', 'lat': 43.5708, 'lon': 104.425, 'country': 'MN'}, 'isafjordur': {'city': 'isafjordur', 'lat': 66.0755, 'lon': -23.124, 'country': 'IS'}, 'hermanus': {'city': 'hermanus', 'lat': -34.4187, 'lon': 19.2345, 'country': 'ZA'}, 'qaqortoq': {'city': 'qaqortoq', 'lat': 60.7167, 'lon': -46.0333, 'country': 'GL'}, 'gierloz':

In [6]:
# Collect the name of every city that resolved to coordinates, in insertion order
city_names = [name for name in valid_cities]
print(city_names)

['margaret river', 'punta arenas', 'port-aux-francais', 'puerto natales', 'adamstown', 'arauco', 'puerto ayora', 'dalandzadgad', 'isafjordur', 'hermanus', 'qaqortoq', 'gierloz', 'ribeira grande', 'lhasa', 'mont-dore', 'kingman', 'yuzhno-kurilsk', 'new norfolk', 'namtsy', 'invercargill', 'edinburgh of the seven seas', 'wenchang', 'antsiranana', 'talcahuano', 'tazacorte', 'beberibe', 'albany', 'kirakira', 'sola', 'winkler', 'waitangi', 'las khorey', 'sittwe', 'blackmans bay', 'estebania', 'grytviken', "st. john's", 'tarawa', 'yellowknife', 'kirensk', 'turkeli', 'punta del este', 'georgetown', 'labrador city', 'silvis', 'plettenberg bay', 'ancud', 'vadso', 'college', 'whitehorse', 'longyearbyen', 'kenai', 'kristiansund', 'igarka', 'badger', 'san julian', 'kodiak', 'iqaluit', 'avarua', 'port augusta', 'lazaro cardenas', 'guatrache', 'talnakh', 'tchintabaraden', 'altai', 'selfoss', 'saipan', 'okondja', 'nemuro', 'thompson', 'katsuura', 'ushuaia', 'casablanca', 'rumoi', 'tarakan', 'san patri

In [7]:
# Output column name -> key used inside the API response's `components` object
pollutant_keys = {
    'PM2.5': 'pm2_5',
    'PM10': 'pm10',
    'NO2': 'no2',
    'O3': 'o3',
    'SO2': 'so2',
    'CO': 'co',
    'NH3': 'nh3',
}

# List to store air quality data (one dict per city)
air_quality_data = []

# Loop through each city to get pollution data
for city, info in valid_cities.items():
    lat = info['lat']
    lon = info['lon']

    try:
        # Query the Air Pollution API. The timeout prevents a stalled
        # connection from blocking the loop, and raise_for_status() turns an
        # HTTP error into a descriptive exception instead of an opaque
        # KeyError on the missing 'list' field.
        pollution_http = requests.get(
            air_quality_url,
            params={'lat': lat, 'lon': lon, 'appid': weather_api_key},
            timeout=10,
        )
        pollution_http.raise_for_status()
        pollution_response = pollution_http.json()

        # Extract pollution components; a pollutant absent from the response
        # is recorded as None.
        components = pollution_response['list'][0]['components']

        # Store the pollution data
        record = {'City': city, 'Country': info['country']}
        record.update({column: components.get(key) for column, key in pollutant_keys.items()})
        air_quality_data.append(record)

        print(f"Retrieved pollution data for {city}, {info['country']}")

    except Exception as e:
        # Best-effort: report the failure and move on to the next city
        print(f"Error retrieving data for {city}: {e}")

    # Pause to avoid rate limiting
    time.sleep(1)

# Convert the collected data into a DataFrame
air_quality_df = pd.DataFrame(air_quality_data)
print(air_quality_df.head())


Retrieved pollution data for margaret river, AU
Retrieved pollution data for punta arenas, CL
Retrieved pollution data for port-aux-francais, TF
Retrieved pollution data for puerto natales, CL
Retrieved pollution data for adamstown, PN
Retrieved pollution data for arauco, CL
Retrieved pollution data for puerto ayora, EC
Retrieved pollution data for dalandzadgad, MN
Retrieved pollution data for isafjordur, IS
Retrieved pollution data for hermanus, ZA
Retrieved pollution data for qaqortoq, GL
Retrieved pollution data for gierloz, PL
Retrieved pollution data for ribeira grande, PT
Retrieved pollution data for lhasa, CN
Retrieved pollution data for mont-dore, NC
Retrieved pollution data for kingman, US
Retrieved pollution data for yuzhno-kurilsk, RU
Retrieved pollution data for new norfolk, AU
Retrieved pollution data for namtsy, RU
Retrieved pollution data for invercargill, NZ
Retrieved pollution data for edinburgh of the seven seas, SH
Retrieved pollution data for wenchang, CN
Retrieved 

In [8]:
# Write the per-city pollution readings to CSV (row index omitted)
air_quality_df.to_csv('Resources/air_quality_data.csv', index=False)

In [9]:
# Create a dictionary to store city-country pairs.
# The country code of every valid city was already captured in `valid_cities`
# when its coordinates were looked up, so it is reused here instead of issuing
# a second API request (plus a one-second pause) for each city.
city_country_map = {}

for city in city_names:
    try:
        country = valid_cities[city]['country']
    except KeyError as key_err:
        print(f"KeyError: {key_err} for {city}. Skipping...")
        continue

    # Store the city and corresponding country code
    city_country_map[city] = country

    # Print city and country
    print(f"City: {city}, Country: {country}")

print(city_country_map)


City: margaret river, Country: AU
City: punta arenas, Country: CL
City: port-aux-francais, Country: TF
City: puerto natales, Country: CL
City: adamstown, Country: PN
City: arauco, Country: CL
City: puerto ayora, Country: EC
City: dalandzadgad, Country: MN
City: isafjordur, Country: IS
City: hermanus, Country: ZA
City: qaqortoq, Country: GL
City: gierloz, Country: PL
City: ribeira grande, Country: PT
City: lhasa, Country: CN
City: mont-dore, Country: NC
City: kingman, Country: US
City: yuzhno-kurilsk, Country: RU
City: new norfolk, Country: AU
City: namtsy, Country: RU
City: invercargill, Country: NZ
City: edinburgh of the seven seas, Country: SH
City: wenchang, Country: CN
City: antsiranana, Country: MG
City: talcahuano, Country: CL
City: tazacorte, Country: ES
City: beberibe, Country: BR
City: albany, Country: US
City: kirakira, Country: SB
City: sola, Country: VU
City: winkler, Country: CA
City: waitangi, Country: NZ
City: las khorey, Country: SO
City: sittwe, Country: MM
City: black

In [10]:
# Report how many cities ended up with a country code in the map
print(f"Total cities: {len(city_country_map)}")

Total cities: 427


In [11]:
# Flatten the mapping into (city, country) rows
data = [(city_name, country_code) for city_name, country_code in city_country_map.items()]

# Build a two-column frame from those rows
df = pd.DataFrame(data, columns=['City', 'Country'])

# Persist it as UTF-8 CSV without the row index
df.to_csv(
    'Resources/city_country_map.csv',
    index=False,
    encoding='utf-8',
)

In [12]:
# Collect the distinct country codes that appear in the city-country map
unique_countries = {country_code for country_code in city_country_map.values()}

# Report how many there are, then list them
print(f"Total unique countries: {len(unique_countries)}")
print(unique_countries)


Total unique countries: 115
{'GR', 'KI', 'MX', 'PN', 'OM', 'NZ', 'TK', 'RE', 'PG', 'RU', 'TD', 'ZA', 'KE', 'TM', 'EC', 'AO', 'RO', 'GY', 'CC', 'GS', 'PR', 'MU', 'CG', 'GH', 'MM', 'SA', 'MP', 'SB', 'US', 'CX', 'IN', 'CF', 'CU', 'GQ', 'ID', 'PL', 'GA', 'CD', 'MG', 'FM', 'DK', 'JP', 'WF', 'ES', 'UY', 'DZ', 'PE', 'CL', 'CA', 'NU', 'GL', 'LY', 'VU', 'MH', 'PF', 'CV', 'NG', 'IE', 'BY', 'MN', 'SJ', 'LB', 'NP', 'TF', 'GB', 'UZ', 'MY', 'MW', 'AU', 'GU', 'VE', 'CO', 'SO', 'CM', 'VN', 'SH', 'TO', 'BD', 'ET', 'AS', 'NE', 'PK', 'NA', 'GF', 'BO', 'KZ', 'IQ', 'PW', 'MV', 'PH', 'PT', 'BR', 'SS', 'CK', 'LR', 'TR', 'IR', 'CN', 'KR', 'TG', 'FR', 'UA', 'MA', 'NO', 'IS', 'EG', 'FO', 'AR', 'TZ', 'CI', 'NC', 'DO', 'SE', 'YE', 'DE'}


In [13]:
import pycountry

# WHO API: Get all available countries and their alpha-3 codes.
# The timeout prevents an unresponsive server from hanging the cell, and
# raise_for_status() surfaces an HTTP error directly instead of a confusing
# failure when the body is parsed or 'value' is looked up.
who_url = "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues"
who_http = requests.get(who_url, timeout=30)
who_http.raise_for_status()
who_response = who_http.json()

# Convert WHO countries from alpha-3 to alpha-2 codes so they can be matched
# with the OpenWeather country codes (alpha-2 code -> WHO country title)
who_countries_alpha2 = {}

for entry in who_response['value']:
    alpha_3 = entry['Code']
    country_name = entry['Title']

    # Convert alpha-3 to alpha-2 code using pycountry. When the lookup yields
    # no country object, reading `.alpha_2` raises AttributeError and the
    # entry is skipped (e.g. WHO-specific codes with no ISO equivalent).
    try:
        alpha_2 = pycountry.countries.get(alpha_3=alpha_3).alpha_2
        who_countries_alpha2[alpha_2] = country_name
    except AttributeError:
        print(f"Skipping {alpha_3} - No matching alpha-2 code found.")

print(f"WHO Countries (alpha-2): {who_countries_alpha2}")

# Live view of the alpha-2 codes; used for membership tests when filtering cities
who_country_codes = who_countries_alpha2.keys()
print(f"WHO Country Codes: {list(who_country_codes)}")


Skipping CHI - No matching alpha-2 code found.
Skipping ME1 - No matching alpha-2 code found.
Skipping SDN736 - No matching alpha-2 code found.
Skipping XKX - No matching alpha-2 code found.
WHO Countries (alpha-2): {'AW': 'Aruba', 'AF': 'Afghanistan', 'AO': 'Angola', 'AI': 'Anguilla', 'AL': 'Albania', 'AD': 'Andorra', 'AE': 'United Arab Emirates', 'AR': 'Argentina', 'AM': 'Armenia', 'AS': 'American Samoa', 'AG': 'Antigua and Barbuda', 'AU': 'Australia', 'AT': 'Austria', 'AZ': 'Azerbaijan', 'BI': 'Burundi', 'BE': 'Belgium', 'BJ': 'Benin', 'BQ': 'Bonaire, Saint Eustatius and Saba', 'BF': 'Burkina Faso', 'BD': 'Bangladesh', 'BG': 'Bulgaria', 'BH': 'Bahrain', 'BS': 'Bahamas', 'BA': 'Bosnia and Herzegovina', 'BY': 'Belarus', 'BZ': 'Belize', 'BM': 'Bermuda', 'BO': 'Bolivia (Plurinational State of)', 'BR': 'Brazil', 'BB': 'Barbados', 'BN': 'Brunei Darussalam', 'BT': 'Bhutan', 'BW': 'Botswana', 'CF': 'Central African Republic', 'CA': 'Canada', 'CH': 'Switzerland', 'CL': 'Chile', 'CN': 'China'

In [14]:
# Keep only the (city, country code) pairs whose code also appears in the WHO data
filtered_cities = dict(
    filter(
        lambda pair: pair[1] in who_country_codes,
        city_country_map.items(),
    )
)

print(f"Cities with matching WHO data: {len(filtered_cities)}")
print(filtered_cities)


Cities with matching WHO data: 420
{'margaret river': 'AU', 'punta arenas': 'CL', 'puerto natales': 'CL', 'arauco': 'CL', 'puerto ayora': 'EC', 'dalandzadgad': 'MN', 'isafjordur': 'IS', 'hermanus': 'ZA', 'qaqortoq': 'GL', 'gierloz': 'PL', 'ribeira grande': 'PT', 'lhasa': 'CN', 'mont-dore': 'NC', 'kingman': 'US', 'yuzhno-kurilsk': 'RU', 'new norfolk': 'AU', 'namtsy': 'RU', 'invercargill': 'NZ', 'edinburgh of the seven seas': 'SH', 'wenchang': 'CN', 'antsiranana': 'MG', 'talcahuano': 'CL', 'tazacorte': 'ES', 'beberibe': 'BR', 'albany': 'US', 'kirakira': 'SB', 'sola': 'VU', 'winkler': 'CA', 'waitangi': 'NZ', 'las khorey': 'SO', 'sittwe': 'MM', 'blackmans bay': 'AU', 'estebania': 'DO', "st. john's": 'CA', 'tarawa': 'KI', 'yellowknife': 'CA', 'kirensk': 'RU', 'turkeli': 'TR', 'punta del este': 'UY', 'georgetown': 'MY', 'labrador city': 'CA', 'silvis': 'US', 'plettenberg bay': 'ZA', 'ancud': 'CL', 'vadso': 'NO', 'college': 'US', 'whitehorse': 'CA', 'kenai': 'US', 'kristiansund': 'NO', 'igark

In [16]:
# WHO API: indicator AIR_42 (described here as air pollution attributable DALYs).
# NOTE(review): the value is stored below under 'MortalityRate_per_100k',
# which does not match the "DALYs" description - confirm what AIR_42 measures.
who_url = "https://ghoapi.azureedge.net/api/AIR_42"

# The timeout keeps an unresponsive server from hanging the cell indefinitely
response = requests.get(who_url, timeout=30)

# Check for valid response
if response.status_code == 200:
    data = response.json()['value']

    # Keep only the fields used downstream: the country code (SpatialDim),
    # the year (TimeDim) and the numeric value of the indicator.
    who_data = [
        {
            'Country': entry['SpatialDim'],
            'Year': entry['TimeDim'],
            'MortalityRate_per_100k': entry['NumericValue']
        }
        for entry in data
    ]

    df_who = pd.DataFrame(who_data)
    print(df_who.head())

    df_who.to_csv('Resources/who_air_pollution_dalys.csv', index=False)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


  Country  Year  MortalityRate_per_100k
0     CIV  2015                  40.103
1     SOM  2011                  53.318
2     GRD  2012                   0.567
3     BRN  2015                   1.440
4     HND  2016                  32.081


In [17]:
def iso3_to_iso2(iso3_code):
    """Translate an ISO alpha-3 country code into its alpha-2 form.

    Returns the alpha-2 string reported by pycountry, or None when the lookup
    ends in an AttributeError (for instance when no country matches the code).
    """
    try:
        match = pycountry.countries.get(alpha_3=iso3_code)
        iso2_code = match.alpha_2
    except AttributeError:
        iso2_code = None
    return iso2_code

# Reload the WHO indicator rows saved earlier (country codes here are ISO-3)
df_who = pd.read_csv('Resources/who_air_pollution_dalys.csv')

# Convert WHO's ISO-3 codes to ISO-2
df_who['Country_ISO2'] = df_who['Country'].apply(iso3_to_iso2)

# Drop WHO rows whose code has no ISO-2 equivalent
df_who_clean = df_who.dropna(subset=['Country_ISO2'])

# keep_default_na=False stops pandas from reading the ISO-2 code 'NA', which
# occurs among this notebook's country codes, as a missing value; only empty
# fields are treated as NaN.
df_weather = pd.read_csv(
    'Resources/city_country_map.csv',
    keep_default_na=False,
    na_values=[''],
)

# Join city-level pollution readings to WHO rows on the ISO-2 country code.
# Every city row is paired with every WHO row for its country, so the result
# repeats each city's readings once per matching WHO record.
merged_df = pd.merge(air_quality_df, df_who_clean, left_on='Country', right_on='Country_ISO2', how='inner')

print(merged_df.head())

merged_df.to_csv('Resources/merged_air_quality_health.csv', index=False)


             City Country_x  PM2.5   PM10   NO2     O3   SO2      CO  NH3  \
0  margaret river        AU   2.54  10.04  0.15  59.37  0.39  226.97  0.0   
1  margaret river        AU   2.54  10.04  0.15  59.37  0.39  226.97  0.0   
2  margaret river        AU   2.54  10.04  0.15  59.37  0.39  226.97  0.0   
3  margaret river        AU   2.54  10.04  0.15  59.37  0.39  226.97  0.0   
4  margaret river        AU   2.54  10.04  0.15  59.37  0.39  226.97  0.0   

  Country_y  Year  MortalityRate_per_100k Country_ISO2  
0       AUS  2011                  10.110           AU  
1       AUS  2012                   6.769           AU  
2       AUS  2010                   1.823           AU  
3       AUS  2019                   1.405           AU  
4       AUS  2012                   0.430           AU  


In [18]:
# Number of rows in the merged frame, shown as the cell result
count = merged_df.shape[0]
count

70380

In [19]:
# Drop the redundant join columns ('Country_ISO2' and 'Country_y') and rename
# 'Country_x' back to 'Country' for clarity.
# errors='ignore' keeps the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
merged_df = (
    merged_df
    .drop(columns=['Country_ISO2', 'Country_y'], errors='ignore')
    .rename(columns={'Country_x': 'Country'})
)

In [20]:
# Write the merged frame to CSV (row index omitted), replacing any existing file at this path
merged_df.to_csv('Resources/merged_air_quality_health.csv', index=False)

In [21]:
# Load the merged dataset.
# keep_default_na=False stops pandas from parsing the ISO-2 country code 'NA'
# (which occurs among this notebook's country codes) as a missing value.
# groupby() drops rows whose key is NaN, so without this every city in that
# country would silently vanish from the averages. Empty fields are still
# read as NaN.
df = pd.read_csv(
    'Resources/merged_air_quality_health.csv',
    keep_default_na=False,
    na_values=[''],
)

# Columns averaged for each city
value_columns = ['PM2.5', 'PM10', 'NO2', 'O3', 'SO2', 'CO', 'NH3', 'MortalityRate_per_100k']

# Group by City and Country to calculate average values across all merged rows
# (i.e. across every WHO record matched to the city's country)
grouped_df = df.groupby(['City', 'Country'], as_index=False).agg(
    {column: 'mean' for column in value_columns}
)

# Round to one decimal place for reporting
grouped_df = grouped_df.round(1)

grouped_df.to_csv('Resources/average_air_quality_health.csv', index=False)

print(grouped_df.head(10))



            City Country  PM2.5   PM10   NO2    O3  SO2     CO  NH3  \
0    'ain benian      DZ    1.4    4.5   1.2  79.4  0.5  201.9  0.1   
1       abashiri      JP    1.8    3.5   1.0  90.1  0.4  217.0  0.8   
2         acarau      BR    2.3   10.4   0.2  52.2  0.1  347.1  0.1   
3     ahuacatlan      MX    9.1   11.0  15.8  46.5  0.7  313.8  3.0   
4         aitape      PG    1.0    1.2   0.1   7.2  0.0  168.6  0.0   
5       al hamul      EG   18.4   35.7   1.8  83.0  1.6  208.6  5.8   
6       al kharj      SA   33.2   80.8   5.8  39.7  3.8  233.6  1.1   
7       alaghsas      NE   62.9  251.9   0.0  50.8  0.0  191.9  0.0   
8         albany      US    2.6    3.4  16.1  26.1  0.1  270.4  0.3   
9  alice springs      AU   16.4   70.6   0.1  45.4  0.0  283.7  0.2   

   MortalityRate_per_100k  
0                    16.6  
1                     5.1  
2                     9.6  
3                    10.7  
4                    12.4  
5                    36.2  
6                    3