In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress

# Import the OpenWeatherMap API key
from api_keys import weather_api_key

# Import citipy to determine the cities based on latitude and longitude
from citipy import citipy

In [3]:
# Generate random latitude and longitude pairs. A seeded generator is used so
# that Restart & Run All samples the same candidate coordinates every time.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
lats = rng.uniform(-90, 90, size=1000)
lngs = rng.uniform(-180, 180, size=1000)

# Use citipy to determine the nearest city for each lat-lng combination.
# dict.fromkeys drops duplicates with a hash lookup (instead of rescanning a
# list for every candidate) while keeping the first-seen order.
cities = list(dict.fromkeys(
    citipy.nearest_city(lat, lng).city_name for lat, lng in zip(lats, lngs)
))

print(f"Generated {len(cities)} unique cities.")

Generated 452 unique cities.


In [4]:
# Base URLs (HTTPS so the API key in the query string is not sent in clear text)
weather_url = "https://api.openweathermap.org/data/2.5/weather"
air_quality_url = "https://api.openweathermap.org/data/2.5/air_pollution"

# Dictionary to store valid city data with coordinates
valid_cities = {}

for city in cities:
    try:
        # Fetch city coordinates using the weather endpoint. Passing the query
        # through `params` lets requests URL-encode the city name, and the
        # timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(
            weather_url,
            params={"q": city, "appid": weather_api_key},
            timeout=10,
        )

        if response.status_code != 200:
            print(f"City {city} not found (Status Code: {response.status_code}). Skipping...")
            continue

        # Parse JSON response for coordinates
        data = response.json()
        lat = data['coord']['lat']
        lon = data['coord']['lon']
        country = data['sys']['country']

        # Store in valid_cities dictionary, keyed by the city name
        valid_cities[city] = {'city': city, 'lat': lat, 'lon': lon, 'country': country}

        print(f"Retrieved coordinates for {city}: ({lat}, {lon})")

    except KeyError as key_err:
        print(f"KeyError: {key_err} for {city}. Skipping...")
    except requests.exceptions.RequestException as req_err:
        print(f"Request error: {req_err} for {city}. Skipping...")
    finally:
        # Pause to avoid rate limiting. Placed in `finally` so the pause also
        # happens when a city is skipped via `continue` above.
        time.sleep(1)

print(f"Total valid cities with coordinates: {len(valid_cities)}")

Retrieved coordinates for huangmei: (30.1924, 116.025)
Retrieved coordinates for sitka: (57.0531, -135.33)
Retrieved coordinates for badger: (64.8, -147.5333)
Retrieved coordinates for edinburgh of the seven seas: (-37.0676, -12.3116)
Retrieved coordinates for ballina: (-28.8667, 153.5667)
Retrieved coordinates for wailua homesteads: (22.0669, -159.378)
Retrieved coordinates for west island: (-12.1568, 96.8225)
City vingt cinq not found (Status Code: 404). Skipping...
Retrieved coordinates for tobelo: (1.7284, 128.0095)
Retrieved coordinates for puerto ayora: (-0.7393, -90.3518)
Retrieved coordinates for port-aux-francais: (-49.35, 70.2167)
Retrieved coordinates for albany: (42.6001, -73.9662)
Retrieved coordinates for gizo: (-8.103, 156.8419)
Retrieved coordinates for angaur state: (6.906, 134.13)
Retrieved coordinates for adamstown: (-25.066, -130.1015)
Retrieved coordinates for tazacorte: (28.629, -17.9293)
Retrieved coordinates for clanton: (32.8387, -86.6294)
Retrieved coordinates

In [5]:
# Dump the full mapping of city name -> {'city', 'lat', 'lon', 'country'} for inspection
print(valid_cities)

{'huangmei': {'city': 'huangmei', 'lat': 30.1924, 'lon': 116.025, 'country': 'CN'}, 'sitka': {'city': 'sitka', 'lat': 57.0531, 'lon': -135.33, 'country': 'US'}, 'badger': {'city': 'badger', 'lat': 64.8, 'lon': -147.5333, 'country': 'US'}, 'edinburgh of the seven seas': {'city': 'edinburgh of the seven seas', 'lat': -37.0676, 'lon': -12.3116, 'country': 'SH'}, 'ballina': {'city': 'ballina', 'lat': -28.8667, 'lon': 153.5667, 'country': 'AU'}, 'wailua homesteads': {'city': 'wailua homesteads', 'lat': 22.0669, 'lon': -159.378, 'country': 'US'}, 'west island': {'city': 'west island', 'lat': -12.1568, 'lon': 96.8225, 'country': 'CC'}, 'tobelo': {'city': 'tobelo', 'lat': 1.7284, 'lon': 128.0095, 'country': 'ID'}, 'puerto ayora': {'city': 'puerto ayora', 'lat': -0.7393, 'lon': -90.3518, 'country': 'EC'}, 'port-aux-francais': {'city': 'port-aux-francais', 'lat': -49.35, 'lon': 70.2167, 'country': 'TF'}, 'albany': {'city': 'albany', 'lat': 42.6001, 'lon': -73.9662, 'country': 'US'}, 'gizo': {'ci

In [6]:
# Collect just the city names (the keys of valid_cities) and show them
city_names = [*valid_cities]
print(city_names)

['huangmei', 'sitka', 'badger', 'edinburgh of the seven seas', 'ballina', 'wailua homesteads', 'west island', 'tobelo', 'puerto ayora', 'port-aux-francais', 'albany', 'gizo', 'angaur state', 'adamstown', 'tazacorte', 'clanton', 'silifke', 'olonkinbyen', 'tura', 'solleftea', 'stanley', 'mongoumba', 'bamboo flat', 'nemuro', 'bethel', 'taroa', 'gopalpur', 'invercargill', 'ushuaia', 'bilibino', 'puerto deseado', 'susuman', 'isafjordur', 'cabugao', 'cadale', 'whakatane', 'xiongzhou', 'childress', 'grytviken', 'vorgashor', 'anadyr', "kapa'a", 'mastic beach', 'bodo', 'pacific grove', 'waitangi', 'mammoth lakes', 'colonia', 'port lincoln', 'qaqortoq', 'campoverde', 'lebu', 'udachny', 'kodiak', 'big bend', 'lerwick', 'new norfolk', 'aliveri', 'ilulissat', 'fisterra', 'mumbwa', 'berriozabal', 'utrik', 'port augusta', 'blackmans bay', 'correntina', 'bredasdorp', 'avarua', 'kirakira', 'amahai', 'weno', 'cheria', 'enewetak', 'tamandare', 'sao joao da barra', 'aykhal', 'magadan', 'umm kaddadah', 'si

In [7]:
# List to store air quality data (one dict per city)
air_quality_data = []

# Maps the output column name to the key used in the API's `components` object
pollutant_keys = {
    'PM2.5': 'pm2_5',
    'PM10': 'pm10',
    'NO2': 'no2',
    'O3': 'o3',
    'SO2': 'so2',
    'CO': 'co',
    'NH3': 'nh3',
}

# Loop through each city to get pollution data
for city, info in valid_cities.items():
    try:
        # Query the Air Pollution API. `params` handles URL encoding and the
        # timeout prevents a single stalled request from blocking the loop.
        pollution_reply = requests.get(
            air_quality_url,
            params={"lat": info['lat'], "lon": info['lon'], "appid": weather_api_key},
            timeout=10,
        )
        # Surface HTTP errors (bad key, rate limit, ...) as such instead of as
        # a confusing KeyError on 'list' further down.
        pollution_reply.raise_for_status()
        pollution_response = pollution_reply.json()

        # Extract pollution components; a missing pollutant is stored as None
        components = pollution_response['list'][0]['components']
        record = {'City': city, 'Country': info['country']}
        for column, key in pollutant_keys.items():
            record[column] = components.get(key)

        # Store the pollution data
        air_quality_data.append(record)

        print(f"Retrieved pollution data for {city}, {info['country']}")

    except (requests.exceptions.RequestException, KeyError, IndexError, ValueError) as e:
        # Best effort: report the failure for this city and move on
        print(f"Error retrieving data for {city}: {e}")

    # Pause to avoid rate limiting
    time.sleep(1)

# Convert the collected data into a DataFrame
air_quality_df = pd.DataFrame(air_quality_data)
print(air_quality_df.head())


Retrieved pollution data for huangmei, CN
Retrieved pollution data for sitka, US
Retrieved pollution data for badger, US
Retrieved pollution data for edinburgh of the seven seas, SH
Retrieved pollution data for ballina, AU
Retrieved pollution data for wailua homesteads, US
Retrieved pollution data for west island, CC
Retrieved pollution data for tobelo, ID
Retrieved pollution data for puerto ayora, EC
Retrieved pollution data for port-aux-francais, TF
Retrieved pollution data for albany, US
Retrieved pollution data for gizo, SB
Retrieved pollution data for angaur state, PW
Retrieved pollution data for adamstown, PN
Retrieved pollution data for tazacorte, ES
Retrieved pollution data for clanton, US
Retrieved pollution data for silifke, TR
Retrieved pollution data for olonkinbyen, SJ
Retrieved pollution data for tura, IN
Retrieved pollution data for solleftea, SE
Retrieved pollution data for stanley, GB
Retrieved pollution data for mongoumba, CF
Retrieved pollution data for bamboo flat, 

In [9]:
# Persist the per-city pollutant readings; index=False leaves the row index out of the file
air_quality_df.to_csv('Resources/air_quality_data.csv', index=False)

In [10]:
# Create a dictionary to store city-country pairs.
# The country code for every city was already captured from the weather
# endpoint when valid_cities was built, so it is read from there instead of
# calling the API (and sleeping one second) a second time for each city.
city_country_map = {}

for city in city_names:
    # Store the city and corresponding country code
    country = valid_cities[city]['country']
    city_country_map[city] = country

    # Print city and country
    print(f"City: {city}, Country: {country}")

print(city_country_map)


City: huangmei, Country: CN
City: sitka, Country: US
City: badger, Country: US
City: edinburgh of the seven seas, Country: SH
City: ballina, Country: AU
City: wailua homesteads, Country: US
City: west island, Country: CC
City: tobelo, Country: ID
City: puerto ayora, Country: EC
City: port-aux-francais, Country: TF
City: albany, Country: US
City: gizo, Country: SB
City: angaur state, Country: PW
City: adamstown, Country: PN
City: tazacorte, Country: ES
City: clanton, Country: US
City: silifke, Country: TR
City: olonkinbyen, Country: SJ
City: tura, Country: IN
City: solleftea, Country: SE
City: stanley, Country: GB
City: mongoumba, Country: CF
City: bamboo flat, Country: IN
City: nemuro, Country: JP
City: bethel, Country: US
City: taroa, Country: MH
City: gopalpur, Country: IN
City: invercargill, Country: NZ
City: ushuaia, Country: AR
City: bilibino, Country: RU
City: puerto deseado, Country: AR
City: susuman, Country: RU
City: isafjordur, Country: IS
City: cabugao, Country: PH
City: cad

In [11]:
# Report how many cities ended up with a country code in city_country_map
print(f"Total cities: {len(city_country_map)}")

Total cities: 434


In [12]:
# Turn the city -> country mapping into a list of (city, country) rows
data = [(name, code) for name, code in city_country_map.items()]

# Tabulate the pairs under explicit column names
df = pd.DataFrame(data=data, columns=['City', 'Country'])

# Write the table to disk without the row index
df.to_csv('Resources/city_country_map.csv', encoding='utf-8', index=False)

In [13]:
# Reduce the country codes in the city-country map to the distinct set
unique_countries = {code for code in city_country_map.values()}

# Show how many distinct countries there are, followed by the codes themselves
total_unique = len(unique_countries)
print(f"Total unique countries: {total_unique}")
print(unique_countries)


Total unique countries: 106
{'ZA', 'JP', 'IE', 'GE', 'IS', 'NG', 'BE', 'KE', 'MU', 'CK', 'GB', 'NO', 'FO', 'GF', 'VU', 'SE', 'DZ', 'MX', 'MH', 'SS', 'RO', 'GA', 'ER', 'ET', 'SH', 'DE', 'PF', 'EH', 'KP', 'CL', 'CN', 'PE', 'CI', 'BR', 'TH', 'GL', 'PN', 'FR', 'LY', 'VG', 'NZ', 'SY', 'AU', 'KZ', 'SA', 'CF', 'RU', 'MY', 'IR', 'MZ', 'YE', 'IT', 'NA', 'VN', 'AR', 'SB', 'AS', 'PL', 'MG', 'SZ', 'BO', 'KR', 'SO', 'NE', 'ID', 'ZW', 'CO', 'CA', 'CM', 'ES', 'SD', 'FM', 'PK', 'CC', 'SJ', 'VE', 'PT', 'RE', 'GS', 'ZM', 'FI', 'GQ', 'PW', 'MN', 'AO', 'UY', 'TK', 'MA', 'ML', 'JM', 'EG', 'US', 'PH', 'SL', 'CV', 'TR', 'MV', 'TZ', 'IN', 'TF', 'PG', 'TC', 'SN', 'LR', 'EC', 'GR'}


In [14]:
import pycountry

# WHO API: Get all available countries and their alpha-3 codes.
# The timeout stops the cell hanging on a stalled connection, and
# raise_for_status reports an HTTP failure directly instead of letting it
# surface later as a JSON or KeyError problem.
who_url = "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues"
who_reply = requests.get(who_url, timeout=30)
who_reply.raise_for_status()
who_response = who_reply.json()

# Convert WHO countries from alpha-3 to alpha-2 codes so they can be matched
# with the OpenWeather country codes
who_countries_alpha2 = {}

for entry in who_response['value']:
    alpha_3 = entry['Code']
    country_name = entry['Title']

    # Convert alpha-3 to alpha-2 code using pycountry. When the lookup finds
    # nothing, reading .alpha_2 raises AttributeError and the entry is skipped.
    try:
        alpha_2 = pycountry.countries.get(alpha_3=alpha_3).alpha_2
        who_countries_alpha2[alpha_2] = country_name
    except AttributeError:
        print(f"Skipping {alpha_3} - No matching alpha-2 code found.")

print(f"WHO Countries (alpha-2): {who_countries_alpha2}")

# Live view of the alpha-2 codes; used for membership checks
who_country_codes = who_countries_alpha2.keys()
print(f"WHO Country Codes: {list(who_country_codes)}")


Skipping CHI - No matching alpha-2 code found.
Skipping ME1 - No matching alpha-2 code found.
Skipping SDN736 - No matching alpha-2 code found.
Skipping XKX - No matching alpha-2 code found.
WHO Countries (alpha-2): {'AW': 'Aruba', 'AF': 'Afghanistan', 'AO': 'Angola', 'AI': 'Anguilla', 'AL': 'Albania', 'AD': 'Andorra', 'AE': 'United Arab Emirates', 'AR': 'Argentina', 'AM': 'Armenia', 'AS': 'American Samoa', 'AG': 'Antigua and Barbuda', 'AU': 'Australia', 'AT': 'Austria', 'AZ': 'Azerbaijan', 'BI': 'Burundi', 'BE': 'Belgium', 'BJ': 'Benin', 'BQ': 'Bonaire, Saint Eustatius and Saba', 'BF': 'Burkina Faso', 'BD': 'Bangladesh', 'BG': 'Bulgaria', 'BH': 'Bahrain', 'BS': 'Bahamas', 'BA': 'Bosnia and Herzegovina', 'BY': 'Belarus', 'BZ': 'Belize', 'BM': 'Bermuda', 'BO': 'Bolivia (Plurinational State of)', 'BR': 'Brazil', 'BB': 'Barbados', 'BN': 'Brunei Darussalam', 'BT': 'Bhutan', 'BW': 'Botswana', 'CF': 'Central African Republic', 'CA': 'Canada', 'CH': 'Switzerland', 'CL': 'Chile', 'CN': 'China'

In [15]:
# Keep only the (city, country code) pairs whose code is also present in the WHO data
filtered_cities = dict(
    filter(lambda pair: pair[1] in who_country_codes, city_country_map.items())
)

matching_total = len(filtered_cities)
print(f"Cities with matching WHO data: {matching_total}")
print(filtered_cities)


Cities with matching WHO data: 427
{'huangmei': 'CN', 'sitka': 'US', 'badger': 'US', 'edinburgh of the seven seas': 'SH', 'ballina': 'AU', 'wailua homesteads': 'US', 'tobelo': 'ID', 'puerto ayora': 'EC', 'albany': 'US', 'gizo': 'SB', 'angaur state': 'PW', 'tazacorte': 'ES', 'clanton': 'US', 'silifke': 'TR', 'tura': 'IN', 'solleftea': 'SE', 'stanley': 'GB', 'mongoumba': 'CF', 'bamboo flat': 'IN', 'nemuro': 'JP', 'bethel': 'US', 'taroa': 'MH', 'gopalpur': 'IN', 'invercargill': 'NZ', 'ushuaia': 'AR', 'bilibino': 'RU', 'puerto deseado': 'AR', 'susuman': 'RU', 'isafjordur': 'IS', 'cabugao': 'PH', 'cadale': 'SO', 'whakatane': 'NZ', 'xiongzhou': 'CN', 'childress': 'US', 'vorgashor': 'RU', 'anadyr': 'RU', "kapa'a": 'US', 'mastic beach': 'US', 'bodo': 'NO', 'pacific grove': 'US', 'waitangi': 'NZ', 'mammoth lakes': 'US', 'colonia': 'DE', 'port lincoln': 'AU', 'qaqortoq': 'GL', 'campoverde': 'PE', 'lebu': 'CL', 'udachny': 'RU', 'kodiak': 'US', 'big bend': 'SZ', 'lerwick': 'GB', 'new norfolk': 'AU

In [16]:
# WHO API: Air pollution attributable DALYs
# NOTE(review): the comment and the CSV name say DALYs, but the value column
# is called 'MortalityRate_per_100k' - confirm what the AIR_42 indicator
# actually measures before interpreting the numbers.
who_url = "https://ghoapi.azureedge.net/api/AIR_42"
# The timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(who_url, timeout=30)

# Check for valid response
if response.status_code == 200:
    data = response.json()['value']
    # Parse the data into a DataFrame, keeping country code, year and value.
    # NOTE(review): the API can return several records for the same country
    # and year (the merged preview shows two different CHN 2014 values),
    # presumably one per breakdown such as sex or cause - confirm, because
    # those extra dimensions are not kept here.
    who_data = [
        {
            'Country': entry['SpatialDim'],
            'Year': entry['TimeDim'],
            'MortalityRate_per_100k': entry['NumericValue']
        }
        for entry in data
    ]

    df_who = pd.DataFrame(who_data)
    print(df_who.head())

    df_who.to_csv('Resources/who_air_pollution_dalys.csv', index=False)
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")


  Country  Year  MortalityRate_per_100k
0     CIV  2015                  40.103
1     SOM  2011                  53.318
2     GRD  2012                   0.567
3     BRN  2015                   1.440
4     HND  2016                  32.081


In [17]:
# Function to convert ISO-3 to ISO-2
def iso3_to_iso2(iso3_code):
    """Return the ISO alpha-2 code for an ISO alpha-3 country code.

    The code is looked up with ``pycountry.countries.get(alpha_3=...)``.
    If that raises ``AttributeError`` (for example because the lookup
    returned ``None`` and has no ``alpha_2``), ``None`` is returned instead.
    """
    try:
        match = pycountry.countries.get(alpha_3=iso3_code)
        iso2_code = match.alpha_2
    except AttributeError:
        iso2_code = None
    return iso2_code

# Reload the WHO indicator table from disk
df_who = pd.read_csv('Resources/who_air_pollution_dalys.csv')

# Convert WHO's ISO-3 codes to ISO-2
df_who['Country_ISO2'] = df_who['Country'].apply(iso3_to_iso2)

# Rows whose ISO-3 code could not be converted have no join key, so drop them
df_who_clean = df_who.dropna(subset=['Country_ISO2'])

# keep_default_na=False stops pandas from reading Namibia's country code "NA"
# as a missing value; na_values=[''] still treats empty fields as missing.
# NOTE(review): df_weather is not used in this cell - confirm it is needed later.
df_weather = pd.read_csv(
    'Resources/city_country_map.csv', keep_default_na=False, na_values=['']
)

# Inner join on the ISO-2 country code: every city row is paired with every
# WHO row for its country, so one city yields many rows.
merged_df = pd.merge(air_quality_df, df_who_clean, left_on='Country', right_on='Country_ISO2', how='inner')

print(merged_df.head())

merged_df.to_csv('Resources/merged_air_quality_health.csv', index=False)


       City Country_x  PM2.5   PM10    NO2     O3    SO2       CO  NH3  \
0  huangmei        CN  49.51  54.46  48.67  22.17  12.99  1161.58  0.0   
1  huangmei        CN  49.51  54.46  48.67  22.17  12.99  1161.58  0.0   
2  huangmei        CN  49.51  54.46  48.67  22.17  12.99  1161.58  0.0   
3  huangmei        CN  49.51  54.46  48.67  22.17  12.99  1161.58  0.0   
4  huangmei        CN  49.51  54.46  48.67  22.17  12.99  1161.58  0.0   

  Country_y  Year  MortalityRate_per_100k Country_ISO2  
0       CHN  2017                  53.760           CN  
1       CHN  2018                  50.178           CN  
2       CHN  2014                   4.470           CN  
3       CHN  2010                  18.528           CN  
4       CHN  2014                  25.500           CN  


In [18]:
# Number of rows in the merged frame (each city is repeated once per matching WHO row)
count = len(merged_df)
count

72000

In [19]:
# Drop redundant columns (dropping 'Country_ISO2' and 'Country_y').
# errors='ignore' keeps this cell safe to re-run: on a second run the columns
# are already gone and drop would otherwise raise KeyError.
merged_df = merged_df.drop(columns=['Country_ISO2', 'Country_y'], errors='ignore')

# Rename columns for clarity (a no-op if 'Country_x' was already renamed)
merged_df = merged_df.rename(columns={'Country_x': 'Country'})

In [20]:
# Write the merged frame to the same CSV path used right after the merge, replacing that earlier file
merged_df.to_csv('Resources/merged_air_quality_health.csv', index=False)

In [21]:
# Load the merged dataset.
# keep_default_na=False stops pandas from reading the country code "NA"
# (Namibia) as a missing value - groupby would then silently drop those
# cities. na_values=[''] still treats empty fields as missing numbers.
df = pd.read_csv(
    'Resources/merged_air_quality_health.csv',
    keep_default_na=False,
    na_values=[''],
)

# Group by City and Country to calculate average values across all of the
# WHO rows that were joined to each city (i.e. across all years)
grouped_df = df.groupby(['City', 'Country'], as_index=False).agg({
    'PM2.5': 'mean',
    'PM10': 'mean',
    'NO2': 'mean',
    'O3': 'mean',
    'SO2': 'mean',
    'CO': 'mean',
    'NH3': 'mean',
    'MortalityRate_per_100k': 'mean'
})

# One decimal place is enough for the summary table
grouped_df = grouped_df.round(1)

grouped_df.to_csv('Resources/average_air_quality_health.csv', index=False)

print(grouped_df.head(10))



            City Country  PM2.5   PM10   NO2     O3  SO2     CO  NH3  \
0      abbeville      FR    4.7    8.8   4.7   36.8  0.8  210.3  1.3   
1         acarau      BR    2.1    9.6   0.2   52.2  0.1  353.8  0.1   
2          agbor      NG   29.9   39.2   5.7    2.5  0.4  894.6  0.6   
3        al jawf      SA   11.1   39.2   1.0  101.6  1.7  193.6  0.5   
4      al qusayr      SY    7.9   10.0   4.7   40.8  2.5  210.3  2.7   
5       alaghsas      NE   63.9  271.8   0.0   53.6  0.0  196.9  0.0   
6         albany      US    2.1    2.6  12.8   24.7  0.1  253.7  0.1   
7  alice springs      AU   14.5   61.3   0.1   50.1  0.0  283.7  0.2   
8        aliveri      GR    1.1    4.3   0.4   78.0  0.2  200.3  0.2   
9       alliance      US   13.4   16.2  21.9   24.3  0.5  367.2  0.6   

   MortalityRate_per_100k  
0                     4.1  
1                     9.6  
2                    29.6  
3                    33.5  
4                    26.6  
5                    34.1  
6          