In [2]:
import requests
import json 
import pandas as pd 
import time
import pycountry
from dotenv import load_dotenv
import os 

# Run dotenv's loader first so that any values it provides are visible to the
# environment lookup below.
load_dotenv()

# API key sent later in the 'X-Api-Key' request header; None when the
# 'Nin_api_key' environment variable is not set.
Nin_api_key = os.environ.get('Nin_api_key')

In [8]:
#Load sample file to get the list of cities with population over 4,000,000
pop_ranking = pd.read_csv(r'C:\Users\gurpr\OneDrive\Documents\New Projects\Pollution and Health Analysis\Data\Raw\world-city-listing-table.csv')

# Keep only the rows above the population threshold. The explicit .copy()
# makes the result an independent DataFrame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning (assignment into a filtered slice).
pop_ranking = pop_ranking[pop_ranking['population']>4000000].copy()


def country_to_iso_alpha2(country_name):
    """Return the alpha-2 code pycountry reports for a country, or None.

    Args:
        country_name: Value to look up via ``pycountry.countries.lookup``.
            Surrounding whitespace is ignored.

    Returns:
        The ``alpha_2`` attribute of the matched country record, or ``None``
        when the value is not a non-blank string (e.g. ``None`` or NaN from a
        missing CSV cell) or when pycountry raises ``LookupError``.
    """
    # Missing or non-text cells can never identify a country; answer None
    # directly instead of relying on how the library treats such input.
    if not isinstance(country_name, str):
        return None

    cleaned = country_name.strip()
    if not cleaned:
        return None

    try:
        return pycountry.countries.lookup(cleaned).alpha_2
    except LookupError:
        return None
    
# Attach the alpha-2 code for each row's country (None where the lookup
# helper found no match).
pop_ranking['country_iso'] = pop_ranking['country'].map(country_to_iso_alpha2)

# Distinct country values present in the filtered table.
countries = {name for name in pop_ranking["country"]}

# Two-column frame (city, ISO code) iterated by the coordinate lookup loop.
cities = pop_ranking[['city', 'country_iso']]



In [None]:
#Use Ninja API to get Lon and Lat for the cities with updated Population statistics
cities_coord = []   # raw JSON response bodies, one entry per successful request
delay = 2           # seconds to pause between consecutive requests
request_timeout = 30  # seconds before a stalled request is abandoned
api_url = 'https://api.api-ninjas.com/v1/city'

# Stop here with a clear message rather than sending requests without a key.
if not Nin_api_key:
    raise RuntimeError(
        "API key 'Nin_api_key' is not set; define it in the environment or .env file "
        "before running the city lookup."
    )

for city, country in zip(cities['city'], cities['country_iso']):
    # Passing the query as `params` lets requests URL-encode the values
    # (spaces, accents, '&', ...) instead of splicing them into the URL by hand.
    # NOTE(review): min_population (4,500,000) is higher than the 4,000,000
    # filter applied when the city list was built - confirm this is intended.
    params = {'name': city, 'min_population': 4500000}

    # Only filter by country when an ISO code was resolved; otherwise the
    # literal text "None" would be sent as the country.
    if isinstance(country, str) and country:
        params['country'] = country

    try:
        response = requests.get(
            api_url,
            params=params,
            headers={'X-Api-Key': Nin_api_key},
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        # Network failure or timeout: report it and carry on with the next city.
        print("Error:", city, exc)
    else:
        if response.status_code == requests.codes.ok:
            cities_coord.append(response.text)
        else:
            print("Error:", response.status_code, response.text)

    # Pause after every attempt (not only successful ones) so that failures
    # are not retried back-to-back against the API.
    time.sleep(delay)

print(cities_coord)



In [None]:
#Flatten the nested JSON data

# Every stored response body is expected to decode to a JSON list; the
# entries of all those lists are collected into a single flat list.
flattened_list = []

for raw_response in cities_coord:
    list_of_dicts = json.loads(raw_response)

    # Report anything that is not a list and move on without storing it.
    if not isinstance(list_of_dicts, list):
        print(f"Unexpected data format: {list_of_dicts}")
        continue

    flattened_list += list_of_dicts

file_path = rf'Data\Raw\cities_geodata_list.json'

# Persist the combined records as indented JSON.
with open(file_path, 'w') as out_file:
    out_file.write(json.dumps(flattened_list, indent=4))


In [11]:
file_path = r'C:\Users\gurpr\OneDrive\Documents\New Projects\Pollution and Health Analysis\Data\Raw\cities_geodata_list.json'

# Load the saved geodata and give the name column an explicit label.
# Reassigning (instead of inplace=True) keeps each step producing a new frame.
data_to_csv = pd.read_json(file_path)
data_to_csv = data_to_csv.rename(columns={'name': 'city_name'})

# 1-based id per distinct city name. It is assigned BEFORE the filter below,
# so the ids of the rows that remain are not contiguous.
data_to_csv['city_id'] = pd.factorize(data_to_csv['city_name'])[0] + 1

# NOTE: this rebinds `cities`, which an earlier cell defines as a DataFrame;
# re-running that earlier lookup loop after this cell needs the DataFrame rebuilt first.
cities = ['Tokyo','London','Paris','Seoul','Singapore','New York','Barcelona','Sydney','Shanghai','Delhi']

# .copy() makes the filtered result an independent DataFrame so that adding
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
data_to_csv = data_to_csv[data_to_csv['city_name'].isin(cities)].copy()


def country_to_iso_alpha3(country_name):
    """Return the alpha-3 code pycountry reports for a country, or None.

    Args:
        country_name: Value to look up via ``pycountry.countries.lookup``.
            Surrounding whitespace is ignored.

    Returns:
        The ``alpha_3`` attribute of the matched country record, or ``None``
        when the value is not a non-blank string (e.g. ``None`` or NaN from a
        missing cell) or when pycountry raises ``LookupError``.
    """
    # Missing or non-text cells can never identify a country; answer None
    # directly instead of relying on how the library treats such input.
    if not isinstance(country_name, str):
        return None

    cleaned = country_name.strip()
    if not cleaned:
        return None

    try:
        return pycountry.countries.lookup(cleaned).alpha_3
    except LookupError:
        return None
    
# Derive the alpha-3 code from each row's 'country' value (None where the
# lookup helper found no match).
data_to_csv['country_iso'] = data_to_csv['country'].map(country_to_iso_alpha3)


    city_name  latitude  longitude country  population  is_capital  city_id  \
0       Tokyo   35.6897   139.6920      JP    37977000        True        1   
1       Delhi   28.6600    77.2300      IN    29617000       False        2   
2    Shanghai   31.1667   121.4670      CN    22120000       False        3   
26      Paris   48.8566     2.3522      FR    11020000        True       27   
29      Seoul   37.5833   127.0000      KR    21794000        True       30   
32     London   51.5072    -0.1275      GB    10979000        True       33   
43   New York   40.6943   -73.9249      US    18713220       False       44   
55  Singapore    1.3000   103.8000      SG     5745000        True       56   
58  Barcelona   41.3825     2.1769      ES     4588000       False       59   
64     Sydney  -33.8650   151.2090      AU     5312163       False       65   

   country_iso  
0          JPN  
1          IND  
2          CHN  
26         FRA  
29         KOR  
32         GBR  
43         

In [13]:

# Destinations for the final city table, in CSV and JSON form.
csv_save_path = rf'C:\Users\gurpr\OneDrive\Documents\New Projects\Pollution and Health Analysis\Data_ETL\Final_city_data.csv'
json_save_path = rf'C:\Users\gurpr\OneDrive\Documents\New Projects\Pollution and Health Analysis\Data_ETL\Final_city_data.json'

# CSV export without the DataFrame index column.
data_to_csv.to_csv(csv_save_path, index=False)

# Serialise the frame to a records-oriented JSON string, then parse it back
# into Python objects so it can be re-emitted with indentation.
city_data_json = data_to_csv.to_json(orient='records')
city_data = json.loads(city_data_json)

with open(json_save_path, 'w') as json_file:
    json_file.write(json.dumps(city_data, indent=4))