In [None]:
import requests
import pandas as pd
from config import api_key


In [None]:
# Top 10 world cities, one row per city: (name, latitude, longitude, population)
_CITY_FIELDS = ("city", "lat", "lon", "population")
_CITY_ROWS = [
    ("Tokyo", 35.6895, 139.6917, 13929286),
    ("Delhi", 28.7041, 77.1025, 27882721),
    ("Shanghai", 31.2304, 121.4737, 24256800),
    ("São Paulo", -23.5505, -46.6333, 21571281),
    ("Mumbai", 19.0760, 72.8777, 21558265),
    ("Mexico City", 19.4326, -99.1332, 21782378),
    ("Beijing", 39.9042, 116.4074, 21516000),
    ("Osaka", 34.6937, 135.5023, 19222665),
    ("New York", 40.7128, -74.0060, 18804000),
    ("Cairo", 30.0444, 31.2357, 20095952),
]

# Each entry is a dict with the keys "city", "lat", "lon" and "population".
CITIES = [dict(zip(_CITY_FIELDS, row)) for row in _CITY_ROWS]

In [None]:

# Collect one decoded API response (a dict) per city that was fetched successfully.
air_pollution_data_list = []

# Query window as Unix timestamps in seconds. It is the same for every city,
# so it is defined once instead of on every loop iteration.
# NOTE(review): 1641016800 is 2022-01-01 06:00:00 UTC and 1672552799 is
# 2023-01-01 05:59:59 UTC, i.e. calendar year 2022 at a UTC-6 offset.
# Confirm that this offset is intended.
start = 1641016800  # Start date 01-01-2022
end = 1672552799  # End date 31-12-2022

# HTTPS so that the API key is not sent over the network in plain text.
AIR_POLLUTION_HISTORY_URL = "https://api.openweathermap.org/data/2.5/air_pollution/history"

# Without a timeout, requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Loop through the cities and make API calls for historical air pollution data
for city in CITIES:
    lat = city["lat"]
    lon = city["lon"]
    city_name = city["city"]

    # Let requests build and encode the query string instead of formatting it by hand.
    params = {"lat": lat, "lon": lon, "start": start, "end": end, "appid": api_key}
    response = requests.get(
        AIR_POLLUTION_HISTORY_URL,
        params=params,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code == 200:
        # Append the air pollution data for this city to the final list
        air_pollution_data_list.append(response.json())
    else:
        # Still best-effort (the city is skipped), but no longer silent.
        # Only the status code is printed, so the API key cannot leak into the output.
        print(f"Skipping {city_name}: API returned HTTP {response.status_code}")
    

In [None]:
# Flatten the collected responses: one row per entry of each response's
# "list", with that response's "coord" value carried along as a column.
air_pollution_df = pd.json_normalize(
    air_pollution_data_list,
    record_path="list",
    meta=["coord"],
)
air_pollution_df

In [None]:
# Show the column labels of air_pollution_df.
air_pollution_df.columns

In [None]:
# Display the frame for inspection.
air_pollution_df

In [None]:
# Helper that unpacks one 'coord' mapping into its longitude and latitude
def split_coord(coord):
    """Return a two-element Series: coord['lon'] followed by coord['lat'].

    Raises KeyError if either key is missing from ``coord``.
    """
    lon_lat = [coord[key] for key in ('lon', 'lat')]
    return pd.Series(lon_lat)

# Expand the 'coord' dicts into 'longitude' and 'latitude' columns in a single
# vectorised step. A row-wise apply that builds a new Series for every record
# gives the same values but is far slower on a large frame.
coord_parts = pd.DataFrame(
    air_pollution_df['coord'].tolist(),
    index=air_pollution_df.index,  # keep rows aligned with the source frame
)

# Add the two new columns (longitude first, then latitude) and drop the
# original 'coord' column, producing a new frame instead of mutating in place.
air_pollution_df = (
    air_pollution_df
    .assign(longitude=coord_parts['lon'], latitude=coord_parts['lat'])
    .drop(columns='coord')
)

In [None]:
# Build a DataFrame from the CITIES records (one row per city; the dict keys
# become the column labels), then show those labels.
cities_df = pd.DataFrame(CITIES)
cities_df.columns

In [None]:
# Rename the coordinate columns to 'latitude' / 'longitude'. Renaming by name
# (rather than assigning a positional list to .columns) cannot mislabel a
# column if the key order in CITIES ever changes, and re-running the cell is
# a harmless no-op.
cities_df = cities_df.rename(columns={'lat': 'latitude', 'lon': 'longitude'})
cities_df

In [None]:
# Attach the city name and population to every pollution record by matching
# on latitude. Both frames also carry a 'longitude' column, so the result
# contains 'longitude_x' (pollution side) and 'longitude_y' (cities side).
rows_before_merge = len(air_pollution_df)
air_pollution_df = pd.merge(
    air_pollution_df,
    cities_df,
    left_on='latitude',
    right_on='latitude',
    how='inner',
    # Fail loudly if two cities share a latitude; otherwise every matching
    # pollution row would be silently duplicated.
    validate='many_to_one',
)

# An inner join on a float key drops rows whose latitude does not match a
# city exactly, so report it if any were lost.
rows_dropped = rows_before_merge - len(air_pollution_df)
if rows_dropped:
    print(f"Warning: {rows_dropped} pollution rows matched no city latitude and were dropped.")


In [None]:
# Inspect the merged frame. The duplicate 'longitude_y' column is still
# present at this point; it is dropped in a later cell.
air_pollution_df

In [None]:
# Turn the Unix-second timestamps in 'dt' into 'YYYY-MM-DD' strings.
# The Unix epoch is pd.to_datetime's default origin; the timestamps are not
# shifted to any local time zone, so the resulting dates are UTC dates.
timestamps = pd.to_datetime(air_pollution_df['dt'], unit='s')
air_pollution_df['dt'] = timestamps.dt.strftime('%Y-%m-%d')

In [None]:
# Discard the duplicate longitude column that came from the cities side of the merge.
air_pollution_df = air_pollution_df.drop(columns=['longitude_y'])


In [None]:
# Bind a second name to the same DataFrame object; no copy is made here.
updated_pollution_df = air_pollution_df


In [None]:
# Map the raw column labels to their final names. Entries that map a name to
# itself are kept only to list the columns that are carried over unchanged.
final_column_names = {
    'dt': 'date',
    'city': 'city',
    'longitude_x': 'longitude',
    'latitude': 'latitude',
    'main.aqi': 'AQI',
    'components.co': 'CO',
    'components.no': 'NO',
    'components.no2': 'NO2',
    'components.o3': 'O3',
    'components.so2': 'SO2',
    'components.pm2_5': 'PM2_5',
    'components.pm10': 'PM10',
    'components.nh3': 'NH3',
    'population': 'population',
}
updated_pollution_df = updated_pollution_df.rename(columns=final_column_names)

In [None]:
# Keep only the columns of interest, in their final order.
ordered_columns = [
    'date', 'longitude', 'latitude', 'city', 'population',
    'AQI', 'CO', 'NO', 'NO2', 'O3', 'SO2', 'PM2_5', 'PM10', 'NH3',
]
updated_pollution_df = updated_pollution_df[ordered_columns]

# NOTE(review): sort_values returns a new frame. The sorted result is only
# displayed here; updated_pollution_df itself keeps its existing row order.
# Confirm whether the sorted order was meant to be kept.
updated_pollution_df.sort_values(by='date')


In [None]:
# Export the cleaned pollution table. index=False leaves out the row index,
# which is only the positional counter produced by the merge and would
# otherwise appear as an unnamed first column when the file is read back.
updated_pollution_df.to_csv('air_pollution.csv', index=False)

In [None]:
import glob


In [None]:
# Directory holding the CSV files to combine; it is a relative path, so it is
# resolved against the current working directory.
path = r'City_csv'

In [None]:
# Get a list of all CSV files in the directory. glob.glob returns matches in
# arbitrary order, so sort them: the row order of the combined frame (and any
# later head(...) taken from it) is then the same on every run and machine.
all_files = sorted(glob.glob(path + "/*.csv"))

In [None]:
# Read each CSV file into its own frame, then stack the frames row-wise
# into one dataframe.
frames = [pd.read_csv(csv_file) for csv_file in all_files]
df = pd.concat(frames)

In [None]:
# Write the combined data to a new CSV file in the current working directory.
# index=False keeps the row index out of the file.
df.to_csv("combined_data.csv", index=False)

In [None]:
# Read the combined CSV file back from disk.
weather_data = pd.read_csv("combined_data.csv")

In [None]:
# Wrap the loaded data in a DataFrame under the name used by the cells below.
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrapper looks
# redundant. Confirm before simplifying.
weather_data_df = pd.DataFrame(weather_data)

In [None]:
# Display the pollution frame for inspection.
updated_pollution_df

In [None]:
# Display the weather frame for inspection.
weather_data_df

In [None]:
# Take only the first rows of each frame for the merge below.
preview_rows = 500
updated_500 = updated_pollution_df.head(preview_rows)
weather_500 = weather_data_df.head(preview_rows)

In [None]:
# Left-join the weather sample onto the pollution sample by coordinates, so
# every pollution row is kept even when no weather row matches.
# NOTE(review): the key is ('longitude', 'latitude') only, so each pollution
# row is paired with every weather row at the same coordinates. Confirm
# whether a date/time key is also needed, and that weather_500 has both columns.
data_merged = updated_500.merge(
    weather_500,
    how='left',
    on=['longitude', 'latitude'],
)