In [2]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np

## Sample Code For Scraping The Data at Station

In [19]:
base_url = "https://enviinfo.cem.gov.vn"
endpoint = "/eip/default/call/json/get_indicators_have_data"

url = base_url + endpoint

payload = {
    'station_id': 28602553176253587650986727137, 
    'from_public': 1,
    'station_type': 4  
}

response = requests.post(url, data=payload)

if response.status_code == 200:
    print(response.json())
else:
    print("Lỗi:", response.status_code)


{'html': '<option value="CO" selected>CO</option><option value="NO2" selected>NO2</option><option value="O3" selected>O3</option><option value="PM-10" selected>PM-10</option><option value="PM-2-5" selected>PM-2-5</option><option value="SO2" selected>SO2</option>', 'success': True}


## API For Getting Data At Stations In 1900 Days

- AQI API: `/eip/default/call/json/get_aqi_data%3Fdate%3D1900%26aqi_type%3D0` for getting data in 1900 days
- Indicators API: `/eip/default/call/json/get_indicators_have_data` for getting the name of datatables columns
- Station Details API: `/eip/default/call/json/get_stations_by_province` for getting the `StationID`, `lat`, `lon`, and its `address`

In [20]:
# Configuration for the API request
api_base_url = "https://enviinfo.cem.gov.vn"
stations_endpoint = "/eip/default/call/json/get_stations_by_province"
complete_stations_url = f"{api_base_url}{stations_endpoint}"

# Detailed request payload for fetching station data
stations_payload = {
    'sEcho': 1,  # Request identifier for DataTables
    'iColumns': 3,  # Number of columns in the DataTable
    'sColumns': ',,',  # Column identifiers
    'iDisplayStart': 0,  # Start point in the data set (for pagination)
    'iDisplayLength': 77,  # Number of records to fetch
    'mDataProp_0': 0,  # Data property for the first column
    'sSearch_0': '',  # Search term for the first column
    'bRegex_0': False,  # Use regex for the first column search term?
    'bSearchable_0': True,  # Can the first column be searched?
    'mDataProp_1': 1,  # Data property for the second column
    'station_type': 4  # Filter for station type
}

# Perform the POST request to fetch station data by province
station_response = requests.post(complete_stations_url, data=stations_payload)

# Handling the response
if station_response.status_code == 200:
    # Saving the response data to a JSON file
    with open('province-stations/province_station_data.json', 'w') as file:
        json.dump(station_response.json(), file)
    print("Station data saved successfully.")
else:
    print(f"Error fetching station data: {station_response.status_code}")

Station data saved successfully.


In [69]:
# Define your base URL and specific endpoints
base_url = "https://enviinfo.cem.gov.vn"
endpoint_aqi_data = "/eip/default/call/json/get_aqi_data%3Fdate%3D1900%26aqi_type%3D0"
endpoint_indicators_data = "/eip/default/call/json/get_indicators_have_data"

# Construct complete URLs for the requests
url_aqi_data = f"{base_url}{endpoint_aqi_data}"
url_indicators_data = f"{base_url}{endpoint_indicators_data}"

# Load province data to get province IDs
with open("province-stations/province_station_data.json", "r") as file:
    province_data = json.load(file)

    province_ids = [BeautifulSoup(province[1], "html.parser").find("a")["data-id"] for province in province_data['aaData']]
    province_latitudes = [BeautifulSoup(province[1], "html.parser").find("a")["data-lat"] for province in province_data['aaData']]
    province_longitudes = [BeautifulSoup(province[1], "html.parser").find("a")["data-lon"] for province in province_data['aaData']]

# Initialize payloads for the requests
payload_aqi = {
    'sEcho': 1,
    'iColumns': 9,
    'sColumns': ',,,,,,,,',
    'iDisplayStart': 0,
    'iDisplayLength': 1900,
    'mDataProp_0': 0,
    'sSearch_0': '',
    'bRegex_0': False,
    'bSearchable_0': True,
}

payload_indicators = {
    'station_id': 0,  
    'from_public': 1,
    'station_type': 4,
}

# Process each province
for province_id in province_ids[70:77]:
    # Update payloads with the current province ID
    payload_aqi["station_id"] = province_id
    payload_indicators["station_id"] = province_id

    # Fetch available indicators for the current province
    response_indicators = requests.post(url_indicators_data, data=payload_indicators)
    if response_indicators.status_code == 200:
        indicator_html = response_indicators.json().get('html', '')
        soup = BeautifulSoup(indicator_html, 'html.parser')
        selected_indicators = [option.text for option in soup.find_all('option', selected=True)]
        
        # Fetch AQI data for the current province
        response_aqi = requests.post(url_aqi_data, data=payload_aqi)
        if response_aqi.status_code == 200:
            data = response_aqi.json()
            # Assign the new column names including the indicators
            data['aoColumns'] = ['ID', 'Date', 'AQI'] + selected_indicators

            # Insert 2 columns to store the (province_lattitude, province_longitude)
            data['aoColumns'].insert(1, 'Province Latitude')
            data['aoColumns'].insert(2, 'Province Longitude')

            # Insert the province latitude and longitude values ignoring the ID and Date columns
            data['aaData'] = [[*row[:1], province_latitudes[province_ids.index(province_id)], province_longitudes[province_ids.index(province_id)], *row[1:]] for row in data['aaData']]

            # Save the updated data to a file
            with open(f"data-province/test-{province_id}.json", "w") as file:
                json.dump(data, file)
            # print(f"Data for province {province_id} saved successfully.")
        else:
            print(f"Error fetching AQI data for province {province_id}: {response_aqi.status_code}")
    else:
        print(f"Error fetching indicators for province {province_id}: {response_indicators.status_code}")

In [70]:
with open("province-stations/province_station_data.json", "r") as f:
    province_data = json.load(f)
    province_names = []
    province_ids = []
    for province in province_data['aaData']:
        html_snippet = province[1]
        soup = BeautifulSoup(html_snippet, "html.parser")
        province_id = soup.find("a")["data-id"]
        province_name = soup.find("a").get_text()

        province_name = province_name.replace(":", "").replace("/", "-").replace("\\", "-")
        
        # Find the file with their data-id then change to the province name if exists, ifnot, ignore
        try:
            with open(f"data-province/test-{province_id}.json", "r") as file:
                data = json.load(file)
                with open(f"data-provinces-name/{province_name}.json", "w") as new_file:
                    json.dump(data, new_file)
        except FileNotFoundError:
            print(f"File with data-id {province_id} not found.")
        except Exception as e:
            print(f"Error processing data for province {province_name}: {e}")

### Preprocessing for HCM

In [80]:
HCM_station_data = {
  '1' : {
    'Address': 'DHQG, Linh Trung, Thủ Đức',
    'Province': 'TP HCM',
    'Province Latitude': '10.86994333',
    'Province Longitude': '106.7960143'
  },
  '2' : {
    'Address': '20, Nguyễn Trọng Trí, An Lạc, Bình Tân',
    'Province': 'TP HCM',
    'Province Latitude': '10.74097081',
    'Province Longitude': '106.6204143'
  },
  '3' : {
    'Address': 'KCN Tân Bình, Tây Thạnh, Tân Phú',
    'Province': 'TP HCM',
    'Province Latitude': '10.81621227',	
    'Province Longitude': '106.6204143'
  },
  '4' : {
    'Address': '49, Thanh Đa, P.27, Bình Thạnh',
    'Province': 'TP HCM',
    'Province Latitude': '10.81584553',	
    'Province Longitude': '106.7174282'
  },
  '5' : {
    'Address': '268, Nguyễn Đình Chiểu, P.6, Q.3',
    'Province': 'TP HCM',
    'Province Latitude': '10.77636612',		
    'Province Longitude': '106.6878094'
  },
  '6' : {
    'Address': 'MM18, Trường Sơn, P.14, Q.10',
    'Province': 'TP HCM',
    'Province Latitude': '10.78047163',	
    'Province Longitude': '106.6594579'
  }
}

# Read the file temp_data/data/aqi_hcm_2021_2022.csv
df_hcm = pd.read_csv("temp_data/data/aqi_hcm_2021_2022.csv")
# Convert the Station_No base on the HCM_station_data
# Add the Province, Province Latitude, Province Longitude, Address columns
df_hcm["Station_No"] = df_hcm["Station_No"].astype(str)
df_hcm["Province"] = df_hcm["Station_No"].map(lambda x: HCM_station_data[x]["Province"])
df_hcm["Province Latitude"] = df_hcm["Station_No"].map(lambda x: HCM_station_data[x]["Province Latitude"])
df_hcm["Province Longitude"] = df_hcm["Station_No"].map(lambda x: HCM_station_data[x]["Province Longitude"])
df_hcm["Address"] = df_hcm["Station_No"].map(lambda x: HCM_station_data[x]["Address"])

# Rename date to Date
df_hcm.rename(columns={"date": "Date"}, inplace=True)

# Re-order the columns
df_hcm = df_hcm[["Station_No", "Province", "Address", "Province Latitude", "Province Longitude", "Date", "TSP" ,"PM2.5", "O3", "CO", "NO2", "SO2","Temperature", "Humidity"]]

# Save the updated DataFrame to a new CSV file
# df_hcm.to_csv("aqi_hcm_2021_2022_updated.csv", index=False)

In [81]:
# Print missing values of df_hcm
print(df_hcm.isnull().sum())

Station_No                0
Province                  0
Address                   0
Province Latitude         0
Province Longitude        0
Date                      0
TSP                      60
PM2.5                     0
O3                    10610
CO                     9065
NO2                    5666
SO2                   11006
Temperature            4437
Humidity               4432
dtype: int64


In [82]:
# Filling missing value with the mean of the column of each station No
for station_no in df_hcm["Station_No"].unique():
    for column in df_hcm.columns[6:]:
        df_hcm.loc[df_hcm["Station_No"] == station_no, column] = df_hcm.loc[df_hcm["Station_No"] == station_no, column].fillna(df_hcm.loc[df_hcm["Station_No"] == station_no, column].mean())

In [83]:
# Print missing values of df_hcm after filling missing values
print(df_hcm.isnull().sum())

Station_No            0
Province              0
Address               0
Province Latitude     0
Province Longitude    0
Date                  0
TSP                   0
PM2.5                 0
O3                    0
CO                    0
NO2                   0
SO2                   0
Temperature           0
Humidity              0
dtype: int64


### Calculate AQI

In [87]:
# Define breakpoints and AQI levels
breakpoints = {
    'O3': [(0, 54), (55, 70), (71, 85), (86, 105), (106, 200)],
    'PM2.5': [(0.0, 12.0), (12.1, 35.4), (35.5, 55.4), (55.5, 150.4), (150.5, 250.4), (250.5, 350.4), (350.5, 500.4)],
    'CO': [(0.0, 4.4), (4.5, 9.4), (9.5, 12.4), (12.5, 15.4), (15.5, 30.4), (30.5, 40.4), (40.5, 50.4)],
    'SO2': [(0, 35), (36, 75), (76, 185), (186, 304)],
    'NO2': [(0, 53), (54, 100), (101, 360), (361, 649), (650, 1249), (1250, 1649), (1650, 2049)]
}
aqi_levels = [(0, 50), (51, 100), (101, 150), (151, 200),
              (201, 300), (301, 400), (401, 500)]

def calculate_aqi(parameter, value, breakpoints):
    for index, (low, high) in enumerate(breakpoints):
        if low <= value <= high:
            aqi = round(((aqi_levels[index][1] - aqi_levels[index][0]) / (high - low)) * (value - low) + aqi_levels[index][0])
            return aqi
    return 0  # Return a default value if outside any breakpoint range

# Calculate total AQI for each row
df_hcm["AQI"] = df_hcm.apply(lambda x: max([calculate_aqi(param, x[param], breakpoints[param]) for param in ["O3", "PM2.5", "CO", "SO2", "NO2"] if x[param] is not None]), axis=1)


# Save the updated DataFrame to a new CSV file
df_hcm.to_csv("aqi_hcm_2021_2022_updated.csv", index=False)

In [77]:
df_hcm.to_csv("final-data/aqi_hcm_2021_2022_updated.csv", index=False)

## Pre-processing for All Provinces

In [3]:
# Initialize an empty DataFrame with predefined columns
df_columns = ["Province", "Province Latitude", "Province Longitude", "Date", "AQI", "CO", "NO2", "O3", "PM-10", "PM-2-5", "SO2"]
df = pd.DataFrame(columns=df_columns)

# Path to the folder containing the data files
folder_path = "data-provinces-name"

# Define a function to handle missing or "-" values
def replace_missing(value):
    if value == "-":
        return np.nan
    return value

# Process each JSON file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            data = json.load(file)

            # Extract province name from the filename
            province_name = filename[:-5]  # Assuming '.json' is the extension

            # Assuming 'aoColumns' contains column names directly
            # 'aaData' is the list of data rows
            file_columns = data.get("aoColumns", [])
            aqi_data = data.get("aaData", [])

            # Prepare data for DataFrame creation
            temp_data = []
            for row in aqi_data:
                # Replace "-" with np.nan and ensure the length matches file_columns
                row_data = [replace_missing(item) for item in row] + [np.nan] * (len(file_columns) - len(row))
                temp_data.append([province_name] + row_data[:len(file_columns)])

            # Create DataFrame for the current file
            temp_df = pd.DataFrame(temp_data, columns=["Province"] + file_columns)

            # Ensure all predefined columns are present, adding missing ones with np.nan
            for column in df_columns:
                if column not in temp_df.columns:
                    temp_df[column] = np.nan

            # Concatenate with the main DataFrame
            df = pd.concat([df, temp_df[df_columns]], ignore_index=True)

# Convert numeric columns to float, handling errors (like conversion failures) by ignoring them
numeric_columns = ["AQI", "CO", "NO2", "O3", "PM-10", "PM-2-5", "SO2"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")

# Convert '%d/%m/%Y' date format to 'yyyy-mm-dd'
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y").dt.strftime("%Y-%m-%d")

# Save the combined DataFrame to CSV
df.to_csv("combined_data.csv", index=False)

In [57]:
# Read the combined data from the CSV file
df = pd.read_csv("combined_data.csv")

# Convert the name of 'Province' column to 'Address'
df.rename(columns={"Province": "Address"}, inplace=True)

df["Address"] = df["Address"].str.replace("Thừa Thiên Huế", "TT Huế")
df["Address"] = df["Address"].str.replace("HCM", "TP HCM")

# Separate the 'Address' column into 'Province' and 'Address' columns
# The Province will get 2 first words and the Address will get the rest
df["Province"] = df["Address"].str.split(" ").str[:2].str.join(" ")
df["Address"] = df["Address"].str.split(" ").str[2:].str.join(" ")

# Re-order the columns
df = df[["Province", "Address", "Province Latitude", "Province Longitude", "Date", "AQI", "CO", "NO2", "O3", "PM-10", "PM-2-5", "SO2"]]

# Save the updated DataFrame to a new CSV file
df.to_csv("combined_data_updated.csv", index=False)

In [12]:
# Show the distinct value of Province column
print(df["Province"].unique())
print("Provinces count: ", df["Province"].nunique())

['Thanh Hoá' 'Hưng Yên' 'Hà Nam' 'Bắc Ninh' 'Thái Bình' 'Vũng Tàu'
 'Trà Vinh' 'Quảng Ngãi' 'Hà Nội' 'Quảng Ninh' 'TT Huế' 'Gia Lai'
 'Hậu Giang' 'Long An' 'Bắc Giang' 'Đà Nẵng' 'Bình Phước' 'Nghệ An'
 'TP HCM' 'Cao Bằng' 'Vĩnh Long' 'Hải Dương' 'Bình Dương' 'Ninh Thuận'
 'Lâm Đồng' 'Phú Thọ' 'Quảng Bình' 'Gia lai' 'Bình Định' 'Hà Tĩnh'
 'Khánh Hòa' 'Quảng Nam' 'Lào Cai' 'Lạng Sơn']
Provinces count:  34


In [13]:
# Print the properties of the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55774 entries, 0 to 55773
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Province            55774 non-null  object 
 1   Address             55774 non-null  object 
 2   Province Latitude   55774 non-null  float64
 3   Province Longitude  55774 non-null  float64
 4   Date                55774 non-null  object 
 5   AQI                 55774 non-null  int64  
 6   CO                  44432 non-null  float64
 7   NO2                 44570 non-null  float64
 8   O3                  44187 non-null  float64
 9   PM-10               52606 non-null  float64
 10  PM-2-5              52075 non-null  float64
 11  SO2                 46264 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 5.1+ MB
None


In [58]:
# Count the number of missing values in each column
print(df.isnull().sum())

Province                  0
Address                   0
Province Latitude         0
Province Longitude        0
Date                      0
AQI                       0
CO                    11342
NO2                   11204
O3                    11587
PM-10                  3168
PM-2-5                 3699
SO2                    9510
dtype: int64


### Fill missing value

In [59]:
df["Date"] = pd.to_datetime(df["Date"])
df["Month"] = df["Date"].dt.month
df["Year"] = df["Date"].dt.year

def fill_by_same_month_year(group):
    """Fill missing values with the mean of the same month in the same year of the province."""
    numeric_cols = group.select_dtypes(include=[np.number]).columns.difference(['Year', 'Month'])
    for col in numeric_cols:
        if col not in ['Province', 'Date', 'Year']:
            group[col] = group[col].fillna(group[col].mean())
    return group

df = fill_by_same_month_year(df)

df.drop(columns=["Year", "Month"], inplace=True)

In [63]:
# Count lines has year before 2020 and after 2023
print(df[(df["Date"] < "2020-01-01") | (df["Date"] > "2023-12-31")].shape[0])

# Drop
df = df[(df["Date"] >= "2020-01-01") & (df["Date"] <= "2023-12-31")]

9274


In [64]:
print(df.isnull().sum())

Province              0
Address               0
Province Latitude     0
Province Longitude    0
Date                  0
AQI                   0
CO                    0
NO2                   0
O3                    0
PM-10                 0
PM-2-5                0
SO2                   0
dtype: int64


In [67]:
# save this updated DataFrame to a new CSV file
df.to_csv("final-data/combined_data_updated_filled.csv", index=False)