In [None]:
# Smoke test: prints a fixed string to confirm the kernel executes cells.
print("hello")

In [None]:
import pandas as pd
from statsmodels.tsa.arima_model import ARIMA

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import os
import aqi
import missingno as msno
from fancyimpute import SimpleFill, KNN, MatrixFactorization, IterativeImputer 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# where this exact file exists; consider a configurable data directory.
data_path = r"D:\AQI_forecasting\backend\data\bangkok-air-quality.csv"
# Load the raw CSV as-is; header cleanup and dtype conversion are done by cleaning_data().
aqi_data = pd.read_csv(data_path)
aqi_data.head()

In [None]:
def remove_whitespace_header(df):
    """Strip leading/trailing whitespace from every column label of ``df``.

    The cleaned labels are assigned back onto ``df`` itself (in place) and
    the same object is returned so the call can be chained.
    """
    cleaned_labels = df.columns.str.strip()
    df.columns = cleaned_labels
    return df

def update_data_aqi(df):
    """Return ``df`` unchanged; no processing is performed here."""
    return df

def cleaning_data(df):
    """Normalise a raw air-quality frame: clean headers, parse types, sort by date.

    The work is done on a copy, so the frame passed in is left untouched and
    the calling cell can be re-run against the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``date`` column (header whitespace is tolerated).

    Returns
    -------
    pandas.DataFrame
        New frame with ``date`` parsed to datetime, every other column
        converted to numeric (unparseable entries become NaT/NaN), sorted by
        ``date`` and re-indexed from 0.

    Raises
    ------
    KeyError
        If there is no ``date`` column once header whitespace is stripped.
    """
    # Copy first: the header helper and the column assignments below write
    # into their target, which would otherwise be the caller's raw frame.
    df = remove_whitespace_header(df.copy())

    # Ensure 'date' column exists before processing
    if 'date' not in df.columns:
        raise KeyError("'date' column is missing in the DataFrame.")

    # Unparseable dates become NaT instead of raising
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Every remaining column is expected to hold measurements; non-numeric
    # entries are coerced to NaN
    for col in df.columns:
        if col != 'date':
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Chronological order with a fresh 0..n-1 index
    return df.sort_values('date').reset_index(drop=True)

def extract_aqi(df):
    """Return a copy of ``df`` whose ``aqi`` column is filled in row by row.

    For each row:

    * an existing, non-missing ``aqi`` value is kept (converted to float);
    * otherwise, if at least one pollutant reading is available, the AQI is
      computed with ``aqi.to_aqi(..., algo=aqi.ALGO_MEP)``;
    * otherwise (no AQI and no readings) the value is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain an ``aqi`` column and any of the pollutant
        columns ``pm25, pm10, o3, no2, so2, co``.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with a float ``aqi`` column.
    """
    pollutants = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']

    # replace() returns a new frame, so the caller's frame is not modified
    df = df.replace({'NaT': np.nan})
    available = [name for name in pollutants if name in df.columns]
    has_aqi = 'aqi' in df.columns

    aqi_list = []
    for _, row in df.iterrows():
        aqi_val = row['aqi'] if has_aqi else np.nan

        # A recorded AQI always wins, even when the row has no pollutant
        # readings (previously such rows were overwritten with NaN).
        if not pd.isna(aqi_val):
            aqi_list.append(float(aqi_val))
            continue

        readings = [(name, row[name]) for name in available if not pd.isna(row[name])]
        if not readings:
            aqi_list.append(np.nan)
            continue

        try:
            # Always store a plain float so the column has one numeric type
            # whether one or several pollutants were available.
            aqi_list.append(float(aqi.to_aqi(readings, algo=aqi.ALGO_MEP)))
        except ValueError:
            # The library could not derive an index from these readings
            aqi_list.append(np.nan)

    df['aqi'] = aqi_list
    return df

    
# Run the cleaning step on the raw frame, then fill in / derive the 'aqi' column.
aqi_cleaning = cleaning_data(aqi_data)
aqi_cleaning = extract_aqi(aqi_cleaning)
aqi_cleaning


In [None]:
cols = ['date','aqi']
# .copy() makes an independent frame; assigning into a bare column slice of
# aqi_cleaning triggers pandas' SettingWithCopyWarning.
aqi_complete = aqi_cleaning[cols].copy()
aqi_complete['aqi'] = pd.to_numeric(aqi_complete['aqi'], errors='coerce')

# Keep observations from 2016 onwards and index the series by date
aqi_complete = aqi_complete[aqi_complete.date >= '2016-01-01'].reset_index(drop=True)
aqi_complete = aqi_complete.rename(columns={'aqi': 'bangkok_aqi'}).set_index('date')

# Only a cell's last expression is rendered, so print the previews explicitly
print(aqi_complete.head())
print(aqi_complete.tail())

print(aqi_complete.index.min())
print(aqi_complete.index.max())

In [5]:
def plot_imputation(data_series, imputed_sf, imputed_knn, imputed_mice, start_date, end_date):
    """Plot three imputation results against the original series.

    Draws a three-row figure (Simple Fill, KNN, MICE). In each panel the
    imputed values are drawn in red and the original series in blue on top,
    restricted to the ``start_date``..``end_date`` slice of the index.
    """
    frame = pd.DataFrame(data_series)

    # (column name, imputed values, legend label, panel title) per method
    panels = [
        ('sf', imputed_sf, 'Simple Fill', 'Simple Fill (Gaussian Noise)'),
        ('knn', imputed_knn, 'KNN Imputation', 'KNN Imputation'),
        ('mice', imputed_mice, 'MICE Imputation', 'MICE Imputation'),
    ]
    for column, imputed, _label, _title in panels:
        # Imputers return (n, 1) arrays; squeeze them to 1-D columns
        frame[column] = np.squeeze(imputed)

    frame = frame[start_date:end_date]
    original_column = data_series.name

    fig, axes = plt.subplots(3, 1, figsize=(15, 4 * 6), dpi=100)

    for ax, (column, _imputed, label, title) in zip(axes, panels):
        ax.plot(frame.index, frame[column], marker='o', color='red', label=label)
        ax.plot(frame.index, frame[original_column], marker='o', color='blue', label='Original Data')
        ax.set_title(title)
        ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Dates whose AQI value is missing
aqi_complete.loc[aqi_complete.bangkok_aqi.isnull(), 'bangkok_aqi']

In [None]:
# Each imputer is given the single AQI column reshaped to a 2-D (n, 1) array.
# NOTE(review): fill_method="random" is stochastic and no seed is set in this
# cell, so results presumably differ between runs — confirm and seed if needed.
bangkok_imputed_sf = SimpleFill(fill_method="random", min_value= min(aqi_complete.bangkok_aqi.dropna())).fit_transform(aqi_complete.bangkok_aqi.values.reshape(-1,1))

# KNN imputation with k=10
bangkok_imputed_knn = KNN(k=10, verbose=False).fit_transform(aqi_complete.bangkok_aqi.values.reshape(-1,1))

# Iterative imputation (used as the "MICE" result below), capped at 100 iterations
bangkok_imputed_mice = IterativeImputer(max_iter=100, verbose=False).fit_transform(aqi_complete.bangkok_aqi.values.reshape(-1,1))


In [None]:
# Compare the three imputations over the whole period (2016-01-01 .. 2025-01-15)
plot_imputation(aqi_complete.bangkok_aqi,
                bangkok_imputed_sf,
                bangkok_imputed_knn,
                bangkok_imputed_mice,
                start_date='2016-01-01',
                end_date='2025-01-15'
                )

In [None]:
# Same comparison, zoomed in on 2016-01-01 .. 2016-06-01
plot_imputation(aqi_complete.bangkok_aqi,
                bangkok_imputed_sf,
                bangkok_imputed_knn,
                bangkok_imputed_mice,
                start_date='2016-01-01',
                end_date='2016-06-01'
                )

In [None]:
def extract_ymd(df):
    """Add ``Year``, ``Month`` and ``Day`` columns derived from ``df['date']``.

    The columns are written onto ``df`` itself, which is also returned.
    ``df['date']`` must be a datetime column (it is read through ``.dt``).
    """
    date_parts = df['date'].dt
    for column, attribute in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
        df[column] = getattr(date_parts, attribute)
    return df

# reset_index() turns the 'date' index back into a column (and returns a new
# frame), so the Year/Month/Day columns are added to `data`, not to aqi_complete.
data = extract_ymd(aqi_complete.reset_index())
data.head()

In [None]:
# plt.figure (lower-case) creates the current pyplot figure that the plot
# below draws on; plt.Figure only builds a detached object, so the requested
# size/dpi were silently ignored.
plt.figure(figsize=(15,6), dpi=100)
sns.lineplot(data=data, x='date', y='bangkok_aqi')
plt.legend(['Bangkok'])


In [None]:
# plt.figure (lower-case) creates the current pyplot figure that the plot
# below draws on; plt.Figure only builds a detached object, so the requested
# size/dpi were silently ignored.
plt.figure(figsize=(15,6), dpi=100)
# One line per year over the calendar months
sns.lineplot(data=data, x='Month', y='bangkok_aqi', hue='Year', palette='muted', errorbar=None)
plt.title('Bangkok')
plt.show()

In [13]:
import holidays
from datetime import date, datetime


In [14]:
def crawling_holiday(country, start_year=2016, end_year=2024):
    """Scrape one holiday table per year from timeanddate.com.

    Parameters
    ----------
    country : str
        Country slug used in the URL, e.g. ``'thailand'``.
    start_year, end_year : int, optional
        Inclusive range of years to fetch. The defaults reproduce the
        previously hard-coded 2016..2024 range.

    Returns
    -------
    list of pandas.DataFrame
        One frame per year, in year order, with columns ``date`` (datetime),
        ``name`` and ``type`` (both lower-cased).

    Notes
    -----
    Performs one HTTP request per year via ``pandas.read_html``; network or
    parsing errors propagate to the caller.
    """
    thai_to_english_months = {
        "มกราคม": "Jan", "กุมภาพันธ์": "Feb", "มีนาคม": "Mar", "เมษายน": "Apr",
        "พฤษภาคม": "May", "มิถุนายน": "Jun", "กรกฎาคม": "Jul", "สิงหาคม": "Aug",
        "กันยายน": "Sep", "ตุลาคม": "Oct", "พฤศจิกายน": "Nov", "ธันวาคม": "Dec"
    }
    df_list = []

    for year in range(start_year, end_year + 1):
        df = pd.read_html(f"https://www.timeanddate.com/holidays/{country}/{year}")[0]

        # The table header has several levels; keep only the top one
        df.columns = df.columns.get_level_values(0)
        # Tolerate the helper column being absent instead of raising
        df = df.drop(columns="Unnamed: 1_level_0", errors="ignore")

        # Drop the last row (presumably a footer — verify against the page)
        # and fully empty rows. Copy so the column assignments below write to
        # an independent frame rather than to a slice.
        df = df.iloc[:-1].dropna(how="all").copy()

        # Replace Thai month names with English abbreviations
        df['Date'] = df['Date'].replace(thai_to_english_months, regex=True)

        # Zero-pad the numeric tokens so the text reads like "01 Jan"
        df['Date'] = df['Date'].apply(
            lambda x: " ".join([part if part.isalpha() else part.zfill(2) for part in x.split()])
        )

        # Prepend the year and parse as "<year> <day> <month abbreviation>"
        df['Date'] = df['Date'].apply(
            lambda x: datetime.strptime(f"{year} {x}", "%Y %d %b")
        )

        df['Name'] = df['Name'].str.lower()
        df['Type'] = df['Type'].str.lower()

        df = df.drop_duplicates(['Date', 'Name', 'Type'])

        if 'Details' in df.columns:
            df = df.drop(columns=['Details'])

        df.columns = ['date', 'name', 'type']
        df_list.append(df.reset_index(drop=True))
    return df_list


In [None]:
holiday_th_l = crawling_holiday('thailand')
holiday_th_l[4]

In [16]:
def plot_holiday(data, col_city, holiday_l):
    """Plot ``col_city`` per year, one panel each, with holidays marked in red.

    ``holiday_l`` is matched to the years by position: its i-th frame is
    merged (on ``date``) with the i-th distinct value of ``data['Year']``.
    Years beyond the end of ``holiday_l`` get no holiday markers. A ``Year``
    column is added to ``data`` if it is missing.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    # Derive the year when the caller has not done so already
    if 'Year' not in data.columns:
        data['Year'] = pd.to_datetime(data['date']).dt.year

    years = data['Year'].unique()
    n_panels = len(years)

    # One stacked panel per year
    fig, axes = plt.subplots(n_panels, 1, figsize=(15, 5*n_panels), dpi=100)
    if n_panels == 1:
        axes = [axes]  # a single Axes is returned bare; wrap it for iteration

    fig.suptitle(col_city)
    fig.tight_layout()
    fig.subplots_adjust(top=0.97)

    for position, (ax, year) in enumerate(zip(axes, years)):
        yearly = data.loc[data['Year'] == year, ['date', col_city]]

        if position < len(holiday_l):
            marked = pd.merge(holiday_l[position], yearly, how='left', on='date')
        else:
            # No holiday table for this position: plot nothing in red
            marked = pd.DataFrame(columns=['date', col_city])

        ax.plot(yearly['date'], yearly[col_city], marker='o')
        ax.plot(marked['date'], marked[col_city], marker='o', linestyle='none', color='red')
        ax.set_title(f"Year: {year}")


In [None]:
plot_holiday(data, 'bangkok_aqi', holiday_th_l)

In [None]:
# Month number (1-12) -> season label for Bangkok
season_bangkok_dict = {1: 'Hot & Cool',
                       2: 'Hot & Cool',
                       3: 'Hot & Cool',
                       4: 'Hot',
                       5: 'Rainy',
                       6: 'Rainy',
                       7: 'Rainy',
                       8: 'Rainy',
                       9: 'Rainy',
                       10: 'Rainy',
                       11: 'Hot & Cool',
                       12: 'Hot & Cool'}
# One row per date in `data`; Year/Month/Day are added so Month can be mapped to a season
season_bangkok = pd.DataFrame(data.date, columns=['date'])
season_bangkok = extract_ymd(season_bangkok)
season_bangkok['season'] = season_bangkok.Month.map(season_bangkok_dict)

season_bangkok.head()

In [19]:
def plot_season(data, col_city, season_city):
    """Plot ``col_city`` per year, one panel each, coloured by season.

    For every distinct value of ``data['Year']`` the matching rows of
    ``season_city`` (which must have ``Year``, ``date`` and ``season``
    columns) are left-merged with the values on ``date`` and drawn as a
    seaborn line plot with ``season`` as hue.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd

    years = data['Year'].unique()
    n_panels = len(years)

    # One stacked panel per year
    fig, axes = plt.subplots(n_panels, 1, figsize=(15, 5*n_panels), dpi=100)
    if n_panels == 1:
        axes = [axes]  # a single Axes is returned bare; wrap it for iteration

    fig.suptitle(col_city)
    fig.tight_layout()
    fig.subplots_adjust(top=0.97)

    for ax, year in zip(axes, years):
        yearly_values = data.loc[data['Year'] == year, ['date', col_city]]
        yearly_seasons = season_city[season_city['Year'] == year]

        # Season rows drive the merge, so every season date appears once
        merged = pd.merge(yearly_seasons, yearly_values, how='left', on='date')

        sns.lineplot(x=merged['date'], y=merged[col_city], hue=merged['season'], marker='o', ax=ax)
        ax.set_title(f"Year: {year}")


In [None]:
plot_season(data, 'bangkok_aqi', season_bangkok)

In [None]:
from statsmodels.tsa.stattools import adfuller

data.head()

In [None]:
# NOTE: rebinds `data` — rows with any missing value are dropped from here on
# (no imputed series is substituted in this cell).
data = data.dropna()
data.head()

In [None]:
import matplotlib.pyplot as plt

# AQI levels plotted against the frame's row index
data['bangkok_aqi'].plot(color = 'green', figsize = (15,4))
plt.title('AQI')

In [None]:
# Augmented Dickey-Fuller test on the AQI levels; element 1 of the result is the p-value
adf_res = adfuller(data['bangkok_aqi'], autolag= 'AIC')
print('p-Values:' + str(adf_res[1]))

In [None]:
# First difference of the AQI series (the first row becomes NaN)
data['bangkok_aqi_diff'] = data['bangkok_aqi'].diff()
data['bangkok_aqi_diff'].plot(color = 'green', figsize = (15,4))
plt.title('AQI (diff)')

# ADF test on the differenced series; the leading NaN is dropped first
adf_res = adfuller(data['bangkok_aqi_diff'].dropna(), autolag = 'AIC')
print('p-Values:' + str(adf_res[1]))

In [None]:
data.head()

In [None]:
# Drops the rows that still contain NaN — after the diff() above that includes
# the first row, whose 'bangkok_aqi_diff' is undefined.
data = data.dropna()
data.head()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import pacf

# Partial autocorrelations of the AQI series. The candidate AR order p is the
# last lag before the PACF first falls inside the approximate 95% band
# +/- 1.96 / sqrt(n).
df_pacf = pacf(data['bangkok_aqi'].dropna(), method ='ols')
pacf_threshold = 1.96 / np.sqrt(len(data['bangkok_aqi']))
for i in range(0, len(df_pacf)):
    # Compare the magnitude: a strongly negative partial autocorrelation is
    # significant too and must not end the search.
    if abs(df_pacf[i]) < pacf_threshold:
        print('p=', i-1)
        break

In [None]:
#Test order
import itertools

# Candidate (p, d, q) orders: p and d fixed at 1, q from 0 to 3
p = range(1, 2)
d = range(1, 2)
q = range(0, 4)
pdq = list(itertools.product(p, d, q))
print(pdq)

import statsmodels.api as sm

aic = []

for param in pdq:
    try:
        model = sm.tsa.arima.ARIMA(data['bangkok_aqi'].dropna(), order = param)
        results = model.fit()
        print('Order = {}'.format(param))
        print('AIC = {}'.format(results.aic))
        a = 'Order: '+str(param) +' AIC: ' + str(results.aic)
        aic.append(a)
    except Exception as err:
        # Keep searching, but report the failure: a bare `except` also hid
        # typos and KeyboardInterrupt, making a missing order look untested.
        print('Order = {} failed: {}'.format(param, err))
        continue

In [None]:
#ARIMA model
model = sm.tsa.arima.ARIMA(data['bangkok_aqi'], order = (1, 1, 3))
results = model.fit()
print(results.summary())

#Prediction
# results.predict() returns one-step-ahead predictions of the AQI *level*.
# Subtract the previous observed level to get the predicted change, so it is
# comparable with the actual differenced series plotted next to it.
predicted_diff = results.predict() - data['bangkok_aqi'].shift(1)

# `plt.figsize = ...` only set an unused attribute on the pyplot module;
# plt.figure(figsize=...) is what actually sizes the figure.
plt.figure(figsize=(25, 10))
plt.plot(data['bangkok_aqi_diff'], color = 'green', label = 'Actual diff')
plt.plot(predicted_diff, color= 'orange', label = 'Predicted diff')
plt.legend()

In [None]:
# results.predict() is a Series named 'predicted_mean'. Passing it to
# pd.DataFrame(..., columns=['Predicted']) selects a column that does not
# exist and yields all-NaN, so rename the Series instead.
prediction = results.predict().rename('Predicted').to_frame()
df_pred = pd.merge(data, prediction, how = 'left', left_index = True, right_index = True)

# The predictions are already AQI levels, so nothing is added back to the
# observed series. The first one-step-ahead prediction of a differenced model
# has no earlier level to build on (statsmodels typically reports ~0 there),
# so that row falls back to the observed value, as do any unmatched rows.
predicted_close = df_pred['Predicted'].copy()
predicted_close.iloc[0] = df_pred['bangkok_aqi'].iloc[0]
df_pred['Predicted_close'] = predicted_close.fillna(df_pred['bangkok_aqi'])

plt.figure(figsize=(20, 6))
plt.plot(df_pred['bangkok_aqi'], color = 'green', label = 'Actual close')
plt.legend()
plt.figure(figsize=(20, 6))
plt.plot(df_pred['Predicted_close'], color='orange', label = 'Predicted close')
plt.legend()

#Performance
from sklearn.metrics import mean_squared_error

mean_squared_error(df_pred['bangkok_aqi'], df_pred['Predicted_close'])

In [None]:
data.head()

In [None]:
# NOTE: rebinds `data` with 'date' as its index, so this cell cannot be re-run
# as-is — a second execution raises KeyError because the 'date' column is gone.
data = data.set_index('date')
# Single-column frame holding only the AQI series
df = data['bangkok_aqi'].to_frame()
df.tail(5)

In [None]:
# 7-row trailing mean of the AQI; the first six rows have no full window and
# are NaN, so they are dropped (in place) on the next line.
df['SMA7'] = df['bangkok_aqi'].rolling(7).mean()
df.dropna(inplace=True)
df.head()

In [None]:
# Overlay the raw AQI and its 7-row moving average
df[['bangkok_aqi', 'SMA7']].plot(label='bangkok_aqi', 
                                  figsize=(20, 6))

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# RMSE between each AQI value and the trailing 7-row mean that includes it
# (an in-sample smoothing error, not a forecast error).
rms = sqrt(mean_squared_error(df.bangkok_aqi, df.SMA7))
print(rms)

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
# Simple exponential smoothing of the 7-row moving average with a fixed alpha of 0.6
fit2 = SimpleExpSmoothing(np.asarray(df['SMA7'])).fit(smoothing_level=0.6,optimized=False)
# fittedvalues is the in-sample smoothed series, one value per row of df.
# forecast(len(df)) returned *future* values (a flat line for simple
# exponential smoothing) that were then attached to past dates.
df['SES'] = fit2.fittedvalues
plt.figure(figsize=(20,8))
plt.plot(df['SMA7'], label='SMA7')
plt.plot(df['bangkok_aqi'], label='bangkok_aqi')
plt.plot(df['SES'], label='SES')
plt.legend(loc='best')
plt.show()

In [None]:
import numpy as np

# Filter data for the year 2025
data_2025 = data[data['Year'] == 2025].reset_index()

# Plot the data for the year 2025
plt.figure(figsize=(15, 6))
sns.lineplot(data=data_2025, x='date', y='bangkok_aqi')
plt.title('Bangkok AQI for the Year 2025')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.show()

# Forecast the next 30 days, dated from the day after the last 2025 observation
forecast = results.get_forecast(steps=30)
forecast_dates = pd.date_range(start=data_2025['date'].max() + pd.Timedelta(days=1), periods=30)

# Forecast distribution: mean and standard error for each step
forecast_values = forecast.predicted_mean  # Mean values
forecast_std_errors = forecast.se_mean     # Standard errors

# Draw one sample per step from N(mean, se). A seeded generator keeps the
# figure identical across kernel restarts.
rng = np.random.default_rng(42)
exact_forecast_values = rng.normal(loc=forecast_values, scale=forecast_std_errors)

# 95% confidence intervals
conf_int = forecast.conf_int(alpha=0.05)

# Create a DataFrame for forecasted values. Plain arrays are used for the
# bounds so every column is matched by position, not by the forecast's index.
forecast_df = pd.DataFrame({
    'date': forecast_dates,
    'forecasted_aqi': exact_forecast_values,  # Sampled values
    'lower_bound': conf_int.iloc[:, 0].to_numpy(),
    'upper_bound': conf_int.iloc[:, 1].to_numpy()
})

# Plot the actual and forecasted AQI
plt.figure(figsize=(15, 6))
sns.lineplot(data=data_2025, x='date', y='bangkok_aqi', label='Actual AQI')
sns.lineplot(data=forecast_df, x='date', y='forecasted_aqi', label='Sampled Forecasted AQI', color='red')
plt.fill_between(forecast_df['date'], forecast_df['lower_bound'], forecast_df['upper_bound'], color='red', alpha=0.2, label='Confidence Interval')
plt.title('Bangkok AQI for the Year 2025 with Forecast')
plt.xlabel('Date')
plt.ylabel('AQI')
plt.legend()
plt.show()


In [None]:
data.describe()