In [1]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_squared_error
import aqi
# Suppresses every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings that later cells
# may raise; consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')


In [2]:
# Location of the raw air-quality CSV. The AQI_DATA_PATH environment variable
# overrides the machine-specific default below, so the notebook can be run on
# another machine without editing this cell.
data_path = os.environ.get(
    "AQI_DATA_PATH",
    r"D:\AQI_forecasting\backend\data\bangkok-air-quality.csv",
)
df = pd.read_csv(data_path)
df

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co
0,2025/1/1,116,50,15,2,,
1,2025/1/2,119,56,19,2,,
2,2025/1/3,120,58,24,1,,
3,2025/1/4,114,62,22,2,,
4,2025/1/5,119,68,22,2,,
...,...,...,...,...,...,...,...
4021,2015/3/15,,,23,13,3,9
4022,2014/11/30,,,40,36,4,11
4023,2013/12/31,,,68,33,3,7
4024,2014/1/26,,,54,53,6,18


In [3]:
def remove_whitespace_cols(df):
    """Strip leading/trailing whitespace from the column labels of ``df``.

    The frame is modified in place and is also returned, so the call can be
    used either as a statement or as an expression.

    Only string labels are stripped. Any other label (for example an integer
    column position) is left unchanged; the ``.str`` accessor would instead
    turn such labels into NaN or raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalised.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with cleaned column labels.
    """
    df.columns = df.columns.map(
        lambda label: label.strip() if isinstance(label, str) else label
    )
    return df

def cleaning_data(df):
    """Return a cleaned, date-sorted copy of the raw readings frame.

    Steps, in order:
      1. strip whitespace from the column labels (via ``remove_whitespace_cols``);
      2. parse the ``date`` column to datetimes, unparseable values become NaT;
      3. convert every other column to numeric, unparseable values become NaN;
      4. sort by ``date`` and renumber the index from 0.

    The input frame is not modified; all work happens on a copy so the raw
    frame stays available for inspection and the cell can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain a ``date`` column once labels are stripped.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``date`` with a fresh 0..n-1 index. Rows whose date
        could not be parsed are kept (as NaT) and sort last.

    Raises
    ------
    ValueError
        If there is no ``date`` column, or if the numeric conversion / sort
        raises a ``ValueError`` (re-raised as 'Invalid data in columns').
    """
    # Copy first: the helper renames columns in place and the assignments below
    # would otherwise overwrite the caller's data.
    df = remove_whitespace_cols(df.copy())
    if 'date' not in df.columns:
        raise ValueError('date column not found')

    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    try:
        for col in df.columns:
            if col != 'date':
                df[col] = pd.to_numeric(df[col], errors='coerce')
        df = df.sort_values('date').reset_index(drop=True)
    except ValueError as exc:
        # Keep the original error as the cause so the failing column is traceable.
        raise ValueError('Invalid data in columns') from exc
    return df

# Build the cleaned frame: parsed dates, numeric pollutant columns, sorted by date.
df_cleaned = cleaning_data(df)

df_cleaned

Unnamed: 0,date,pm25,pm10,o3,no2,so2,co
0,2013-12-31,,,68.0,33.0,3.0,7.0
1,2014-01-01,,84.0,90.0,48.0,5.0,16.0
2,2014-01-02,,84.0,119.0,63.0,7.0,14.0
3,2014-01-03,,103.0,95.0,57.0,7.0,13.0
4,2014-01-04,,101.0,52.0,34.0,1.0,
...,...,...,...,...,...,...,...
4021,2025-01-11,84.0,57.0,18.0,3.0,,
4022,2025-01-12,101.0,66.0,20.0,2.0,,
4023,2025-01-13,129.0,73.0,21.0,2.0,,
4024,2025-01-14,150.0,74.0,19.0,2.0,,


In [4]:
def calc_aqi(row: pd.Series):
    """Compute an overall AQI value for one row of pollutant readings.

    Each known pollutant column that is present and not null is converted on
    its own with ``aqi.to_aqi``; the largest converted value is returned.

    Pollutant columns that are absent from the row are skipped, so a frame
    without, say, a ``co`` column still works. A row containing none of the
    known pollutant columns is treated as a caller error rather than silently
    yielding NaN.

    NOTE(review): the readings are passed to the ``aqi`` package unchanged.
    Confirm the CSV values are in the units the package expects for each
    pollutant constant used below.

    Parameters
    ----------
    row : pandas.Series
        One record, indexed by column name (``pm25``, ``pm10``, ``o3``,
        ``no2``, ``so2``, ``co``; other labels are ignored).

    Returns
    -------
    The maximum of the values returned by ``aqi.to_aqi``, or ``numpy.nan``
    when no pollutant could be converted.

    Raises
    ------
    KeyError
        If the row has none of the known pollutant columns.
    """
    pollutants = {
        'pm25': aqi.POLLUTANT_PM25,
        'pm10': aqi.POLLUTANT_PM10,
        'o3': aqi.POLLUTANT_O3_8H,
        'no2': aqi.POLLUTANT_NO2_1H,
        'so2': aqi.POLLUTANT_SO2_1H,
        'co': aqi.POLLUTANT_CO_8H
    }
    sub_indices = []
    columns_seen = 0
    for column, pollutant_code in pollutants.items():
        if column not in row:
            continue
        columns_seen += 1
        reading = row[column]
        # pd.isna covers None, NaN and pd.NA without raising on non-float values.
        if pd.isna(reading):
            continue
        try:
            sub_indices.append(aqi.to_aqi([(pollutant_code, reading)]))
        except TypeError:
            # Best-effort, as before: a reading the package cannot convert is
            # skipped. TODO confirm which inputs make aqi.to_aqi raise TypeError.
            continue
    if columns_seen == 0:
        raise KeyError(
            f"row has none of the pollutant columns: {list(pollutants)}"
        )
    if not sub_indices:
        return np.nan
    return max(sub_indices)

# Apply calc_aqi to every row (axis=1) and store the result in a new 'aqi' column.
df_cleaned['aqi'] = df_cleaned.apply(calc_aqi, axis=1)
# Count of rows for which calc_aqi produced no value (null 'aqi').
print(df_cleaned['aqi'].isnull().sum(),"row")
print(df_cleaned.columns)
df_cleaned

0 row
Index(['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi'], dtype='object')


Unnamed: 0,date,pm25,pm10,o3,no2,so2,co,aqi
0,2013-12-31,,,68.0,33.0,3.0,7.0,76
1,2014-01-01,,84.0,90.0,48.0,5.0,16.0,204
2,2014-01-02,,84.0,119.0,63.0,7.0,14.0,176
3,2014-01-03,,103.0,95.0,57.0,7.0,13.0,159
4,2014-01-04,,101.0,52.0,34.0,1.0,,74
...,...,...,...,...,...,...,...,...
4021,2025-01-11,84.0,57.0,18.0,3.0,,,166
4022,2025-01-12,101.0,66.0,20.0,2.0,,,174
4023,2025-01-13,129.0,73.0,21.0,2.0,,,189
4024,2025-01-14,150.0,74.0,19.0,2.0,,,200


In [5]:
def preprocess_data(df):
    """Reduce a frame to a gap-filled ``date`` / ``aqi`` series.

    Keeps only the ``date`` and ``aqi`` columns, coerces ``aqi`` to numeric
    (unparseable values become NaN) and fills missing ``aqi`` values by
    time-weighted interpolation over ``date``.

    Rows whose ``date`` is missing (NaT/None) are dropped before interpolating,
    because time interpolation cannot run with nulls in the index. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``date`` (datetime values) and ``aqi``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``['date', 'aqi']`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` is None ('Dataframe is None') or lacks either required
        column ('Columns not found').
    """
    if df is None:
        raise ValueError('Dataframe is None')
    if 'date' not in df.columns or 'aqi' not in df.columns:
        raise ValueError('Columns not found')

    aqi_df = df[['date', 'aqi']].copy()
    aqi_df['aqi'] = pd.to_numeric(aqi_df['aqi'], errors='coerce')
    # A null date cannot be placed on the time axis; interpolate(method='time')
    # would fail on it, so such rows are removed up front.
    aqi_df = aqi_df.dropna(subset=['date']).set_index('date')
    aqi_df['aqi'] = aqi_df['aqi'].interpolate(method='time')
    return aqi_df.reset_index(drop=False)

# NOTE(review): this rebinds df_cleaned to the two-column ['date', 'aqi'] frame, so
# the pollutant columns are no longer reachable under this name and the AQI cell
# cannot be re-run after this one without re-running the cleaning cell first.
df_cleaned = preprocess_data(df_cleaned)
print(df_cleaned.columns)
df_cleaned

Index(['date', 'aqi'], dtype='object')


Unnamed: 0,date,aqi
0,2013-12-31,76.0
1,2014-01-01,204.0
2,2014-01-02,176.0
3,2014-01-03,159.0
4,2014-01-04,74.0
...,...,...
4021,2025-01-11,166.0
4022,2025-01-12,174.0
4023,2025-01-13,189.0
4024,2025-01-14,200.0


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

ModuleNotFoundError: No module named 'tensorflow'

In [7]:
from pmdarima import auto_arima