## Setup

In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt 
import matplotlib
import os

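Fetch the 2023 archives for the three monitoring locations from the public OpenAQ S3 bucket. The commands below download into `./data`, while this notebook reads from `path_data = '../data'`, so run them from the parent of the notebook's working directory: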
```
aws s3 cp --no-sign-request --recursive s3://openaq-data-archive/records/csv.gz/locationid=1214510/year=2023/ ./data/2023/location-1214510
aws s3 cp --no-sign-request --recursive s3://openaq-data-archive/records/csv.gz/locationid=1214721/year=2023/ ./data/2023/location-1214721
aws s3 cp --no-sign-request --recursive s3://openaq-data-archive/records/csv.gz/locationid=1214722/year=2023/ ./data/2023/location-1214722
```

## Config

In [None]:
# Year and OpenAQ location ids of the archives to ingest.
year = 2023
locations = ['1214510', '1214721', '1214722']

# Working directories: raw archives and student files, the portal's test
# reference, and the instructor-only outputs.
path_data, path_portal, path_is = '../data', '../portal', '../instructor_solution'

# Cache of the concatenated raw measurements (read back on later runs).
dataset_path_orig = f'{path_is}/dataset_orig.csv'

In [None]:
# Make sure every directory the notebook reads from or writes to exists;
# exist_ok=True keeps the cell safe to re-run.
for directory in (path_data, path_portal, path_is):
    os.makedirs(directory, exist_ok=True)

## Data import

In [None]:
# Load the combined raw measurements into `combined_df`.
# A previously cached CSV is preferred; otherwise every daily `.csv.gz`
# archive of every configured location is parsed and concatenated.
if os.path.isfile(dataset_path_orig):
    combined_df = pd.read_csv(dataset_path_orig)
    print(f'imported dataframe from {dataset_path_orig}')

else:
    dataframes = []
    failed_files = []
    for loc in locations:
        root_folder = f"{path_data}/{year}/location-{loc}"
        for subdir, dirnames, files in os.walk(root_folder):
            # os.walk yields entries in arbitrary filesystem order; sorting
            # (dirnames in place, files per directory) makes the row order of
            # the combined frame reproducible across machines.
            dirnames.sort()
            for file_name in sorted(files):
                if not file_name.endswith(".csv.gz"):
                    continue
                file_path = os.path.join(subdir, file_name)
                try:
                    # Named `daily_df` to avoid clobbering the notebook-level `df`.
                    daily_df = pd.read_csv(file_path)
                except Exception as e:
                    # Best effort: report the unreadable archive and keep going.
                    print(f"Error reading {file_path}: {e}")
                    failed_files.append(file_path)
                else:
                    dataframes.append(daily_df)

    if not dataframes:
        # Fail here rather than with a NameError on `combined_df` in a later
        # cell (or, worse, silently reusing a stale kernel variable).
        raise FileNotFoundError(
            f"No dataframes to combine: no readable .csv.gz archive found for "
            f"locations {locations} under {path_data}/{year}"
        )

    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"{len(dataframes)} days imported into dataframe shape:{combined_df.shape}")

    if failed_files:
        # Do not persist an incomplete dataset: the cache would otherwise hide
        # the missing days on every subsequent run.
        print(f'{len(failed_files)} file(s) could not be read; cache not written to {dataset_path_orig}')
    else:
        combined_df.to_csv(dataset_path_orig, index=False)
        print(f'exported dataframe to {dataset_path_orig}')

In [None]:
# Replace the raw location ids with anonymous labels (site_1, site_2, ...),
# numbered in order of first appearance in the data.
combined_df = combined_df.rename(columns={'location_id': 'site_id'})
lookup = {
    location_id: f'site_{position}'
    for position, location_id in enumerate(combined_df['site_id'].unique(), start=1)
}
combined_df['site_id'] = combined_df['site_id'].map(lookup)

# Index by (datetime, site_id); timestamps are made timezone-naive first.
naive_datetimes = pd.to_datetime(combined_df['datetime']).dt.tz_localize(None)
combined_df['datetime'] = naive_datetimes
combined_df = combined_df.set_index(['datetime', 'site_id']).sort_index()

assert len(combined_df) > 0

combined_df.head()

## Sanity check for multi-index

Ensure the index covers every hourly timestamp for every site, so that missing observations appear as explicit `NaN` rows.

In [None]:
# Build a complete (hourly timestamp x site) grid so that missing observations
# show up as explicit NaN rows instead of absent index entries.
hourly_times = combined_df.index.levels[0].to_series().asfreq('h').index
location_ids = combined_df.index.levels[1]
hourly_index = pd.MultiIndex.from_product(
    [hourly_times, location_ids], names=['datetime', 'site_id']
)
cols = ['pm25', 'nox']

# Allocate the frame as float: without an explicit dtype an empty frame gets
# object columns, which pushes every later aggregation onto the slow object
# path and forces pd.to_numeric conversions downstream.
df = pd.DataFrame(index=hourly_index, columns=cols, dtype=float)
for col in cols:
    my_df = combined_df[combined_df['parameter'] == col]
    if my_df.index.has_duplicates:
        # One value per (datetime, site_id) is required to fill the grid
        # unambiguously; report it clearly instead of failing inside .loc.
        raise ValueError(
            f"duplicate (datetime, site_id) entries for parameter '{col}'"
        )
    # Assignment aligns on the (datetime, site_id) index.
    df.loc[my_df.index, col] = my_df['value']

print(len(df))
df.head()

## Generate `nox` forecast

In [None]:
# Cross-site mean NOx per timestamp, renamed to 'exog' and coerced to numeric.
df_spatial_nox = pd.to_numeric(
    df.groupby('datetime').mean()['nox'].rename('exog'), errors='coerce'
)
# Keep the untouched values of the plotting window before any smoothing.
df_spatial_nox_last = df_spatial_nox.loc['2023-12-15':'2023-12-24 23:00:00']

# Fill missing values by linear interpolation.
df_spatial_nox = df_spatial_nox.interpolate(method='linear')

# Brownian (cumulative Gaussian) noise, rescaled so its peak magnitude is
# 2.5% of the series range, then centred on zero.
np.random.seed(42)
random_walk = np.cumsum(np.random.normal(loc=0, scale=2, size=len(df_spatial_nox)))
scale_factor = 0.025 * (df_spatial_nox.max() - df_spatial_nox.min())
brownian_noise = scale_factor * random_walk / np.max(np.abs(random_walk))
brownian_noise = brownian_noise - brownian_noise.mean()

df_spatial_nox = df_spatial_nox + brownian_noise

# Smooth with an exponentially weighted mean run in both time directions and
# average the two passes. The addition aligns on the datetime index, so the
# reversed order of the backward pass does not matter.
alpha = 0.3
df_spatial_nox_forward = df_spatial_nox.ewm(alpha=alpha).mean()
df_spatial_nox_backward = df_spatial_nox.iloc[::-1].ewm(alpha=alpha).mean()

df_forecast_nox = df_spatial_nox_forward.add(df_spatial_nox_backward).div(2.)
df_forecast_nox_last = df_forecast_nox.loc['2023-12-15':'2023-12-24 23:00:00']

# Compare the synthetic forecast with the original values on the same axes.
fig, ax = plt.subplots(figsize=(12, 4))
df_forecast_nox_last.plot(ax=ax, label="Forecast")
df_spatial_nox_last.plot(ax=ax, label="Original")
ax.set_title("NOx Forecast")
ax.legend()
ax.set_xlabel("Datetime")
ax.set_ylabel("NOx")
plt.show()

In [None]:
# Inspect the smoothed NOx forecast for 2023-12-15 through 2023-12-24 23:00.
df_forecast_nox_last

In [None]:
# Cross-site mean pm25 per timestamp, coerced to numeric and restricted to
# the window 2023-12-15 00:00 through 2023-12-24 23:00.
pm25_site_mean = df.groupby('datetime').mean()['pm25']
df_ref_pm25 = pd.to_numeric(pm25_site_mean, errors='coerce').loc[
    '2023-12-15':'2023-12-24 23:00:00'
]

df_ref_pm25

# Export dataset

## Reference data

In [None]:
# (frame, destination, label used in the log line, to_csv options)
reference_exports = [
    # Full hourly grid, written with its (datetime, site_id) index.
    (df, f'{path_is}/dataset_ref.csv', 'dataset reference', dict(index=True)),
    # pm25 reference values only: no index, no header, one decimal place.
    (
        df_ref_pm25,
        f'{path_portal}/data',
        'test reference',
        dict(index=False, header=False, float_format='%.1f'),
    ),
]

for frame, filepath, label, csv_options in reference_exports:
    frame.to_csv(filepath, **csv_options)
    print(f'{label} exported to {filepath}')

## Student data

In [None]:
idx = pd.IndexSlice

# Both exports start on 2023-08-01; the training data stops at
# 2023-12-14 23:00, while the NOx forecast runs on to 2023-12-24 23:00.
train_window = idx['2023-08-01':'2023-12-14 23:00:00']
forecast_window = idx['2023-08-01':'2023-12-24 23:00:00']

df_export = df.loc[train_window].sort_index()
df_export_forecast = df_forecast_nox.loc[forecast_window].sort_index()

In [None]:
# (frame, destination, label used in the log line, extra to_csv options)
student_exports = [
    (df_export, f'{path_data}/train.csv', 'dataset', {}),
    (df_export_forecast, f'{path_data}/nox_forecast.csv', 'nox forecast', {'float_format': '%.3f'}),
]
for frame, filepath, label, csv_options in student_exports:
    frame.to_csv(filepath, index=True, **csv_options)
    print(f'{label} exported to {filepath}')

# Sample submission: absolute values of Gaussian draws that share the mean and
# standard deviation of the pm25 reference, one per reference timestamp.
np.random.seed(42)
pm25_mean, pm25_std = df_ref_pm25.mean(), df_ref_pm25.std()
noise = np.random.normal(loc=pm25_mean, scale=pm25_std, size=len(df_ref_pm25))
df_sample = pd.DataFrame(np.abs(noise), index=df_ref_pm25.index)
filepath = f'{path_data}/sample_submission.csv'
df_sample.to_csv(filepath, index=False, header=False, float_format='%.1f')
print(f'sample submission to {filepath}')

In [None]:
# Inspect the training export (rows from 2023-08-01 through 2023-12-14 23:00).
df_export

In [None]:
# Inspect the exported NOx forecast (2023-08-01 through 2023-12-24 23:00).
df_export_forecast