<a href="https://colab.research.google.com/github/nussarafirn/aarhus-aqi/blob/main/arhus_aqi_classifiction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A Deep Learning model for air quality classification and prediction. The model uses the OneVsOneClassifier method.

In [None]:
import os
import json
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import altair as alt

In [None]:
# Directory that holds the dataset_x.json / dataset_y.json files loaded below.
data_dir = 'dataset/larger'

In [None]:
# Load the feature matrix (one row per sample, one column per feature).
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's default text encoding.
with open(os.path.join(data_dir, 'dataset_x.json'), 'r', encoding='utf-8') as f:
    X = np.array(json.load(f))
X.shape

(12185, 8)

In [None]:
# Load the labels and turn every row into a class index: the position of the
# first 1 in that row (rows are presumably one-hot vectors -- confirm against
# the dataset export).
with open(os.path.join(data_dir, 'dataset_y.json'), 'r', encoding='utf-8') as f:
    _y = [row.index(1) for row in json.load(f)]
    # dtype=int keeps the array integer-typed even when the file is empty.
    y = np.array(_y, dtype=int)
    # y = np.array([i[0] * 24 >= 75 for i in json.load(f)])

# Number of samples per class index 0..5. Converted to a plain list of Python
# ints so it prints (and serialises into the chart spec) like a normal list.
count = np.bincount(y, minlength=6).tolist()
print(count)
print(y)

[0, 492, 283, 788, 1139, 9483]
[5 5 5 ... 5 5 5]


In [None]:
# Human-readable air-quality labels. Position i is the label for class index i:
# dist_chart pairs category[i] with the i-th entry of the counts it is given.
category = [
    "good",
    "fair",
    "moderate",
    "poor",
    "very poor",
    "extremely poor",
]

In [None]:
def dist_chart(data):
    """Build a horizontal bar chart of the number of data points per category.

    Args:
        data: Sequence of counts; the i-th count is labelled ``category[i]``.

    Returns:
        The Altair chart object built from a layered (bars + text labels)
        Vega-Lite specification.
    """
    rows = []
    for idx, num in enumerate(data):
        rows.append({"num_points": num, "category": category[idx]})

    x_channel = {"field": "num_points", "type": "quantitative", "title": "Number of data points"}
    # "sort": None keeps the categories in the order they appear in the data.
    y_channel = {"field": "category", "type": "nominal", "sort": None, "title": "Category"}

    bar_layer = {"mark": "bar"}
    # Text layer prints each count next to the end of its bar.
    label_layer = {
        "mark": {"type": "text", "dx": 2, "align": "left"},
        "encoding": {
            "text": {"field": "num_points", "type": "quantitative"}
        },
    }

    spec = {
        "width": 600,
        "height": 300,
        "title": "Distribution of the data points for each air quality category",
        "data": {"values": rows},
        "encoding": {"x": x_channel, "y": y_channel},
        "layer": [bar_layer, label_layer],
    }
    return alt.Chart.from_dict(spec)

### Distribution of the air quality from Jan 1, 2021 until June 16, 2022. 

We can see here that the data is very skewed. Out of a total of 12,185 hours, Århus has unfortunately experienced 9,483 hours of extremely poor air quality.

Further below is the distribution of the data after randomly dropping some data points to reduce the skewness.

In [None]:
# Plot the class distribution of the full dataset.
dist_chart(count)

In [None]:
import random

# Randomly downsample the over-represented classes to reduce the skew.
# Maps class index -> probability of keeping a sample of that class;
# classes that are not listed are kept in full.
keep_probability = {
    5: 0.3 / 5.0,
    4: 1 / 2,
    3: 3 / 5,
}

# A dedicated, seeded generator makes the subsample reproducible from run to
# run, in line with the fixed random_state used for the train/test split and
# the classifier. It also leaves the global `random` state untouched.
rng = random.Random(0)

to_keep = []
for i, label in enumerate(y):
    p = keep_probability.get(int(label), 1.0)
    # Only draw a random number for classes that are actually downsampled.
    if p >= 1.0 or rng.random() < p:
        to_keep.append(i)

# Samples per class index 0..5 after downsampling, as a plain list of ints.
count2 = np.bincount(y[to_keep], minlength=6).tolist()
print(count2)

[0, 492, 283, 478, 574, 583]


In [None]:
# Plot the class distribution after downsampling.
dist_chart(count2)

In [None]:
# y = X[:, 14]

In [None]:
# import random
# X = np.array([[random.random(), random.random(), random.random(), random.random(), random.random(), random.random(), random.random(), i % 10] for i in range(1000)])
# y = np.array([i % 10 for i in range(1000)])

In [None]:
# columns = [5, 7, 9, 10, 14, 15, 16, 17]

In [None]:
# Split the downsampled rows into training and test sets; random_state fixes
# the shuffle so the split is the same on every run.
X_kept = X[to_keep, :]
y_kept = y[to_keep]
X_train, X_test, y_train, y_test = train_test_split(X_kept, y_kept, random_state=0)

In [None]:
# (number of training samples, number of features)
X_train.shape

(1807, 8)

In [None]:
# (number of test samples, number of features)
X_test.shape

(603, 8)

### Deep Learning Model
Using the OneVsOneClassifier method, the model achieves an accuracy of about 45.27%. Though this is better than random prediction (16.67%), we need more relevant data to provide more accurate predictions to forecast the air pollution.

In [None]:
# Base estimator: a multi-layer perceptron with three hidden layers of 100 units each.
model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    random_state=1,  # fixed seed for the estimator
    max_iter=500,  # upper bound on training iterations
)
# Alternative base estimator (disabled):
# model = LinearSVC(random_state=0)

In [None]:
# Wrap the base estimator in a one-vs-one multiclass scheme and fit it on the training split.
clf = OneVsOneClassifier(model).fit(X_train, y_train)

In [None]:
# Number of test samples whose predicted class equals the true class.
test_predictions = clf.predict(X_test)
np.sum(test_predictions == y_test)

273

In [None]:
# Number of samples in the test split.
y_test.shape

(603,)

In [None]:
# Sum of the integer class labels in the test split.
sum(y_test)

1850

In [None]:
# Fraction of test samples that are classified correctly.
accuracy = clf.score(X_test, y_test)

In [None]:
# Display the test accuracy.
accuracy

0.4527363184079602

In [None]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix.
cm

array([[59, 28, 21, 18, 19],
       [ 8, 29, 15, 10,  3],
       [ 6, 13, 59, 36, 10],
       [ 5,  5, 28, 74, 30],
       [ 9,  9, 17, 40, 52]])

In [None]:
import pickle

In [None]:
# Persist the fitted classifier to disk so it can be reloaded later.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open("model.pickle", "wb") as model_file:
    pickle.dump(clf, model_file)