<a href="https://colab.research.google.com/github/KamilBartosik/RNN_AirPolutionPrediction/blob/main/RNN_AirPolutionData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [3]:
# Absolute path of the CSV under the mounted Google Drive (Colab-specific).
dataset_path = '/content/gdrive/MyDrive/MachineLearning/Datasets/Air_Pollution/Dataset.csv'
# Load the raw table; no dtypes are forced here, so columns keep whatever pandas infers.
df = pd.read_csv(dataset_path)

# Data preprocessing

In [4]:
# Preview the first rows to see the columns and the raw value formats.
df.head()

Unnamed: 0,Date,Temperature,NOx,Wind Direction,Wind Speed,PM2.5
0,1/1/19 0:00,17.2,16.2,18,2.0,17
1,1/1/19 1:00,17.2,17.0,357,2.2,20
2,1/1/19 2:00,17.0,14.6,16,2.3,14
3,1/1/19 3:00,16.8,12.8,6,2.7,15
4,1/1/19 4:00,16.7,16.3,14,2.2,10


In [5]:
# (rows, columns) of the loaded table.
df.shape

(1416, 6)

In [6]:
# Inspect dtypes and non-null counts; `object` columns hold values pandas could not read as numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416 entries, 0 to 1415
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1416 non-null   object 
 1   Temperature     1416 non-null   float64
 2   NOx             1416 non-null   object 
 3   Wind Direction  1416 non-null   object 
 4   Wind Speed      1416 non-null   object 
 5   PM2.5           1416 non-null   object 
dtypes: float64(1), object(5)
memory usage: 66.5+ KB


In [7]:
# Look at the raw Date strings for rows 22-25 (inclusive label slice).
df.loc[22:25, 'Date']

22    1/1/19 22:00
23    1/1/19 23:00
24     1/2/19 0:00
25     1/2/19 1:00
Name: Date, dtype: object

In [8]:
# Raw timestamps look like '1/1/19 22:00' followed by '1/2/19 0:00', i.e.
# month/day/two-digit-year hour:minute. Passing the format explicitly keeps
# the parse deterministic instead of relying on day/month inference.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y %H:%M')

In [9]:
# Show rows 22-25 of the Date column again, now that it has been converted with pd.to_datetime.
df.loc[22:25, 'Date']

22   2019-01-01 22:00:00
23   2019-01-01 23:00:00
24   2019-01-02 00:00:00
25   2019-01-02 01:00:00
Name: Date, dtype: datetime64[ns]

In [10]:
# Row labels whose value could not be parsed as a float, one list per column.
wrong_NOx = []
wrong_WD = []
wrong_WS = []
wrong_PM = []

def check_wrong_values(column, values, frame=None):
  """Convert parseable entries of `column` to float and record the rest.

  Every cell of `column` is passed through `float()`. Cells that convert are
  overwritten in place with the float; the row labels of cells that do not
  convert are appended to `values` (the cell itself is left untouched).

  Args:
    column: Name of the column to scan.
    values: List that receives the row labels of unparseable cells. A label
      already present is not appended again, so re-running the call does not
      create duplicates.
    frame: DataFrame to scan and modify. Defaults to the notebook-level `df`.
  """
  frame = df if frame is None else frame

  # Iterate over the actual row labels so `.loc` stays valid for any index.
  for i in frame.index:
    try:
      frame.loc[i, column] = float(frame.loc[i, column])
    except (TypeError, ValueError):
      # Only conversion failures mark a cell as wrong; other errors propagate.
      if i not in values:
        values.append(i)

In [11]:
def display_examples(column, values, ex_11, ex_12, ex_21, ex_22):
  """Print the flagged entries of `column` and two windows of surrounding rows.

  Args:
    column: Column of the notebook-level `df` to print from.
    values: Row labels of the flagged entries.
    ex_11, ex_12: First and last row label (inclusive) of the first window.
    ex_21, ex_22: First and last row label (inclusive) of the second window.
  """
  # Heading text paired with the label slice it introduces.
  neighbourhoods = (
    ('\nHow value(s) of 1st example look among neighbours:\n', slice(ex_11, ex_12)),
    ('\nHow value(s) of 2nd example look among neighbours:\n', slice(ex_21, ex_22)),
  )

  print('Wrong values:')
  print(df.loc[values, column])

  for heading, window in neighbourhoods:
    print(heading)
    print(df.loc[window, column])

In [12]:
# Convert parseable NOx values to float in place; collect the row labels of the rest in wrong_NOx.
check_wrong_values('NOx', wrong_NOx)

In [13]:
# Print the flagged NOx entries and rows 157-163 and 1258-1267 for context.
display_examples('NOx', wrong_NOx, 157, 163, 1258, 1267)

Wrong values:
159     13.1#                          
161     23#                            
179     18.5#                          
272     49.9#                          
325     10.9#                          
491     26.2#                          
673     38.6#                          
951     11.2#                          
1261    9.1#                           
1263    8.5#                           
1264    12.4#                          
1335    50.1#                          
Name: NOx, dtype: object

How value(s) of 1st example look among neighbours:

157                               12.8
158                               13.0
159    13.1#                          
160                               16.1
161    23#                            
162                               21.6
163                               15.0
Name: NOx, dtype: object

How value(s) of 2nd example look among neighbours:

1258                               15.1
1259                               14

In [14]:
# Convert parseable Wind Direction values to float in place; collect the rest in wrong_WD.
check_wrong_values('Wind Direction', wrong_WD)

In [15]:
# Print the flagged Wind Direction entries and rows 417-423 and 1089-1095 for context.
display_examples('Wind Direction', wrong_WD, 417, 423, 1089, 1095)

Wrong values:
420     295#                           
1092    0#                             
Name: Wind Direction, dtype: object

How value(s) of 1st example look among neighbours:

417                              268.0
418                              344.0
419                              312.0
420    295#                           
421                              299.0
422                              301.0
423                              302.0
Name: Wind Direction, dtype: object

How value(s) of 2nd example look among neighbours:

1089                               17.0
1090                              338.0
1091                               31.0
1092    0#                             
1093                              284.0
1094                              221.0
1095                              256.0
Name: Wind Direction, dtype: object


In [16]:
# Convert parseable Wind Speed values to float in place; collect the rest in wrong_WS.
check_wrong_values('Wind Speed', wrong_WS)

In [17]:
# Print the flagged Wind Speed entries and rows 417-423 and 1089-1095 for context.
display_examples('Wind Speed', wrong_WS, 417, 423, 1089, 1095)

Wrong values:
420     1.6#                           
1092    0#                             
Name: Wind Speed, dtype: object

How value(s) of 1st example look among neighbours:

417                                1.4
418                                1.6
419                                2.1
420    1.6#                           
421                                2.5
422                                2.5
423                                2.5
Name: Wind Speed, dtype: object

How value(s) of 2nd example look among neighbours:

1089                                2.1
1090                                2.7
1091                                1.5
1092    0#                             
1093                                1.7
1094                                3.9
1095                                2.4
Name: Wind Speed, dtype: object


In [18]:
# Convert parseable PM2.5 values to float in place; collect the rest in wrong_PM.
check_wrong_values('PM2.5', wrong_PM)

In [19]:
# List every PM2.5 entry that failed float conversion.
print('Wrong values:')
print(df.loc[wrong_PM, 'PM2.5'])

Wrong values:
37      16#                            
38      27#                            
159     793#                           
178     745#                           
323     27#                            
324     30#                            
420     1#                             
491     785#                           
492     33#                            
580     161*                           
581     157x                           
582     155x                           
606     98x                            
734     43#                            
735     27#                            
950     784#                           
1091    33#                            
1092    47#                            
1262    22#                            
1263    753#                           
1264    42#                            
1335    800#                           
1359    25#                            
1360    174#                           
1361    170#              

In [20]:
def replace_hashes(column, values, frame=None):
  """Remove the '#' marker and surrounding padding from the given cells.

  Args:
    column: Name of the column to clean.
    values: Row labels of the cells to clean.
    frame: DataFrame to modify in place. Defaults to the notebook-level `df`.

  Cells that are not strings are left unchanged, so the call is safe to
  repeat after the column has already been converted to a numeric dtype.
  """
  frame = df if frame is None else frame

  for i in values:
    cell = frame.loc[i, column]
    # Numeric cells have no marker to strip and no `.replace` method.
    if isinstance(cell, str):
      frame.loc[i, column] = cell.replace("#", "").strip()

# Strip the '#' marker from the flagged cells of these three columns.
# PM2.5 is not included in these calls.
replace_hashes('NOx', wrong_NOx)
replace_hashes('Wind Direction', wrong_WD)
replace_hashes('Wind Speed', wrong_WS)

In [21]:
# Replace every flagged PM2.5 cell with the truncated mean of the rows directly
# before and after it. If that mean cannot be computed (the next row is still
# a raw string, or there is no next row), reuse the previous row's value.
# NOTE: both branches read row i-1, so a flagged first row is not handled.
for i in wrong_PM:
  try:
    df.loc[i, 'PM2.5'] = int( (df.loc[i-1, 'PM2.5'] + df.loc[i+1, 'PM2.5']) / 2 )
  except (TypeError, KeyError, ValueError):
    # TypeError: number + str; KeyError: missing neighbour; ValueError: int() of NaN.
    df.loc[i, 'PM2.5'] = int(df.loc[i-1, 'PM2.5'])

In [22]:
# Print the current PM2.5 values at the row labels collected in wrong_PM.
print(df.loc[wrong_PM, 'PM2.5'])

37      16
38      23
159     22
178     35
323     27
324     28
420     41
491     20
492     27
580     58
581     58
582     60
606     84
734     43
735     32
950     28
1091    33
1092    37
1262    22
1263    22
1264    24
1335    33
1359    25
1360    25
1361    25
1362    21
Name: PM2.5, dtype: object


In [23]:
# Cast the four measurement columns to numeric dtypes.
df = df.astype({
  'NOx': float,
  'Wind Direction': int,
  'Wind Speed': float,
  'PM2.5': int,
})

In [24]:
# Show column dtypes and non-null counts again.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416 entries, 0 to 1415
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            1416 non-null   datetime64[ns]
 1   Temperature     1416 non-null   float64       
 2   NOx             1416 non-null   float64       
 3   Wind Direction  1416 non-null   int64         
 4   Wind Speed      1416 non-null   float64       
 5   PM2.5           1416 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 66.5 KB


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,Temperature,NOx,Wind Direction,Wind Speed,PM2.5
count,1416.0,1416.0,1416.0,1416.0,1416.0
mean,20.950282,19.421398,116.644774,2.450777,33.048729
std,3.438216,8.622356,141.731594,0.801641,17.385011
min,12.0,5.7,0.0,0.0,1.0
25%,18.7,13.6,13.0,1.9,21.0
50%,20.5,17.3,24.0,2.5,30.0
75%,23.4,23.0,301.0,3.0,42.0
max,30.5,73.1,360.0,4.8,108.0


# Preparing data for training