https://aqs.epa.gov/aqsweb/airdata/download_files.html#Raw
➡ CO (42101)

The link above points to the original dataset (hourly CO data, parameter code 42101) that the model needs to be trained on.

**Note:** The data file used in this notebook is a reduced version of the original, so that it runs faster on Colab.

**Size:** original: 1,000,000+ lines; this version: 200 lines.

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
import math

In [None]:
#Read the reduced hourly CO (parameter 42101) CSV into a DataFrame
#(the path is relative to the notebook's working directory)

df = pd.read_csv("hourly_42101_2021_new.csv")

In [None]:
#check df right after loading (shown because it is the last expression of the cell)

df

In [None]:
#Discarding every column that is not used later on; what remains is the
#location, the local date/time and the measured value

df = df.drop(
    columns=[
        "State Code", "County Code", "Parameter Code", "POC", "Datum",
        "Site Num", "MDL", "Uncertainty", "Qualifier", "Method Type",
        "Method Code", "Method Name", "Date of Last Change",
        "Parameter Name", "Units of Measure", "State Name", "County Name",
        "Date GMT", "Time GMT",
    ]
)

In [None]:
#checking the result of the drop (only the kept columns should remain)

df

Unnamed: 0,Latitude,Longitude,Date Local,Time Local,Sample Measurement
0,33.553056,-86.815,1/1/2021,0:00,0.2
1,33.553056,-86.815,1/1/2021,1:00,0.1
2,33.553056,-86.815,1/1/2021,2:00,0.1
3,33.553056,-86.815,1/1/2021,3:00,0.1
4,33.553056,-86.815,1/1/2021,4:00,0.1
...,...,...,...,...,...
194,33.553056,-86.815,1/9/2021,11:00,0.2
195,33.553056,-86.815,1/9/2021,12:00,0.2
196,33.553056,-86.815,1/9/2021,13:00,0.2
197,33.553056,-86.815,1/9/2021,14:00,0.2


In [None]:
#First function converts date into day of year, and then normalizes it to scale of 0 to 1
#Second function converts time of day, into hour of day, and then normalizes it to scale of 0 to 1

def adjust_date(arr):
  """Convert "M/D/YYYY" date strings into a fraction of the year.

  Args:
    arr: iterable of date strings written as month/day/year (e.g. "1/9/2021").

  Returns:
    list of floats: the day of the year divided by the number of days in that
    year (365, or 366 for leap years), so every value lies in (0, 1].

  Raises:
    ValueError: if a string does not split into exactly three "/"-separated
      parts, or the parts do not form a valid date.
  """
  adjusted_date_local = []

  for date in arr:
    month, day, year = date.split("/")
    period = pd.Period(f"{year}-{month}-{day}", freq="D")
    #A fixed divisor of 365 would give 366/365 > 1 on Dec 31 of a leap year
    days_in_year = 366 if period.is_leap_year else 365
    adjusted_date_local.append(int(period.day_of_year) / days_in_year)

  return adjusted_date_local

def adjust_time(arr):
  """Convert "H:MM" time strings into a fraction of the day.

  Args:
    arr: iterable of time strings such as "0:00" or "13:30". A string without
      a ":" is treated as a whole (or decimal) number of hours.

  Returns:
    list of floats: (hours + minutes / 60) / 24 for each input string.

  Raises:
    ValueError: if the hour or minute part is not numeric.
  """
  adjusted_time_local = []

  for time in arr:
    hours, _, minutes = time.partition(":")
    #Minutes are a base-60 fraction of the hour: "1:30" is 1.5 hours, not 1.30
    hour_of_day = float(hours) + (float(minutes) / 60 if minutes else 0.0)
    adjusted_time_local.append(hour_of_day / 24)

  return adjusted_time_local

In [None]:
#Deriving the adjusted date/time columns from the raw string columns and,
#in the same step, discarding the raw columns they replace

df = (
    df.assign(**{
        "Date Local (adjusted)": adjust_date(df["Date Local"]),
        "Time Local (adjusted)": adjust_time(df["Time Local"]),
    })
    .drop(columns=["Date Local", "Time Local"])
)

In [None]:
#function that scales a longitude or latitude column by 1/180
#(for valid coordinates longitude lands in [-1, 1] and latitude in [-0.5, 0.5])

def adjust_long_lat(arr):
  """Return a list with each value of ``arr`` converted to float and divided by 180."""
  return [float(coordinate) / 180 for coordinate in arr]

In [None]:
#Same pattern for the coordinates: add the scaled columns and discard the raw ones

df = (
    df.assign(**{
        "Latitude (adjusted)": adjust_long_lat(df["Latitude"]),
        "Longitude (adjusted)": adjust_long_lat(df["Longitude"]),
    })
    .drop(columns=["Latitude", "Longitude"])
)

In [None]:
def new_sin(arr):
  """Return a list holding math.sin(value) for every value in ``arr`` (radians)."""
  return [math.sin(value) for value in arr]

def new_cos(arr):
  """Return a list holding math.cos(value) for every value in ``arr`` (radians)."""
  return [math.cos(value) for value in arr]

In [None]:
#Cyclical encoding of the date/time fractions.
#The adjusted columns are fractions of a period (day of year / year length,
#hour / 24), not angles, so they are scaled by 2*pi before sin/cos is taken.
#Without the scaling the inputs stay below 1 radian, sin(x) is almost equal to x
#and the end of a period does not map back next to its start.

df["sin(day of year)"] = new_sin(2 * math.pi * df["Date Local (adjusted)"])
df["cos(day of year)"] = new_cos(2 * math.pi * df["Date Local (adjusted)"])
df["sin(time of day)"] = new_sin(2 * math.pi * df["Time Local (adjusted)"])
df["cos(time of day)"] = new_cos(2 * math.pi * df["Time Local (adjusted)"])

In [None]:
#checking new edits to df (adjusted date/time/location columns plus the sin/cos columns)

df

Unnamed: 0,Sample Measurement,Date Local (adjusted),Time Local (adjusted),Latitude (adjusted),Longitude (adjusted),sin(day of year),cos(day of year),sin(time of day),cos(time of day)
0,0.2,0.002740,0.000000,0.186406,-0.482306,0.002740,0.999996,0.000000,1.000000
1,0.1,0.002740,0.041667,0.186406,-0.482306,0.002740,0.999996,0.041655,0.999132
2,0.1,0.002740,0.083333,0.186406,-0.482306,0.002740,0.999996,0.083237,0.996530
3,0.1,0.002740,0.125000,0.186406,-0.482306,0.002740,0.999996,0.124675,0.992198
4,0.1,0.002740,0.166667,0.186406,-0.482306,0.002740,0.999996,0.165896,0.986143
...,...,...,...,...,...,...,...,...,...
194,0.2,0.024658,0.458333,0.186406,-0.482306,0.024655,0.999696,0.442454,0.896791
195,0.2,0.024658,0.500000,0.186406,-0.482306,0.024655,0.999696,0.479426,0.877583
196,0.2,0.024658,0.541667,0.186406,-0.482306,0.024655,0.999696,0.515565,0.856851
197,0.2,0.024658,0.583333,0.186406,-0.482306,0.024655,0.999696,0.550809,0.834631


In [None]:
#Turning "Sample Measurement" column into target (output) array

target = df["Sample Measurement"]
#NOTE(review): the drop below is commented out, so "Sample Measurement" stays in df
#and therefore in the x_* feature arrays sliced from it (their printed shapes show 9 columns).
#Confirm this is intentional (e.g. past measurements as input for sequence windows);
#with single-timestep inputs it leaks the target into the features.
#df = df.drop("Sample Measurement", axis = 'columns')

In [None]:
#Splitting dataset in row order into 60/20/20 train/test/validation parts
#(first 60% of rows -> train, next 20% -> test, last 20% -> validation) and checking shape

split_point1 = int(0.6 * len(df))
split_point2 = int(0.8 * len(df))

x_train, y_train = df.iloc[:split_point1], target.iloc[:split_point1]
x_test, y_test = df.iloc[split_point1:split_point2], target.iloc[split_point1:split_point2]
x_val, y_val = df.iloc[split_point2:], target.iloc[split_point2:]

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape)

(119, 9) (119,) (40, 9) (40,) (40, 9) (40,)


In [None]:
#converting the DataFrame/Series splits to Numpy, so reshape function can be applied
#np.asarray is used instead of .to_numpy() so that this cell can be re-run:
#the names are rebound to ndarrays, and an ndarray has no .to_numpy() method

x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
x_val = np.asarray(x_val)

y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
y_val = np.asarray(y_val)

In [None]:
#Inserting a length-1 timestep axis so the data fits the 3D input array for LSTM
#(num_of_samples, num_timesteps, num_features)
#(rows of the split, 1 timestep, every column of the split -- 9 according to the
# shapes printed by the split cell, which includes "Sample Measurement")

x_train_reshaped = x_train[:, np.newaxis, :]
x_test_reshaped = x_test[:, np.newaxis, :]
x_val_reshaped = x_val[:, np.newaxis, :]

In [None]:
#Sanity check: the reshaped features should be 3-D with as many samples as y_train
print(x_train_reshaped.shape, y_train.shape)

(119, 1, 9) (119,)


-----------------------------------------------------------------

In [None]:
#Inspect the second training sample after reshaping (one timestep holding all features)
x_train_reshaped[1]

array([[ 0.1       ,  0.00273973,  0.04166667,  0.18640587, -0.48230556,
         0.00273972,  0.99999625,  0.04165461,  0.99913207]])

In [None]:
#The same sample before reshaping, for comparison
x_train[1]

array([ 0.1       ,  0.00273973,  0.04166667,  0.18640587, -0.48230556,
        0.00273972,  0.99999625,  0.04165461,  0.99913207])

In [None]:
#Shapes of the 2-D training feature array and the 1-D training target array
x_train.shape, y_train.shape

((119, 9), (119,))

In [None]:
def hourly_sequencer(inputArray, sequence_length=23):
  """Build flattened sliding windows over the rows of ``inputArray``.

  A window of ``sequence_length`` consecutive rows is taken at every possible
  start position (stride 1) and flattened into a 1-D array.

  Args:
    inputArray: NumPy array whose first axis indexes the rows (time steps).
    sequence_length: number of consecutive rows per window. Defaults to 23,
      the value that used to be hard-coded.

  Returns:
    list of 1-D NumPy arrays, one per window, ordered by start position.
    The list is empty when ``inputArray`` has fewer than ``sequence_length`` rows.

  Raises:
    ValueError: if ``sequence_length`` is smaller than 1.
  """
  if sequence_length < 1:
    raise ValueError("sequence_length must be at least 1")

  num_windows = inputArray.shape[0] - sequence_length + 1

  #Each window is sliced in one step; the previous version rebuilt an array
  #from a growing Python list on every inner-loop iteration
  return [
      np.asarray(inputArray[start:start + sequence_length]).flatten()
      for start in range(num_windows)
  ]