In [49]:
import tensorflow as tf
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [38]:
# Locate the pollution CSV relative to the current working directory and load it.
csv_location = ("Dataset", "MultiVariate Dataset", "pollution_multivariate.csv")
PATH = os.path.join(os.curdir, *csv_location)
dataset = pd.read_csv(PATH)

In [39]:
dataset.head()

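`pd.to_datetime` can assemble a datetime from multiple columns of a DataFrame. The column keys can be common abbreviations such as ['year', 'month', 'day', 'hour', 'minute', 'second', 'ms', 'us', 'ns'] or their plurals. Here we instead build a timestamp string per row and parse it; an example of a parseable value is: 2015-09-03 10:53:00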

In [40]:
# Build a "year-month-day hour:0:0" string per row from the separate date-part columns.
date_text = dataset["year"].astype("str") + "-" + dataset["month"].astype("str") + "-" + dataset["day"].astype("str")
clock_text = dataset["hour"].astype("str") + ':0:0'
dataset["time"] = date_text + " " + clock_text

In [41]:
# Convert the assembled time strings into real datetime values.
dataset["time"] = pd.to_datetime(dataset["time"])
dataset.head()

In [42]:
# Use the parsed timestamp as the index (set_index removes the "time" column)
# and discard the "No" column.
dataset = dataset.set_index("time").drop(columns=["No"])
dataset.head()

In [43]:
dataset.info()

In [44]:
dataset.describe()

In [52]:
# Missing-value summary: count of nulls per column and the fraction of rows they represent.

null = dataset.isnull().sum().to_frame(name="Total")
null['percentage'] = null['Total'] / len(dataset)  # fraction in [0, 1], not scaled by 100
null.sort_values(by='percentage', ascending=False).head()

In [50]:
# Kernel density estimate of the pm2.5 column.
pm25 = dataset['pm2.5']
plt.figure(figsize=(10, 10))
sns.kdeplot(pm25, shade=True)
# Observation from the plot: the distribution is strongly right-skewed.

In [51]:
# Forward-fill missing values: each gap takes the most recent earlier observation.
# DataFrame.ffill() is the direct equivalent of the deprecated fillna(method='ffill').
dataset = dataset.ffill()
dataset.head()

In [54]:
# Any pm2.5 values that are still missing after the forward fill are replaced
# with the mean of the pm2.5 column.
pm25_column = dataset['pm2.5']
meanvalue = pm25_column.mean()
dataset['pm2.5'] = pm25_column.fillna(meanvalue)

In [55]:
dataset.head()

In [56]:
from sklearn.preprocessing import LabelEncoder

# Replace the cbwd column with integer codes produced by a LabelEncoder.
cbwd_encoder = LabelEncoder()
dataset["cbwd"] = cbwd_encoder.fit_transform(dataset["cbwd"])

In [58]:
# Put the frame on an hourly grid, averaging every column within each hour bucket.
hourly_groups = dataset.resample('H')
dataset = hourly_groups.mean()
dataset.head()

In [64]:
# One stacked panel per column, each showing that column over the time index.
panel_count = len(dataset.columns)
plt.figure(figsize=(20, 20))
for position, col in enumerate(dataset.columns, start=1):
    plt.subplot(panel_count, 1, position)
    plt.plot(dataset[col], label=col)
    plt.legend()
plt.subplots_adjust(hspace=0.3, wspace=0.5)

In [65]:
# Chronological split by row position: first 70% train, next 20% validation, last 10% test.
row_count = len(dataset)
train_end = int(row_count * 0.7)
val_end = int(row_count * 0.9)

train = dataset.iloc[:train_end]
val = dataset.iloc[train_end:val_end]
test = dataset.iloc[val_end:]

In [67]:
train.shape

In [68]:
# Separate the regression target (TEMP) from the remaining feature columns
# for each split. Every split follows the same pattern: *_label is the TEMP
# Series, *_data is the frame without TEMP.
train_label = train.TEMP
train_data = train.drop(['TEMP'], axis=1)

val_label = val.TEMP
val_data = val.drop(['TEMP'], axis=1)

# The original had these two assignments swapped, so `test_data` held the target
# and the later feature standardisation was applied to the wrong object.
test_label = test.TEMP
test_data = test.drop(['TEMP'], axis=1)


In [70]:
# Standardise every split with the per-column statistics of the (unscaled) training features.
std = train_data.std()
mean = train_data.mean()

train_data, val_data, test_data = (
    (frame - mean) / std for frame in (train_data, val_data, test_data)
)

In [71]:
train_data.head()

In [72]:
def create_non_sequential_loader(series, labels, batchsize=32, buffersize=100):
    """Wrap aligned features and labels in a tf.data pipeline.

    Args:
        series: Feature values; sliced along the first axis, one slice per sample.
        labels: Target values, sliced along the first axis in step with `series`.
        batchsize: Number of samples per emitted batch.
        buffersize: Size of the shuffle buffer.

    Returns:
        A tf.data.Dataset of (features, label) batches that is cached,
        shuffled, batched and prefetched, in that order.
    """
    pairs = tf.data.Dataset.from_tensor_slices((series, labels))
    pairs = pairs.cache()
    pairs = pairs.shuffle(buffersize)
    pairs = pairs.batch(batchsize)
    return pairs.prefetch(tf.data.experimental.AUTOTUNE)

In [73]:
# Row-wise (non-windowed) loaders for the training and validation splits.
train_sequential = create_non_sequential_loader(
    train_data.to_numpy(),
    train_label.to_numpy(),
)
val_sequential = create_non_sequential_loader(
    val_data.to_numpy(),
    val_label.to_numpy(),
)

In [74]:
# Sanity check: print the feature and label shapes of a single batch.
for X, Y in train_sequential.take(1):
    print(X.shape, Y.shape, sep="\n")

In [98]:
# Feed-forward regressor on the 11 feature columns: two ReLU hidden layers,
# dropout, and a single linear output unit.
dense_layers = [
    tf.keras.layers.Flatten(input_shape=[11]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1),
]
model = tf.keras.models.Sequential(dense_layers)

In [99]:
# Train with the Adam optimizer against mean squared error.
model.compile(optimizer='adam', loss='mse')

In [100]:
# Stop after 10 epochs without improvement; keep only the best model under 'best_temp/'.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='best_temp/', save_best_only=True)
earlystop = tf.keras.callbacks.EarlyStopping(patience=10)

In [101]:
# Fit for at most 50 epochs, validating each epoch and applying both callbacks.
history = model.fit(
    train_sequential,
    validation_data=val_sequential,
    epochs=50,
    callbacks=[earlystop, checkpoint],
)

In [102]:
# Windowed ("sequential") loader: each sample is a block of consecutive rows.

def create_sequential_loader(series, window_size=24, batchsize=32, buffersize=100):
    """Build a tf.data pipeline of sliding windows over `series`.

    Args:
        series: 2-D values (rows, columns). The indexing below treats the last
            column as the target and every other column as a feature.
        window_size: Number of consecutive rows per window.
        batchsize: Number of windows per emitted batch.
        buffersize: Size of the shuffle buffer.

    Returns:
        A tf.data.Dataset of (features, label) batches, where `features` is
        every row of a window without its last column and `label` is the last
        column of the window's final row. Windows advance one row at a time;
        incomplete trailing windows are dropped. The pipeline is cached,
        shuffled, batched and prefetched, in that order.
    """
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    windows = tf.data.Dataset.from_tensor_slices(series)
    windows = windows.window(window_size, shift=1, drop_remainder=True)
    # window() yields nested datasets; batching each one turns it into a single tensor.
    windows = windows.flat_map(lambda window: window.batch(window_size))
    windows = windows.map(lambda window: (window[:, :-1], window[-1, -1]), num_parallel_calls=AUTOTUNE)
    windows = windows.cache().shuffle(buffersize).batch(batchsize)
    return windows.prefetch(AUTOTUNE)


# NOTE: this cell originally defined the windowed loader under the name
# `create_non_sequential_loader`, which replaces the earlier row-wise loader of
# the same name. The binding is kept so existing calls keep working; prefer
# `create_sequential_loader` in new code.
create_non_sequential_loader = create_sequential_loader

In [106]:
# Append the label as the last column so one array carries features and target together.
train_target_column = train_label.to_numpy()[:, np.newaxis]
val_target_column = val_label.to_numpy()[:, np.newaxis]

train = tf.concat([train_data.to_numpy(), train_target_column], axis=1)
val = tf.concat([val_data.to_numpy(), val_target_column], axis=1)

In [107]:
# Turn the combined arrays into windowed loaders (default window, batch and buffer sizes).
train = create_non_sequential_loader(series=train)
val = create_non_sequential_loader(series=val)

In [108]:
# Sanity check: print the feature and label shapes of a single windowed batch.
for X, Y in train.take(1):
    print(X.shape, Y.shape, sep="\n")

In [109]:
# Sequence-to-one regressor: a window of 11-feature steps in, one value out.
model = tf.keras.models.Sequential([
    # Causal convolution over the time axis; it can learn short-term patterns
    # that are useful to the recurrent layers that follow.
    tf.keras.layers.Conv1D(filters=32, kernel_size=5, strides=1, padding='causal', activation='relu',
                          input_shape=[None, 11]),
    tf.keras.layers.LSTM(32, return_sequences=True),
    # The last recurrent layer returns only its final step, so the model emits
    # one prediction per window, matching the single label per window produced
    # by the windowed loader. With return_sequences=True here the output would
    # be (batch, steps, 1) and the MSE loss would broadcast it against the
    # (batch,) labels.
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1),
])

In [110]:
# Train with the Adam optimizer against mean squared error.
model.compile(optimizer='adam', loss='mse')

In [111]:
# Stop after 10 epochs without improvement; keep only the best model under 'best_temp/'.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='best_temp/', save_best_only=True)
earlystop = tf.keras.callbacks.EarlyStopping(patience=10)

In [None]:
# Fit on the windowed loaders for at most 50 epochs, applying both callbacks.
history = model.fit(
    train,
    validation_data=val,
    epochs=50,
    callbacks=[earlystop, checkpoint],
)