In [0]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
#import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
df = pd.read_csv("/content/drive/My Drive/masterTesis/code/data/BeijingPM20100101_20151231.csv")

In [0]:
data = df.iloc[:,[1,2,3,4,9,11,13,15]].values

In [6]:
print(data)

[[ 2.010e+03  1.000e+00  1.000e+00 ...  4.300e+01 -1.100e+01  1.790e+00]
 [ 2.010e+03  1.000e+00  1.000e+00 ...  4.700e+01 -1.200e+01  4.920e+00]
 [ 2.010e+03  1.000e+00  1.000e+00 ...  4.300e+01 -1.100e+01  6.710e+00]
 ...
 [ 2.015e+03  1.200e+01  3.100e+01 ...  7.300e+01 -6.000e+00  8.900e-01]
 [ 2.015e+03  1.200e+01  3.100e+01 ...  7.300e+01 -6.000e+00  1.780e+00]
 [ 2.015e+03  1.200e+01  3.100e+01 ...  7.900e+01 -6.000e+00  2.670e+00]]


In [0]:
#GET METEOROLOGICAL DATA 
meteorological = data[:,[6,7,5]]

In [8]:
print(meteorological)

[[-11.     1.79  43.  ]
 [-12.     4.92  47.  ]
 [-11.     6.71  43.  ]
 ...
 [ -6.     0.89  73.  ]
 [ -6.     1.78  73.  ]
 [ -6.     2.67  79.  ]]


In [0]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaledMeteorological = scaler.fit_transform(meteorological)

In [0]:
#GET CATEGORICAL DATA
categorical_month = data[:,1].astype(int)
categorical_hour = data[:,3].astype(int)

In [0]:
#ENCODE CATEGORICAL DATA TO ONE-HOT
months = 12
hours = 24
onehot_month = np.zeros((len(data), months))
onehot_hours = np.zeros((len(data),hours))
onehot_month[np.arange(len(data)),categorical_month-1] = 1.0
onehot_hours[np.arange(len(data)),categorical_hour] = 1.0

In [0]:
auxData = np.concatenate((scaledMeteorological,onehot_month,onehot_hours),axis=1) 

In [13]:
print(auxData)

[[1.31147541e-01 2.29001111e-03 4.18367347e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.14754098e-01 7.63906691e-03 4.59183673e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.31147541e-01 1.06981116e-02 4.18367347e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.13114754e-01 7.51943946e-04 7.24489796e-01 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.13114754e-01 2.27292147e-03 7.24489796e-01 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [2.13114754e-01 3.79389900e-03 7.85714286e-01 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]]


In [0]:
#PM2.5
pm25 = data[:,4]
pm25 = np.nan_to_num(pm25)
pm25 = np.reshape(pm25,(len(data),1))

In [15]:
print(pm25)

[[  0.]
 [  0.]
 [  0.]
 ...
 [203.]
 [212.]
 [235.]]


In [0]:
scaledPm25 = scaler.fit_transform(pm25)

In [17]:
print(scaledPm25)

[[0.        ]
 [0.        ]
 [0.        ]
 ...
 [0.20422535]
 [0.21327968]
 [0.23641851]]


In [0]:
historicalData =  np.zeros((len(data),8))
historicalLabel = np.zeros((len(data),1))
row = 0
col = 0
for i in range (0,len(data)):
    j = 8
    col = 0
    while(i > 8 and j > 0):
        historicalData[row,col] = scaledPm25[i-j]
        col += 1
        j -= 1
    historicalLabel[row,0] = scaledPm25[i]
    row+= 1

In [19]:
print(historicalData)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.04325956 0.04828974 0.0583501  ... 0.11468813 0.13380282 0.17002012]
 [0.04828974 0.0583501  0.0694165  ... 0.13380282 0.17002012 0.20422535]
 [0.0583501  0.0694165  0.0915493  ... 0.17002012 0.20422535 0.21327968]]


In [0]:
fullData = np.concatenate((historicalData,auxData,historicalLabel),axis=1)

In [21]:
print(fullData)
fulldata = fullData[0:50000,:]
print(fullData)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.04325956 0.04828974 0.0583501  ... 0.         0.         0.20422535]
 [0.04828974 0.0583501  0.0694165  ... 1.         0.         0.21327968]
 [0.0583501  0.0694165  0.0915493  ... 0.         1.         0.23641851]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.04325956 0.04828974 0.0583501  ... 0.         0.         0.20422535]
 [0.04828974 0.0583501  0.0694165  ... 1.         0.         0.21327968]
 [0.0583501  0.0694165  0.0915493  ... 0.         1.         0.23641851]]


In [0]:
np.savetxt("/content/drive/My Drive/masterTesis/code/data/data.csv", fullData, delimiter=",")

In [23]:
print(fulldata.shape)

(50000, 48)


In [24]:
print(fulldata[50,:])

[0.1498994  0.15492958 0.16498994 0.15694165 0.12676056 0.09054326
 0.06338028 0.06539235 0.19672131 0.11152696 0.91836735 1.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.05533199]


In [0]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold

In [26]:
from keras.models import Model, Sequential
from keras.layers import Dense, Input, concatenate
from keras.layers import LSTM
from keras.layers.core import Reshape

Using TensorFlow backend.


In [0]:
def lstme(xtrain,ytrain,auxdata):
  inputs = Input((1,xtrain.shape[1]))

  
  #~inputs = Embedding(output_dim=xtrain.shape[0], input_dim=10000, input_length=100)(main_input)
  
  lstm1 = LSTM(50,input_shape=(1,xtrain.shape[1]),return_sequences=True)
  lstmPm25 = lstm1(inputs)
  lstm2 = LSTM(50,return_sequences=True)(lstmPm25)
  lstmPm25 = Dense(1,activation='sigmoid')(lstm2)
  lstmModel1 = Model(inputs, lstmPm25)
  
  aux = Input((1,auxdata.shape[1]))
  #lstm2 = LSTM(50,input_shape=(1,auxdata.shape[1]), return_sequences=True)
  #lstmMeteo = lstm2(aux)
  lstmMeteo = Dense(39,activation='sigmoid')(aux)
  lstmModel2 = Model(aux, lstmMeteo)
  
  combined = concatenate([lstmModel1.output, lstmModel2.output],axis=2)
  fc1 = Dense(50)(combined)
  fc2 = Dense(1, activation="sigmoid")(fc1)
  fcModel = Model(inputs=[lstmModel1.input,lstmModel2.input],outputs=fc2)
  return fcModel

def lstm(xtrain,ytrain,auxdata):
  inputs = Input((1,xtrain.shape[1]))
  
  #~inputs = Embedding(output_dim=xtrain.shape[0], input_dim=10000, input_length=100)(main_input)
  
  lstm1 = LSTM(50,input_shape=(1,xtrain.shape[1]),return_sequences=True)
  lstmPm25 = lstm1(inputs)
  lstm2 = LSTM(50,return_sequences=True)(lstmPm25)
  lstmPm25 = Dense(1,activation='sigmoid')(lstm2)
  lstmModel1 = Model(inputs, lstmPm25)

  return lstmModel1
  

In [30]:
kFolds = KFold(n_splits=5)
countCross = 1 
for train, test in kFolds.split(fulldata):
    xTrain = fulldata[train,0:8]
    xTest= fulldata[test,0:8]
    auxTrain = fulldata[train,8:47]
    auxTest = fulldata[test,8:47] 
    yTrain = fulldata[train,47:48]
    yTest = fulldata[test,47:48]
    print(auxTrain.shape)
    print("------------- CROSS VALIDATION ",countCross,"---------")
    print("**********LTSME***********")
    model = lstme(xTrain,yTrain,auxTrain)
    print(model.summary())
    model.compile(loss='mae', optimizer='adam') 
    xTrain = np.reshape(xTrain,(xTrain.shape[0],1,xTrain.shape[1]))
    auxTrain = np.reshape(auxTrain,(auxTrain.shape[0],1,auxTrain.shape[1]))
    yTrain = np.reshape(yTrain,(yTrain.shape[0],1,yTrain.shape[1]))
    
    model.fit([xTrain,auxTrain], yTrain, epochs=30, batch_size=1000)
    
    xTest = np.reshape(xTest,(xTest.shape[0],1,xTest.shape[1]))
    auxTest = np.reshape(auxTest,(auxTest.shape[0],1,auxTest.shape[1]))
    
    yhat = model.predict([xTest,auxTest])
    
    yhat = np.reshape(yhat,(xTest.shape[0],xTest.shape[1]))
    predictTest = scaler.inverse_transform(yhat)
    yTestI = scaler.inverse_transform(yTest)
    rmseTest = sqrt(mean_squared_error(yTestI, predictTest))
    print("RMSE ", rmseTest)
    countCross+=1

(40000, 39)
------------- CROSS VALIDATION  1 ---------
**********LTSME***********
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1, 8)         0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, 1, 50)        11800       input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 1, 50)        20200       lstm_5[0][0]                     
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1, 39)        0                                            
__________________________

In [31]:
kFolds = KFold(n_splits=5)
countCross = 1 
for train, test in kFolds.split(fulldata):
    xTrain = fulldata[train,0:8]
    xTest= fulldata[test,0:8]
    auxTrain = fulldata[train,8:47]
    auxTest = fulldata[test,8:47] 
    yTrain = fulldata[train,47:48]
    yTest = fulldata[test,47:48]
    print(auxTrain.shape)
    print("------------- CROSS VALIDATION ",countCross,"---------")
    print("**********LTSMNN***********")
    model = lstm(xTrain,yTrain,auxTrain)
    print(model.summary())
    model.compile(loss='mae', optimizer='adam') 
    xTrain = np.reshape(xTrain,(xTrain.shape[0],1,xTrain.shape[1]))
    auxTrain = np.reshape(auxTrain,(auxTrain.shape[0],1,auxTrain.shape[1]))
    yTrain = np.reshape(yTrain,(yTrain.shape[0],1,yTrain.shape[1]))
    
    model.fit(xTrain, yTrain, epochs=30, batch_size=1000)
    
    xTest = np.reshape(xTest,(xTest.shape[0],1,xTest.shape[1]))
    auxTest = np.reshape(auxTest,(auxTest.shape[0],1,auxTest.shape[1]))

    yhat = model.predict(xTest)
    
    yhat = np.reshape(yhat,(xTest.shape[0],xTest.shape[1]))
    predictTest = scaler.inverse_transform(yhat)
    yTestI = scaler.inverse_transform(yTest)
    rmseTest = sqrt(mean_squared_error(yTestI, predictTest))
    print("RMSE ", rmseTest)
    countCross+=1

(40000, 39)
------------- CROSS VALIDATION  1 ---------
**********LTSMNN***********
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 1, 8)              0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 1, 50)             11800     
_________________________________________________________________
lstm_16 (LSTM)               (None, 1, 50)             20200     
_________________________________________________________________
dense_29 (Dense)             (None, 1, 1)              51        
Total params: 32,051
Trainable params: 32,051
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/3