In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import glob
import math
import matplotlib.pyplot as plt
import time
all_files = glob.glob('*.csv')

temp = []
for filename in all_files:
    df = pd.concat((pd.read_csv(f) for f in all_files))
    temp.append(df)
frame = pd.concat(temp, axis=0, ignore_index=True)

In [2]:
frame = frame.drop('No', 1)
#Drops all rows with a NaN value
frame = frame.dropna()

In [3]:
#Normalize data

#These are all the columns that are numeric
numeric_col = ['SO2','NO2','CO','O3','TEMP','PRES','DEWP','RAIN','WSPM']

for col in numeric_col:
    frame[col] = (frame[col] - frame[col].min())/(frame[col].max() - frame[col].min())

In [4]:
#fetch train-data and test_data
train_data=frame.loc[frame['year']!=2017 ]
test_data=frame.loc[frame['year']==2017]

In [5]:
frame

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,2013,3,1,0,9.0,9.0,0.011435,0.052083,0.010101,0.057701,0.328455,0.653974,0.261137,0.0,WNW,0.151515,Wanshouxigong
1,2013,3,1,1,11.0,11.0,0.013436,0.041667,0.010101,0.061437,0.321951,0.662252,0.256528,0.0,WNW,0.333333,Wanshouxigong
5,2013,3,1,5,10.0,10.0,0.007433,0.020833,0.010101,0.059569,0.297561,0.700331,0.228879,0.0,NE,0.151515,Wanshouxigong
6,2013,3,1,6,8.0,8.0,0.011435,0.038194,0.020202,0.056767,0.284553,0.711921,0.241167,0.0,NE,0.174242,Wanshouxigong
7,2013,3,1,7,8.0,8.0,0.015438,0.062500,0.020202,0.050230,0.310569,0.733444,0.247312,0.0,NNE,0.151515,Wanshouxigong
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5049211,2017,2,28,19,13.0,37.0,0.005432,0.118056,0.030303,0.055834,0.526829,0.514901,0.304147,0.0,NW,0.181818,Guanyuan
5049212,2017,2,28,20,20.0,43.0,0.007433,0.159722,0.040404,0.039957,0.512195,0.516556,0.321045,0.0,WNW,0.068182,Guanyuan
5049213,2017,2,28,21,16.0,33.0,0.009434,0.128472,0.040404,0.046495,0.499187,0.526490,0.348694,0.0,NW,0.083333,Guanyuan
5049214,2017,2,28,22,11.0,24.0,0.009434,0.156250,0.040404,0.038090,0.494309,0.529801,0.354839,0.0,NNW,0.090909,Guanyuan


y_train=np.array(train_data['PM2.5'].values)
y_test=np.array(test_data['PM2.5'].values)

#multidimentional data
x_train=np.array(train_data[['TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM']].values)
x_test=np.array(test_data[['TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM']].values)


In [7]:
class Linear(tf.keras.Model):

    def __init__(self, dim=1):
        super(Linear, self).__init__()
        self.w = tf.Variable(initial_value=tf.zeros((dim)),
                             trainable=True)

        self.b = tf.Variable(initial_value=tf.zeros((1)),
                             trainable=True)
        self.dim = dim


    def call(self, inputs):
        x = inputs
        
        #x must be the right size
        assert(x.shape[1] == self.dim)
        
        out = (self.w*x) + self.b
        return out


In [None]:
train_col = ['TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']

train_data[train_col].to_numpy()

In [None]:
train_col = ['TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM']

x_train = train_data[train_col].to_numpy()
y_train = train_data['PM2.5'].to_numpy()
x_test = test_data[train_col].to_numpy()
y_test = test_data['PM2.5'].to_numpy()

model = Linear(dim = len(train_col))
model.compile('adam','mse')
model.fit(x = x_train ,y = y_train,epochs = 200
          ,batch_size = 1024,validation_data = (x_test,y_test))

Train on 4394460 samples, validate on 191556 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
 502784/4394460 [==>...........................] - ETA: 8s - loss: 6155.3929 - ETA: 11s - loss: 

In [None]:
# represent the wind's direction as a number (radius)
#this method receives a set and change everything to a number
def getWindAngle(Z):
    Y=[]
    for direct in Z:
        if pd.isnull(direct):
            Y.append(math.nan)
        elif direct =='SW':
            Y.append(np.radians(225))
        elif direct =='N':
            Y.append(np.radians(0))
        elif direct =='NNW':
            Y.append(np.radians(337.5))
        elif direct =='WNW':
            Y.append(np.radians(292.5))
        elif direct =='WSW':
            Y.append(np.radians(247.5))
        elif direct =='E':
            Y.append(np.radians(90))
        elif direct =='NE':
            Y.append(np.radians(45))
        elif direct =='ENE':
            Y.append(np.radians(67.5))
        elif direct =='ESE':
            Y.append(np.radians(112.5))
        elif direct =='S':
            Y.append(np.radians(180))
        elif direct =='SE':
            Y.append(np.radians(135))
        elif direct =='SSE':
            Y.append(np.radians(157.5))
        elif direct =='SSW':
            Y.append(np.radians(202.5))
        elif direct =='NNE':
            Y.append(np.radians(22.5))
        elif direct =='NW':
            Y.append(np.radians(315.5))
        elif direct =='W':
            Y.append(np.radians(270))
        
    return np.array(Y)
        
            
            
    

In [None]:
def plot_simple_wind(eta,epochs):
    # this step removes all nans in wind and y then change the string in wd to numbers
    global x_train_wd,y_train,x_test_wd,y_test
    wd_new,y_new=removeNan_str(x_train_wd,y_train)
   
    wd_test,y_test=removeNan_str(x_test_wd,y_test)
    wd_test,y_test=wd_test[:min(wd_test.shape[0],y_test.shape[0])],y_test[:min(wd_test.shape[0],y_test.shape[0])]
    wd_new=getWindAngle(wd_new)
    wd_test=getWindAngle(wd_test)
    x,y,pred=train_simple(wd_new,y_new,wd_test,y_test,eta,epochs)
    plt.scatter(wd_test,y_test,c='r',s=0.01)
    plt.plot(wd_test,pred,c='g')
    plt.yscale('linear')
    plt.show()
    
    

In [None]:
#this part is for part one except wind
plot_simple(x_train_TEMP,y_train,x_test_TEMP,y_test,0.0028,1550)

In [None]:
# this part is for part 1 wind
plot_simple_wind(0.005,500)

In [None]:
 print(x_test_wd.shape, y_test.shape)

In [None]:
#preprocess the training matrix and its label so that it contains no nan's 
#then replace the str angle with numerical one
a=set(np.argwhere(pd.isnull(x_train))[:,0])
b=set(np.argwhere(pd.isnull(y_train))[:,0])
indices=list(a.union(b))
x_newTrain=np.delete(x_train,indices,axis=0)
y_newTrain=np.delete(y_train,indices,axis=0)
angles=getWindAngle(x_newTrain[:,4])
x_newTrain[:,4]=angles
print(x_newTrain)
print(pd.isnull(x_newTrain))

In [None]:
#do that same thing for test data
x_test1=x_test[:min(x_test.shape[0],y_test.shape[0]),:]
y_test1=y_test[:min(x_test.shape[0],y_test.shape[0])]
c=set(np.argwhere(pd.isnull(x_test1))[:,0])
d=set(np.argwhere(pd.isnull(y_test1))[:,0])
indices=list(c.union(d))
x_newTest=np.delete(x_test1,indices,axis=0)
y_newTest=np.delete(y_test1,indices,axis=0)
angles1=getWindAngle(x_newTest[:,4])
x_newTest[:,4]=angles1
print(x_newTest)

In [None]:

#this method is to give a model(multidimensional)
def train_multiple(X,Y,X_test,Y_test,eta,epochs):
    original=0     #the original weight^2+bias^2
    #set placeholders and variables
    x=tf.placeholder(dtype=tf.float32)
    y=tf.placeholder(dtype=tf.float32)
    weight=tf.Variable(np.random.normal(size=(6,1)))
    bias=tf.Variable(np.random.normal())
    #the prediction with the current model
    bias=tf.cast(bias,tf.float32)
    weight=tf.cast(weight,tf.float32)
    result=tf.add(tf.matmul(x,weight),bias)
    loss=tf.reduce_mean(tf.square(result-y))
    optimizer=tf.train.AdamOptimizer(learning_rate=eta).minimize(loss)  #can choose adam if you want
    feed_dict={x:X,y:Y}
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            sess.run(optimizer,feed_dict=feed_dict)
            if(epoch%20==0):
                print('cost:{}'.format(sess.run(loss,feed_dict=feed_dict)))
                print('w:{}'.format(weight.eval()))
                print('b:{}'.format(bias.eval()))
                print()
        print('the final result:')
        print(weight.eval())
        print(bias.eval())
        print('start testing:')
        prediction=sess.run(result,feed_dict={x:X_test})
        return X_test,Y_test,prediction

In [None]:
#this method is to give a model(multidimensional)
def train_multiple_SGD(X,Y,X_test,Y_test,eta,epochs,size):
    original=0     #the original weight^2+bias^2
    #set placeholders and variables
    x=tf.placeholder(dtype=tf.float32)
    y=tf.placeholder(dtype=tf.float32)
    weight=tf.Variable(np.random.normal(size=(6,1)))
    bias=tf.Variable(np.random.normal())
    #the prediction with the current model
    bias=tf.cast(bias,tf.float32)
    weight=tf.cast(weight,tf.float32)
    result=tf.add(tf.matmul(x,weight),bias)
    loss=tf.reduce_mean(tf.square(result-y))
    optimizer=tf.train.AdamOptimizer(learning_rate=eta).minimize(loss)  #can choose adam if you want
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            start=time.time()
            indices=np.random.choice(range(X.shape[0]), size, replace=False)
            feed_dict={x:X[indices],y:Y[indices]}
            sess.run(optimizer,feed_dict=feed_dict)
            print('time spent on this epoch:{}'.format(time.time()-start))
            if(epoch%20==0):
                print('cost:{}'.format(sess.run(loss,feed_dict=feed_dict)))
                print('w:{}'.format(weight.eval()))
                print('b:{}'.format(bias.eval()))
                print()
        print('the final result:')
        print(weight.eval())
        print(bias.eval())
        print('start testing:')
        prediction=sess.run(result,feed_dict={x:X_test}).T[0]
        return X_test,Y_test,prediction,weight.eval().T[0],bias.eval()

    

In [None]:
#if you set the size to be the original one, your runnin out of memory
#so this way is not efficient, we have to use the stochastic one
size=int(x_newTrain.shape[0]/500)
train_multiple(x_newTrain[:size],y_newTrain[:size],x_newTest,y_newTest,0.0039,500)


In [None]:
#test of sgd 
#we gotta choose the best size for stochastic gradient descent
size=int((x_newTrain.shape[0])/1500)
x_t,y_t,pred,w,b=train_multiple_SGD(x_newTrain,y_newTrain,x_newTest,y_newTest,0.03,100,size)


In [None]:
np.mean(abs(np.dot(np.array(x_newTrain,dtype='float32'),w)+b-y_newTrain)/np.dot(np.array(x_newTrain,dtype='float32'),w)+b)

In [None]:
x_newTest.shape