In [0]:
import numpy as np
import tensorflow as tf
import math
import pandas as pd
from gensim.models import Word2Vec

In [62]:
from google.colab import drive
drive.mount('/content/gdrive')
directory='/content/gdrive/My Drive/Masters/DeepLearning/Project/Data'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
def kernel_pca(x,nodes):
  K = np.zeros((x.shape[1], x.shape[1]))
  sigma = 0.1  # Hyperparameter of the RBF kernel
  for i in range(x.shape[1]):  # Applying the RBF kernel on the input data
    K[:, i] = np.exp(-np.sum((x[:, i].reshape((-1, 1))-x)**2, axis=0)/sigma**2)
  u, s, v = np.linalg.svd(K,full_matrices=False)
  return u[:nodes,:nodes]*s[:nodes]

In [0]:
# calculate the F1 score: 
# Parameters: y=Actual label, yhat=Predicted label
def getF1(y,yhat):
  confusion={'TN':0, 'FP':np.count_nonzero(yhat[y!=yhat]),
             'FN':0, 'TP':np.count_nonzero(y[y==yhat])+classes}
  confusion['TN']=len(y[y==yhat])-confusion['TP']
  confusion['FN']=len(y[y!=yhat])-confusion['FP']+classes
  precision=confusion['TP']/(confusion['TP']+confusion['FP'])
  recall=confusion['TP']/(confusion['TP']+confusion['FN'])
  return 2*precision*recall/(precision+recall)

In [0]:
def padding(laplacian,nodes):
  temp=np.zeros((nodes,nodes))
  temp[:laplacian.shape[0],:laplacian.shape[1]]=laplacian
  return temp

In [0]:
def word2vec(corpus_raw,window_size,embedding_dim,nodes):
  vector=np.zeros((len(corpus_raw),nodes,nodes))
  for i in range(len(corpus_raw)):
    words=set(corpus_raw[i])
    print(i,len(words))
    temp=np.zeros((len(words),embedding_dim))
    if not corpus_raw[i] or len(corpus_raw[i])==1:
      temp=np.random.rand(2,embedding_dim)
    else:
      word2vec=Word2Vec(corpus_raw[i],min_count=1,size=embedding_dim,window=window_size,workers=4)
    count=0
    for word in words:
      temp[count]=word2vec.wv[word] if word in word2vec.wv else np.random.rand(embedding_dim)
      count+=1
    w=np.dot(temp,temp.T)
#     graph=kernel_pca(temp,nodes) if len(words)!=nodes else temp
    laplacian=np.diag(np.sum(w,axis=0))-w
    vector[i]=kernel_pca(laplacian,nodes) if len(words)>nodes else padding(laplacian,nodes)
  return vector

In [0]:
classes=2
df1=pd.read_pickle(directory+'/reddit.pkl')
corpus_raw=df1['cleanNews'].values.tolist()
trainY=np.zeros((df1.shape[0],classes))
trainY[:,df1['Change'].values]=1

window_size=2
embedding_dim=64
nodes=24
# vector = word2vec(corpus_raw,window_size,embedding_dim,nodes) 

In [0]:
# np.save(directory+'/vector.npy',vector)

In [0]:
vector=np.load(directory+'/vector.npy')
tf.reset_default_graph()  # To reset all the parameters of the graph for every execution
minibatch_size=128  # Size of the minibatch

In [0]:
filter1=8  # Number of filters
kernel_size=3  # Filter size
pad='same'  # Padding type
filter2=16
filter3=32

In [0]:
pool_size=2
pool_stride=2
pool_pad='valid'

In [0]:
x=tf.placeholder(tf.float32,shape=[None,nodes,nodes])
y=tf.placeholder(tf.float32,shape=[None,classes])

# Convolutional & Max-pooling Layer 1
conv1=tf.layers.conv2d(tf.expand_dims(x,-1),filters=filter1, kernel_size = kernel_size, padding=pad,activation=tf.nn.relu, name="conv1")
pool1=tf.layers.max_pooling2d(conv1,pool_size=pool_size,strides=pool_stride,padding=pool_pad)

# Convolutional & Max-pooling Layer 2
conv2=tf.layers.conv2d(pool1,filters=filter2, kernel_size = kernel_size, padding=pad,activation=tf.nn.relu, name="conv2")
pool2=tf.layers.max_pooling2d(conv2,pool_size=pool_size,strides=pool_stride,padding=pool_pad)

# Convolutional & Max-pooling Layer 3
conv3=tf.layers.conv2d(pool2,filters=filter3, kernel_size = kernel_size, padding=pad,activation=tf.nn.relu, name="conv3")
pool3=tf.layers.max_pooling2d(conv3,pool_size=pool_size,strides=pool_stride,padding=pool_pad)

fc=tf.layers.dense(tf.layers.flatten(pool3),classes,activation=tf.nn.softmax,name='fc')
labels=tf.argmax(fc,axis=1)

In [73]:
loss=-tf.reduce_sum(y*tf.log(fc))  # Loss function:Mean squared error function
train_step = tf.train.AdamOptimizer(learning_rate=0.00001).minimize(loss)  # Adam Optimisation
sess=tf.InteractiveSession()
tf.global_variables_initializer().run()
split=math.ceil(vector.shape[0]*0.7)
indices=np.arange(vector.shape[0])
np.random.shuffle(indices)  # Shuffle the order of the data
maxEpoch=1001  # Total number of epochs
for i in range(maxEpoch):
  errt, _=sess.run([loss,train_step], feed_dict={x:vector[indices[:split]], y: trainY[indices[:split]]})
  if not i%100:
    print('Epoch number:',i,' Loss:',errt)
accuracy=sess.run(tf.reduce_mean(tf.cast(tf.equal(labels,tf.argmax(y,axis=1)), tf.float32)), feed_dict={x:vector[indices[split:]], 
                                                                                                                 y:trainY[indices[split:]]})*100



Epoch number: 0  Loss: 385927.62
Epoch number: 100  Loss: 209877.45
Epoch number: 200  Loss: 113795.97
Epoch number: 300  Loss: 78154.234
Epoch number: 400  Loss: 62831.656
Epoch number: 500  Loss: 55909.03
Epoch number: 600  Loss: 53205.13
Epoch number: 700  Loss: 52012.47
Epoch number: 800  Loss: 51411.613
Epoch number: 900  Loss: 51068.375
Epoch number: 1000  Loss: 50843.14


In [0]:
df2=pd.read_pickle(directory+'/demonetisation.pkl')
corpus_raw1=df2['cleanText'].values.tolist()
trainY=np.zeros((df2.shape[0],classes))
trainY[:,df2['isRetweet'].values]=1
# vector1 = word2vec(corpus_raw1,window_size,embedding_dim,nodes) 
# np.save(directory+'/vector1.npy',vector1)
vector1=np.load(directory+'/vector1.npy')
pool=sess.run(pool3,feed_dict={x:vector1})
pool=pool.reshape((-1,288))
sess.close()

In [0]:
tf.reset_default_graph()  # To reset all the parameters of the graph for every execution
p=tf.placeholder(tf.float32,shape=[None,288])
y=tf.placeholder(tf.float32,shape=[None,classes])
fc1=tf.layers.dense(p,72,activation=tf.nn.relu,name='fc1')
fc2=tf.layers.dense(fc1,classes,activation=tf.nn.sigmoid,name='fc2')
labels=tf.argmax(fc2,axis=1)

In [76]:
loss=-tf.reduce_sum(y*tf.log(fc2))  # Loss function:Mean squared error function
train_step = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)  # Adam Optimisation
sess=tf.InteractiveSession()
tf.global_variables_initializer().run()
# split=math.ceil(vector1.shape[0]*0.7)
# indices=np.arange(df2.shape[0])
# np.random.shuffle(indices)  # Shuffle the order of the data
split=vector1.shape[0]
maxEpoch=101  # Total number of epochs
for i in range(maxEpoch):
  errt, _=sess.run([loss,train_step], feed_dict={p:pool[:split],y:trainY[:split]})
  if not i%10:
    print('Epoch number:',i,' Loss:',errt)
test_pool=pool[:split]
testY=np.argmax(trainY[:split],axis=1)
predictY,dense2=sess.run([labels,fc2],feed_dict={p:test_pool})
f1=getF1(testY,predictY)
sess.close()



Epoch number: 0  Loss: 23657.848
Epoch number: 10  Loss: 0.77475035
Epoch number: 20  Loss: 0.0911819
Epoch number: 30  Loss: 0.036685508
Epoch number: 40  Loss: 0.025082052
Epoch number: 50  Loss: 0.021439958
Epoch number: 60  Loss: 0.02004286
Epoch number: 70  Loss: 0.01940101
Epoch number: 80  Loss: 0.01902841
Epoch number: 90  Loss: 0.018746994
Epoch number: 100  Loss: 0.018497126
