In [1]:
import pandas as pd
import numpy as np


## Initial processing

In [2]:
data = pd.read_csv("data/processed_data.csv")

In [3]:
def string_to_num_list(string):
    return [int(num) for num in string.strip(" []").split(',')]

def string_to_list(string):
    return string.strip(" []").split(',')

data['vector'] = data['vector'].apply(string_to_num_list)

## OH Vectorization

In [4]:
# determine max width
width = max(data['NumberOfWords'])

In [5]:
# Attempting to run this will brick your comp (most likely). Instead you can try to do OH vectorization at runtime.
def OH_vector(vector_list):
    global width
    vocab = 7774
    OH_matrix = np.zeros((vocab,width), dtype=np.float32)
    for index,vector in enumerate(vector_list):
        OH_matrix[vector][index]=1
#     OH_matrix = np.expand_dims(OH_matrix, axis=0)
    OH_matrix = np.expand_dims(OH_matrix, axis=-1)
    return OH_matrix

train = data[:10000]
train_x = train['vector'].apply(OH_vector)
train_y = train['BooleanSentiment']
test = data[10000:11000]
test_x = test['vector'].apply(OH_vector)
test_y = test['BooleanSentiment']

In [6]:
test_x = np.array(test_x.to_list())
train_x = np.array(train_x.to_list())
test_x.shape

(1000, 7774, 58, 1)

In [7]:
test_y = np.array(test_y.to_list()).astype(np.float32)*0.99999999999
train_y = np.array(train_y.to_list()).astype(np.float32)*0.99999999999
test_y.shape

(1000,)

## Create CNN

In [8]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

In [9]:
# Convolutional layers
model = models.Sequential()
model.add(layers.Conv2D(32, (7774, 5), activation='relu', input_shape=(7774, 58,1)))
model.add(layers.MaxPooling2D((1, 3)))

# Dense/FC layers
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 1, 54, 32)         1243872   
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 1, 18, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 576)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                36928     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,280,865
Trainable params: 1,280,865
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_x, train_y, epochs=10, 
                    validation_data=(test_x, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
train

Unnamed: 0.1,Unnamed: 0,OriginalTweet,Sentiment,LowerTweet,BooleanSentiment,tokenized_text,stemmed_tokens,vector,NumberOfWords
0,0,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...,1,"['advice', 'talk', 'to', 'your', 'neighbours',...","['advic', 'talk', 'to', 'your', 'neighbour', '...","[593, 452, 1, 32, 1470, 184, 1, 2061, 805, 388...",37
1,1,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...,1,"['coronavirus', 'australia', 'woolworths', 'to...","['coronaviru', 'australia', 'woolworth', 'to',...","[8, 674, 1499, 1, 220, 315, 962, 1324, 33, 209...",16
2,2,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...,1,"['my', 'food', 'stock', 'is', 'not', 'the', 'o...","['my', 'food', 'stock', 'is', 'not', 'the', 'o...","[38, 16, 60, 10, 34, 0, 118, 12, 203, 10, 183,...",42
3,3,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the covid...,0,"['me', 'ready', 'to', 'go', 'at', 'supermarket...","['me', 'readi', 'to', 'go', 'at', 'supermarket...","[88, 688, 1, 40, 19, 21, 66, 0, 3, 163, 34, 10...",36
4,4,As news of the regionÂs first confirmed COVID...,Positive,as news of the regions first confirmed covid c...,1,"['as', 'news', 'of', 'the', 'regions', 'first'...","['as', 'new', 'of', 'the', 'region', 'first', ...","[27, 80, 4, 0, 1357, 210, 864, 3, 251, 862, 45...",37
...,...,...,...,...,...,...,...,...,...
9995,9995,#Coronavirus tip: shop in your local Asian sup...,Extremely Positive,coronavirus tip shop in your local asian supe...,1,"['coronavirus', 'tip', 'shop', 'in', 'your', '...","['coronaviru', 'tip', 'shop', 'in', 'your', 'l...","[8, 368, 33, 7, 32, 110, 1119, 21, 525, 108, 1...",28
9996,9996,Saudi Arabia will nearly double its debt ceili...,Extremely Negative,saudi arabia will nearly double its debt ceili...,0,"['saudi', 'arabia', 'will', 'nearly', 'double'...","['saudi', 'arabia', 'will', 'nearli', 'doubl',...","[763, 1145, 36, 1228, 790, 14, 834, 5912, 1, 1...",41
9997,9997,I went to put a few bits in Food Bank collecti...,Extremely Negative,i went to put a few bits in food bank collecti...,0,"['went', 'to', 'put', 'few', 'bits', 'in', 'fo...","['went', 'to', 'put', 'few', 'bit', 'in', 'foo...","[253, 1, 190, 298, 684, 7, 16, 181, 731, 19, 0...",45
9998,9998,It's a novel experience watching a government ...,Positive,it s a novel experience watching a government ...,1,"['it', 'novel', 'experience', 'watching', 'gov...","['it', 'novel', 'experi', 'watch', 'govern', '...","[14, 1059, 707, 407, 168, 67, 4379, 7, 378, 59]",10


In [12]:
sum(train_y)/len(train_y)

0.5006

In [13]:
sum(test_y)/len(test_y)

0.506

In [14]:
model.save("model/CNN1.h5")

In [15]:
## Custom example

In [27]:
sample = OH_vector([3, 10, 34, 248, 881, 14, 107])
sample = np.expand_dims(sample, axis=0)
sample.shape

(1, 7774, 58, 1)

In [28]:
model.predict(sample)

array([[1.8567021]], dtype=float32)