In [1]:
import pandas as pd
import numpy as np


## Initial processing

In [2]:
# Load the preprocessed dataset. Later cells read its 'vector',
# 'NumberOfWords' and 'BooleanSentiment' columns.
data = pd.read_csv("data/processed_data.csv")

In [3]:
def string_to_num_list(string):
    """Parse a stringified integer list such as ``"[3, 10, 34]"`` into ``[3, 10, 34]``.

    Args:
        string: Text of comma-separated integers, optionally wrapped in
            square brackets and/or surrounded by spaces.

    Returns:
        A list of ``int``. An empty list literal (``"[]"``) or a blank string
        yields ``[]``.

    Raises:
        ValueError: if any comma-separated item is not a valid integer.
    """
    inner = string.strip(" []")
    # "".split(',') returns [''], and int('') raises, so handle "nothing
    # between the brackets" explicitly.
    if not inner:
        return []
    return [int(num) for num in inner.split(',')]

def string_to_list(string):
    """Split a stringified list into its raw comma-separated pieces.

    Leading/trailing spaces and square brackets are stripped from the whole
    string first; the individual pieces are returned untouched (inner
    whitespace is preserved).
    """
    inner = string.strip(" []")
    pieces = inner.split(',')
    return pieces

# Parse the stringified token-id lists read from the CSV into real lists.
# Values that are already lists are passed through unchanged, so re-running
# this cell is a no-op instead of failing on `list.strip`.
data['vector'] = data['vector'].apply(
    lambda value: value if isinstance(value, list) else string_to_num_list(value)
)

## OH Vectorization

In [4]:
# Determine the max width: the largest 'NumberOfWords' value in the dataset.
# Series.max() is vectorised and skips NaN, unlike the builtin max(), whose
# result depends on element order when NaN is present. Cast to a plain int so
# it can be used directly as an array dimension.
width = int(data['NumberOfWords'].max())

In [5]:
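# WARNING: running this cell on the full dataset will most likely exhaust your machine's memory,
# because every sample becomes a dense 7774 x 58 matrix. Consider doing the OH vectorization per batch at runtime instead.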
def OH_vector(vector_list, vocab=7774, max_width=None):
    """One-hot encode a sequence of token ids as a ``(vocab, max_width, 1)`` array.

    Column ``i`` of the result holds a single 1 in row ``vector_list[i]``.
    Columns past the end of the sequence stay all-zero.

    Args:
        vector_list: Iterable of integer token ids used as row indices.
        vocab: Number of rows in the matrix. Defaults to 7774, the value
            previously hard-coded here.
        max_width: Number of columns. ``None`` (the default) falls back to
            the notebook-level global ``width``.

    Returns:
        ``np.int8`` array of shape ``(vocab, max_width, 1)``; the trailing
        axis of size 1 is added with ``np.expand_dims``.

    Raises:
        IndexError: if a token id does not index a valid row, or the
            sequence has more than ``max_width`` entries.
    """
    if max_width is None:
        # Keep the original behaviour for existing one-argument callers.
        max_width = width
    tokens = list(vector_list)
    OH_matrix = np.zeros((vocab, max_width), dtype=np.int8)
    if tokens:
        # Pair each token id (row) with its position (column) and set all
        # entries in one indexed assignment.
        OH_matrix[tokens, np.arange(len(tokens))] = 1
    return np.expand_dims(OH_matrix, axis=-1)

def BOW(vector_list, vocab=7774):
    """Build a binary bag-of-words vector from a sequence of token ids.

    Args:
        vector_list: Iterable of integer token ids used as indices.
        vocab: Length of the returned vector. Defaults to 7774, the value
            previously hard-coded here.

    Returns:
        ``np.float32`` array of shape ``(vocab,)`` with 1.0 at every index
        that occurs in ``vector_list`` and 0.0 elsewhere. Repeated ids still
        produce 1.0 (presence, not counts).

    Raises:
        IndexError: if a token id does not index a valid position.
    """
    tokens = list(vector_list)
    bag = np.zeros(vocab, dtype=np.float32)
    if tokens:
        bag[tokens] = 1
    return bag

# Positional split: the first 30000 rows are used for training and the rest
# for testing. Features are the one-hot matrices built from 'vector'; labels
# come from 'BooleanSentiment'.
train, test = data.iloc[:30000], data.iloc[30000:]
train_x, train_y = train['vector'].apply(OH_vector), train['BooleanSentiment']
test_x, test_y = test['vector'].apply(OH_vector), test['BooleanSentiment']

In [6]:
# Stack each Series of per-row matrices into one dense array with a leading
# sample axis, then display the test array's shape.
train_x = np.array(train_x.tolist())
test_x = np.array(test_x.tolist())
test_x.shape

(6623, 7774, 58, 1)

In [7]:
# Convert the label Series to float32 arrays, then display the test labels'
# shape.
train_y = np.array(train_y.tolist()).astype(np.float32)
test_y = np.array(test_y.tolist()).astype(np.float32)
test_y.shape

(6623,)

## Create CNN

In [8]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

In [21]:
# Convolutional layers
model = models.Sequential()
# The kernel covers the whole first axis (7774 rows) and 7 adjacent columns,
# so with 'valid' padding the first output dimension collapses to 1.
model.add(layers.Conv2D(32, (7774, 7), activation='tanh', input_shape=(7774, 58,1)))
# Pool only along the second axis; the first is already of size 1.
model.add(layers.MaxPooling2D((1, 3)))

# Dense/FC layers
model.add(layers.Flatten())
# Sigmoid output so the model emits a probability in [0, 1]. The string loss
# 'binary_crossentropy' defaults to from_logits=False, and the 'accuracy'
# metric thresholds at 0.5, so a linear (logit) output here would be wrong.
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 1, 52, 32)         1741408   
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 1, 17, 32)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 544)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 545       
Total params: 1,741,953
Trainable params: 1,741,953
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
# NOTE(review): the string loss uses Keras defaults (from_logits=False), so
# the model's output layer should emit probabilities; confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs.
# NOTE(review): the test split is used as validation data here, so the
# reported validation metrics are not from an independent hold-out.
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(test_x, test_y),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Preview the training frame; displaying all 30000 rows adds nothing over
# the first few and bloats the saved notebook.
train.head()

In [12]:
# Mean of the training labels: the positive-class share, assuming the labels
# are 0/1 (TODO confirm).
sum(train_y)/len(train_y)

0.5365

In [13]:
# Mean of the test labels: the positive-class share, assuming the labels are
# 0/1 (TODO confirm). Compare with the training split to check both are
# similarly balanced.
sum(test_y)/len(test_y)

0.5280084553827571

In [None]:
# model.save("model/CNN1.h5")

In [None]:
## Custom example

In [None]:
# Encode one hand-written token-id sequence and prepend a batch axis of
# size 1, then display the resulting shape.
sample = OH_vector([3, 10, 34, 248, 881, 14, 107])[np.newaxis]
sample.shape

In [None]:
# Run the model on the single-sample batch built in the previous cell.
model.predict(sample)