In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
import os
from collections import Counter

filename = "/kaggle/input/worldnews-on-reddit/reddit_worldnews_start_to_2016-11-22.csv"
df = pd.read_csv(filename, usecols=['title', 'over_18'], dtype={'title': 'str', 'over_18': 'int32'})
dfX = df.loc[:, 'title']
dfY = df.loc[:, 'over_18']
dfX = dfX[:10000]
dfY = dfY[:10000]

vocabulary = Counter()
for title in dfX:
    words = title.split()
    validWords = filter(lambda x: len(x) <= 10, words)
    vocabulary.update(validWords)

# Truncate vocabulary
vocab_size = 1000
truncatedVocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]]

# Print out the 10 most common words and the number of times they occur
print("Most common words:", vocabulary.most_common()[:10])

# Convert words to tensor
words = tf.constant(truncatedVocabulary)

# Assign each word an ID
word_ids = tf.range(len(truncatedVocabulary), dtype=tf.int64)

# Create KeyValueTensor
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

# Create lookup table
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Test the lookup table
testArr = "China and Iraq are in the dataset".split()
testRes = table.lookup(tf.constant(testArr))
print("Test result:", testRes)

Most common words: [('to', 2753), ('in', 2562), ('the', 2534), ('of', 2350), ('and', 1281), ('a', 1241), ('for', 1146), ('s', 1141), ('on', 907), ('The', 760)]
Test result: tf.Tensor([  17    4   20   22    1    2 1677], shape=(7,), dtype=int64)



User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=false
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'
   KMP_REDUCTION_BARRIER='1,1'
   KMP_REDUCTION_BAR

In [2]:
# Convert sentences to arrays of word ids
data = []
for title in dfX:
    sample = tf.strings.split(title)
    processed = table.lookup(sample)
    data.append(processed.numpy())

# Create a ragged tensor and then convert it to a padded dense tensor
ragged = tf.ragged.constant(data)
ragged = ragged.to_tensor(default_value=0)
#print(ragged)
#print(ragged.shape)

# TODO - Make into 2d tensor
#features1 = tf.constant([[1, 3], [2, 1], [3, 3]]) # ==> 3x2 tensor
#print(features1.shape)
#labels1 = tf.constant(['A', 'B', 'A']) # ==> 3x1 tensor
#print(labels1.shape)
#test = tf.data.Dataset.from_tensor_slices((features1, labels1))
#print(next(test.batch(32).as_numpy_iterator())[0])

features = tf.constant(ragged)
#print(features.shape)
labels = tf.constant(dfY)
#print(labels.shape)
train_set = tf.data.Dataset.from_tensor_slices((features, labels)).batch(32)
#print(next(train_set.batch(32).as_numpy_iterator())[0][0])


In [3]:
# Create model
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, batch_size=32, epochs=5)



Epoch 1/5


2022-01-29 07:25:06.824428: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
testArr = tf.strings.split("Sex sexual dildo bag tits")
test = table.lookup(testArr)
print("testing: ", test)
prediction = model.predict(test)
print("prediction:", prediction)
print(prediction.shape)
print("avg:", np.average(prediction, axis=0))

testing:  tf.Tensor([ 856 1092 1714 1226 1615], shape=(5,), dtype=int64)
prediction: [[0.46560842]
 [0.46780932]
 [0.43600795]
 [0.41654137]
 [0.42808723]]
(5, 1)
avg: [0.44281083]
