In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import os
from collections import Counter

filename = "/kaggle/input/worldnews-on-reddit/reddit_worldnews_start_to_2016-11-22.csv"
# Only the 'title' column is needed; .pop() turns the one-column frame into a Series.
df = pd.read_csv(filename, usecols=['title'], dtype={'title': 'str'}).pop('title')

# Words longer than this many characters are excluded from the vocabulary.
max_word_length = 10

# Count every whitespace-separated word across all titles.
# A missing title is read as NaN (a float) even with dtype=str and has no
# .split(), so iterate over the non-null titles only; `df` itself is untouched.
vocabulary = Counter()
for title in df.dropna():
    vocabulary.update(word for word in title.split() if len(word) <= max_word_length)

# Truncate vocabulary to the `vocab_size` most frequent words.
# most_common(n) selects the top n directly instead of sorting the whole
# vocabulary and slicing it afterwards.
vocab_size = 1000
truncatedVocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

# Print out the 10 most common words and the number of times they occur
print("Most common words:", vocabulary.most_common(10))

# Build a static word -> integer-ID lookup table from the truncated vocabulary.
# Keys are the vocabulary words as a string tensor; values are consecutive
# int64 IDs 0..len-1 in the same order as `truncatedVocabulary`.
words = tf.constant(truncatedVocabulary)
word_ids = tf.range(len(truncatedVocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

# Words that are not in the vocabulary are assigned to one of this many
# out-of-vocabulary buckets by the table.
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

# Smoke-test the table on a short sentence and show the resulting IDs.
testArr = "China and Iraq are in the dataset".split()
testRes = table.lookup(tf.constant(testArr)).numpy()
print("Test result:", testRes)







Most common words: [('to', 204943), ('in', 196008), ('the', 176203), ('of', 170501), ('a', 93169), ('and', 84224), ('s', 78501), ('for', 75960), ('on', 70010), ('-', 40879)]
Test result: [  20    5   92   26    1    2 1677]



User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=false
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'
   KMP_REDUCTION_BARRIER='1,1'
   KMP_REDUCTION_BAR