add data

In [1]:
import tensorflow as tf
import numpy as np

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=4000)

word_index = tf.keras.datasets.imdb.get_word_index()
index2word = dict((i + 3, word) for (word, i) in word_index.items())
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'
x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [2]:
vocabulary = list()
for text in x_train:
  tokens = text.split()
  vocabulary.extend(tokens)

vocabulary = set(vocabulary)
print(len(vocabulary))

3998


In [3]:
from tqdm import tqdm

x_train_binary = list()
x_test_binary = list()

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)

100%|██████████| 25000/25000 [07:22<00:00, 56.52it/s]
100%|██████████| 25000/25000 [07:11<00:00, 57.88it/s]


In [None]:
print(x_test_binary[2])
print(y_test[2])
print(x_train_binary[2])
print(y_train[2])

[0 0 0 ... 0 0 0]
1
[0 0 0 ... 0 0 0]
0


Calculate entropy

In [45]:
def calculate_entropy_word(binary_array, c, word, word_position = -1):
  has_the_word = 0 # c value
  count = 0 # lengeth
  is_positive = 0
  if word_position == -1:
    word_position = return_word(word) # the word column
  
  for element in binary_array:
      element = element[word_position]
      count += 1
      if(element == 1):
        has_the_word += 1
        if(y_train[count-1] == c):
          is_positive += 1

  
  cProb = is_positive/has_the_word
  if(cProb == 0 or cProb == 1):
    return None
  return - ( cProb * np.log2(cProb) ) - ( (1.0 - cProb) * np.log2(1.0 - cProb))


def calculate_entropy_total(binary_array):
  first = 0 # c value
  count = 0 # lengeth
  for element in binary_array:
      if(y_train[count] == 1):
        first += 1
      count += 1
  cProb = first/binary_array.shape[0]
  if(cProb == 0 or cProb == 1):
            return None        
  return - ( cProb * np.log2(cProb) ) - ( (1.0 - cProb) * np.log2(1.0 - cProb))

  

print(calculate_entropy_total(x_train_binary))  

1.0


return the position of a word

In [44]:
# -1 = has not found the word
def return_word(word):
  count = 0
  for element in vocabulary:
    if word == element:
      return count
    else:
      count += 1
  
  return -1

calculate probability

In [43]:
def calculate_probability(array, c, word, word_position = -1):
  has_the_word = 0 # c value
  count = 0 # lengeth
  is_positive = 0
  if word_position == -1:
    word_position = return_word(word) # the word column
  
  for element in array:
      element = element[word_position]
      count += 1
      if(element == 1):
        has_the_word += 1
        if(y_train[count-1] == c):
          is_positive += 1
  
  return is_positive/has_the_word

calculate sum

In [42]:
def calculate_sum(binary_array, word):
  try:
    word_position = word.index
    p0 = (calculate_probability(binary_array, 0, word, word_position))
    p1 = (calculate_probability(binary_array, 1, word, word_position))
    word.setEntropy0(calculate_entropy_word(binary_array, 0, word,  word_position))
    word.setEntropy1(calculate_entropy_word(binary_array, 1, word,  word_position))
    a = p0 * word.entropy0
    b = p1 * word.entropy1
    return  a + b
  except:
    return None

calculate IG

In [41]:
def calculate_IG(binary_array, c, word, calculate_entrupy_total = 'no value'):
  if calculate_entrupy_total == 'no value':
    calculate_entrupy_total = calculate_entropy_total(binary_array)
  try:
    word.setIg(calculate_entrupy_total - calculate_sum(binary_array, word))
  except:
    word.setIg(None)

In [40]:
class Word:

  def __init__(self, value:str, index:int):
    self.value = value
    self.index = index
    self.p0 = None
    self.p1 = None
    self.ig = None
    self.entropy0 = None
    self.entropy1 = None


  def setP0(self, p0):
    self.p0 = p0
  
  def setP1(self, p1):
    self.p1 = p1
  
  def setIg(self, ig):
    if(ig != None):
      self.ig = float(ig)

  def setEntropy0(self, entropy0):
    self.entropy0 = entropy0

  def setEntropy1(self, entropy1):
    self.entropy1  = entropy1

  def __str__(self):
    return self.value

  def __repr__(self):
    return self.value + " " + str(self.ig)
  

Node

In [59]:
class Node:
  value: Word
  left: Node = None # must be Node
  right: Node = None # must be Node

  def __init__(self, v: int = None, max_depth = 10):
    self.value = v
    self.max_depth = max_depth
  
  def setLeftChild(self, v):
    self.left = v

  def setRightChild(self, v):
    self.right  = v

  def setChilds(self, l, r):
    self.setLeftChild(l)
    self.setRightChild(r)

  def __str__(self):
    return self.value.value
      


Remove words

In [14]:
def hasTheWord(word1: Word, word2: Word):
  index1 = int(word1.index)
  index2 = int(word2.index)
  for x in x_train_binary:
    if(x[index1] == 1 and x[index2] == 1):
      return 1
  return 0


In [15]:
#there is the word in the setence
def findValueFromArray(array, index):
  if len(array) < index:
    return None
  count = 0
  for x in array:
    if(count == index):
      return x
    count+= 1

In [16]:
def findTheNextWord(maxWord, array, c):
  nextWord = array.pop()
  while hasTheWord(maxWord, nextWord) != c:
    if len(array) == 0:
      return None
    try:
      nextWord = array.pop()
    except:
      return None
  return nextWord

create words

In [55]:
test = x_train_binary[0]
ar = x_train_binary.tolist()
i = 0
et = calculate_entropy_total(x_train_binary)
words = list()
for x in vocabulary:
  words.append(Word(x, i))
  i+= 1
words_to_remove = list()
for word in words:
  calculate_IG(x_train_binary, 1, word, et)
  word.setP0(calculate_probability(x_train_binary, 0, word, word.index))
  word.setP1(calculate_probability(x_train_binary, 1, word, word.index))
  if(word.ig == None):
    words_to_remove.append(word)



for x in words_to_remove:
  words.remove(x)

words.sort(key=lambda x: x.ig, reverse=True)
for x in words:
  print(x.value)
  print(x.p0)
  print(x.p1)
  print(x.ig)
  print("\n")

[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m


saw
0.47152702212549874
0.5284729778745013
0.002340481873942979


was
0.528334570650829
0.471665429349171
0.002317770826343879


lisa
0.4716981132075472
0.5283018867924528
0.0023124239647449585


causes
0.5283018867924528
0.4716981132075472
0.0023124239647449585


proceedings
0.4716981132075472
0.5283018867924528
0.0023124239647449585


angels
0.5283018867924528
0.4716981132075472
0.0023124239647449585


thought
0.5283018867924528
0.4716981132075472
0.0023124239647449585


self
0.5282651072124757
0.47173489278752434
0.002306414465754969


though
0.47173489278752434
0.5282651072124757
0.002306414465754969


again
0.47180277349768873
0.5281972265023113
0.0022953438472432808


forgotten
0.47181008902077154
0.5281899109792285
0.0022941523559198984


playing
0.4718861209964413
0.5281138790035588
0.0022817872456486477


q
0.47191011235955055
0.5280898876404494
0.0022778924668256373


statement
0.5280898876404494
0.4719101

try with node

In [61]:
wordsNodes = Node(words.pop())
i = 0
while len(words) > 0:
  wor = words.pop()
  nod = wordsNodes
  while True:
    if hasTheWord(wor, nod.value) == 1:
      if nod.right == None:
        nod.setRightChild(Node(wor))
        break
      else:
        nod = nod.right
    else:
      if nod.left == None:
        nod.setLeftChild(Node(wor))
        break
      else:
        nod = nod.left


print(wordsNodes)

IndexError: ignored

In [57]:
good_result = 0
count = 0
for sentense in x_train_binary:
  tree = wordsNodes
  result = None
  while True:
    if(tree.value.p0 == 1 or tree.value.p1 == 1 or (tree.left == None and tree.right == None)):
      result = tree.value
      break
    else:
      sentense_word = findValueFromArray(sentense, tree.value.index)
      if(sentense_word == None):
        if(tree.left == None):
          result = tree.value
          break
        tree = tree.left
      else:
        if(tree.right == None):
          result = tree.value
          break 
        tree = tree.right
  if(result.p0 > result.p1 and y_test[count] == 0):
    good_result += 1
  elif (result.p0 < result.p1 and y_test[count] == 1):
    good_result += 1
  count += 1
  if(count == 1000):
    break
  if(count % 100 == 1):
    print(good_result/count*100)

100.0
46.53465346534654
54.72636815920397
52.823920265780735
51.87032418952619
52.29540918163673
50.41597337770383
52.06847360912982
52.30961298377028
52.497225305216425
