# **Bag of Words**

In [3]:
vocab = {}  # maps each word to the integer representing it, i.e. {..., "word": word encoding, ...}
word_encoding = 1  # integer that will be assigned to the next unseen word


def bag_of_words(text):
  """Encode `text` as a bag of words.

  The text is lower-cased and split on whitespace. Every word that is not
  yet in the module-level `vocab` is registered there with the next free
  integer (`word_encoding`, which is then incremented).

  Args:
    text: the string to encode. Punctuation is not stripped, so "bad" and
      "bad." count as different words.

  Returns:
    A dict {word encoding: frequency} for the words of `text`. Word order
    is not recorded. Text with no words gives an empty dict.
  """
  global word_encoding

  # split() with no argument splits on any run of whitespace, so repeated
  # spaces, tabs, newlines or an empty string never produce an empty-string
  # "word" that would pollute the vocabulary.
  words = text.lower().split()
  bag = {}  # stores the encodings and their frequency, i.e. {..., word encoding: frequency, ...}

  for word in words:
    encoding = vocab.get(word)
    if encoding is None:  # first time this word is seen: give it a new encoding
      encoding = word_encoding
      vocab[word] = encoding
      word_encoding += 1  # reserve the next integer for the next new word

    bag[encoding] = bag.get(encoding, 0) + 1  # count this occurrence

  return bag

# Demo: encode one sentence, then show its bag followed by the vocabulary built so far.
text = "Helloooo my name is Ching Jaymi Mae Lim"
bag = bag_of_words(text)
print(bag, vocab, sep="\n")  # one object per line

{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}
{'helloooo': 1, 'my': 2, 'name': 3, 'is': 4, 'ching': 5, 'jaymi': 6, 'mae': 7, 'lim': 8}


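Drawbacks: a bag of words discards word order, so two sentences with opposite meanings can produce exactly the same bag, as the two reviews below show.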

In [4]:
# Two reviews made of the same words with the same counts; only the
# positions of "bad" and "amazing" are swapped.
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

# The tuple is evaluated left to right, so the positive review is encoded first.
pos_bag, neg_bag = bag_of_words(positive_review), bag_of_words(negative_review)

# Both bags hold the same encoding/frequency pairs.
print("Positive:", pos_bag)
print("Negative:", neg_bag)

Positive: {9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative: {9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 1, 15: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


# **Integer Encoding**

In [5]:
vocab = {}  # maps each word to its integer code (reset for this section)
word_encoding = 1  # code that will be assigned to the next unseen word


def one_hot_encoding(text):
  """Encode `text` as a list of integer codes, one per word, in order.

  NOTE: despite its name, this function performs integer encoding (each
  word becomes a single integer), not one-hot encoding. The name is kept
  so existing callers keep working.

  The text is lower-cased and split on whitespace. Every word that is not
  yet in the module-level `vocab` is registered there with the next free
  integer (`word_encoding`, which is then incremented).

  Args:
    text: the string to encode. Punctuation is not stripped.

  Returns:
    A list with the integer code of every word of `text`, in the order the
    words appear. Text with no words gives an empty list.
  """
  global word_encoding

  encoding = []

  # split() with no argument splits on any run of whitespace, so repeated
  # spaces, tabs, newlines or an empty string never produce an empty-string
  # "word" that would receive its own code.
  for word in text.lower().split():
    code = vocab.get(word)
    if code is None:  # first time this word is seen: give it a new code
      code = word_encoding
      vocab[word] = code
      word_encoding += 1
    encoding.append(code)

  return encoding

# Demo: encode a sentence with repeated words, then show the sequence of
# codes followed by the vocabulary built so far.
text = "this is a test to see if this test will work is is test a a"
encoding = one_hot_encoding(text)
print(encoding, vocab, sep="\n")  # one object per line

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [6]:
# Same two reviews as in the bag-of-words section; only the positions of
# "bad" and "amazing" are swapped.
positive_review = "I thought the movie was going to be bad but it was actually amazing"
negative_review = "I thought the movie was going to be amazing but it was actually bad"

# The tuple is evaluated left to right, so the positive review is encoded first.
pos_encode, neg_encode = one_hot_encoding(positive_review), one_hot_encoding(negative_review)

# The sequences keep word order, so the codes of the two swapped words
# appear at different positions.
print("Positive:", pos_encode)
print("Negative:", neg_encode)

Positive: [10, 11, 12, 13, 14, 15, 5, 16, 17, 18, 19, 14, 20, 21]
Negative: [10, 11, 12, 13, 14, 15, 5, 16, 21, 18, 19, 14, 20, 17]
