> # **Building the One-Hot Encoding from Scratch**


> - Import the libraries

In [1]:
import numpy as np
import string
import matplotlib.pyplot as plt

> - Define the corpus: a collection of sentences (here, a single sentence)

In [2]:
# Toy corpus: each list element is one raw sentence (only one is used here).
corpus = [
    'cat in the hat dog on the mat bird in the tree',
]

> - Pre-process the text: lowercase it, strip punctuation, and split it into word tokens

In [4]:
def text_preprocess(text):
  """Normalize a raw sentence and split it into word tokens.

  The text is lowercased, every character listed in `string.punctuation`
  is deleted (not replaced by a space), and the result is split on
  whitespace.

  Args:
    text: The raw sentence as a string.

  Returns:
    A list of lowercase tokens; an empty list if no tokens remain.
  """
  # Translation table that maps every punctuation character to "deleted".
  punctuation_remover = str.maketrans('', '', string.punctuation)
  lowered = text.lower()
  return lowered.translate(punctuation_remover).split()

# Tokenize every sentence of the corpus; the result is a list of token lists.
tokenized_corpus = list(map(text_preprocess, corpus))
print(tokenized_corpus)

[['cat', 'in', 'the', 'hat', 'dog', 'on', 'the', 'mat', 'bird', 'in', 'the', 'tree']]


> - Build the vocabulary
    1. The vocabulary contains all unique words across the corpus, sorted alphabetically.
    2. A dictionary maps each word to a unique index.

In [5]:
# Merge the tokens of all sentences into one set to drop duplicates, then sort
# so that the word order (and therefore every index) is deterministic.
vocabulary = sorted(set().union(*tokenized_corpus))
# Pair each word with its position in the sorted vocabulary.
word_to_index = dict(zip(vocabulary, range(len(vocabulary))))

print(vocabulary)
print(word_to_index)

['bird', 'cat', 'dog', 'hat', 'in', 'mat', 'on', 'the', 'tree']
{'bird': 0, 'cat': 1, 'dog': 2, 'hat': 3, 'in': 4, 'mat': 5, 'on': 6, 'the': 7, 'tree': 8}


> - One-hot encode the sentences: each word becomes a vector with a single 1 at its vocabulary index

In [9]:
def one_hot_encoding(sentence, word_to_index):
  """Encode a tokenized sentence as a matrix of one-hot row vectors.

  Args:
    sentence: Iterable of word tokens to encode.
    word_to_index: Dict mapping each vocabulary word to its column index.
      The indices are assumed to be 0..len(word_to_index)-1, as produced by
      enumerating the vocabulary.

  Returns:
    Integer NumPy array of shape (len(sentence), len(word_to_index)); row i
    holds a single 1 in the column of the i-th word. An empty sentence gives
    an array of shape (0, len(word_to_index)).

  Raises:
    KeyError: If a word in `sentence` is missing from `word_to_index`.
  """
  # Size the vectors from the mapping that was passed in, not from a
  # notebook-level global, so the function works with any vocabulary and
  # does not depend on which cells were run before it.
  vocab_size = len(word_to_index)
  columns = np.array([word_to_index[word] for word in sentence], dtype=np.intp)
  encoded_sentence = np.zeros((len(columns), vocab_size), dtype=int)
  # Set one 1 per row: row i, column columns[i].
  encoded_sentence[np.arange(len(columns)), columns] = 1
  return encoded_sentence

In [10]:
# Encode the first (and only) sentence: one row per token, one column per
# vocabulary word.
encoded_vectors = one_hot_encoding(
    sentence=tokenized_corpus[0],
    word_to_index=word_to_index,
)
print(encoded_vectors)

[[0 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1]]
