# Vector Representation

## Tokenization with split

In [1]:
import numpy as np
# Example sentence, tokenized naively on whitespace: punctuation stays attached
# to its word (the last token is "1987." including the period).
sentence = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""
token_sequence = sentence.split()
# Vocabulary = the unique tokens in sorted order. sorted() compares code points,
# so digits and capitalized words come before lowercase ones.
vocab = sorted(set(token_sequence))
print(vocab)


['1987.', 'Heritage', 'Monticello', 'Site', 'UNESCO', 'World', 'as', 'designated', 'until', "wasn't"]


## One-hot encoding

In [None]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)

print(num_tokens)
print(vocab_size)

# One row per token (in sentence order), one column per vocabulary word.
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

# Precompute word -> column once so each token costs a dict lookup instead of
# a linear vocab.index() scan. This matches vocab.index() as long as vocab
# contains no duplicates (it is built from a set in the tokenization cell).
column_of = {word: column for column, word in enumerate(vocab)}

for i, word in enumerate(token_sequence):
    # Mark the single column that corresponds to this token's word.
    onehot_vectors[i, column_of[word]] = 1

print(onehot_vectors)

10
10
[[0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0 0 0 0]]


## Showing One-hot with pandas

In [4]:
import pandas as pd
# Label each one-hot column with its vocabulary word so every row can be read
# as "which word is this token". print() gives the plain-text table.
print(
    pd.DataFrame(
        data=onehot_vectors,
        columns=vocab,
    )
)


   1987.  Heritage  Monticello  Site  UNESCO  World  as  designated  until  \
0      0         0           1     0       0      0   0           0      0   
1      0         0           0     0       0      0   0           0      0   
2      0         0           0     0       0      0   0           1      0   
3      0         0           0     0       0      0   1           0      0   
4      0         0           0     0       1      0   0           0      0   
5      0         0           0     0       0      1   0           0      0   
6      0         1           0     0       0      0   0           0      0   
7      0         0           0     1       0      0   0           0      0   
8      0         0           0     0       0      0   0           0      1   
9      1         0           0     0       0      0   0           0      0   

   wasn't  
0       0  
1       1  
2       0  
3       0  
4       0  
5       0  
6       0  
7       0  
8       0  
9       0  


In [5]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


# Bag of words (CountVectorizer)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Four short documents. The first and last use the same words (ignoring case
# and punctuation), only in a different order.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and count each term per document in
# a single pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)

# Printing X lists its non-zero entries as (document, term index) and count.
print(X)

# The term behind each column index of X; left as the last expression so the
# notebook displays it.
vectorizer.get_feature_names_out()

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

# Vector similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity of the first document's count vector against every
# document in the corpus (itself included). Slicing with X[:1] rather than
# indexing with X[0] keeps the first row two-dimensional.
cosine_similarity(X[:1], X)

array([[1.        , 0.79056942, 0.54772256, 1.        ]])