In [43]:
def one_hot_encode(text):
    words = text.split()
    vocabularies = set(words)
    word_to_index = {word: i for i, word in enumerate(vocabularies)}
    one_hot_encoded = []
    for word in words:
        one_hot_vector = [0] * len(vocabularies)
        one_hot_vector[word_to_index[word]] = 1
        one_hot_encoded.append(one_hot_vector)
    return one_hot_encoded, vocabularies, word_to_index

# Test
example_text = "cat in the hat dog on the mat bird in the tree."
one_hot_encoded, vocabularies, word_to_index = one_hot_encode(example_text)
print("Vocabularies:", vocabularies)
print("Word to Index Mapping:", word_to_index)
print("One Hot Encoded Matrix:", one_hot_encoded)
for word, encoding in zip(example_text.split(), one_hot_encoded):
    print(f"{word}: {encoding}")



Vocabularies: {'in', 'dog', 'bird', 'mat', 'on', 'hat', 'tree.', 'the', 'cat'}
Word to Index Mapping: {'in': 0, 'dog': 1, 'bird': 2, 'mat': 3, 'on': 4, 'hat': 5, 'tree.': 6, 'the': 7, 'cat': 8}
One Hot Encoded Matrix: [[0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0]]
cat: [0, 0, 0, 0, 0, 0, 0, 0, 1]
in: [1, 0, 0, 0, 0, 0, 0, 0, 0]
the: [0, 0, 0, 0, 0, 0, 0, 1, 0]
hat: [0, 0, 0, 0, 0, 1, 0, 0, 0]
dog: [0, 1, 0, 0, 0, 0, 0, 0, 0]
on: [0, 0, 0, 0, 1, 0, 0, 0, 0]
the: [0, 0, 0, 0, 0, 0, 0, 1, 0]
mat: [0, 0, 0, 1, 0, 0, 0, 0, 0]
bird: [0, 0, 1, 0, 0, 0, 0, 0, 0]
in: [1, 0, 0, 0, 0, 0, 0, 0, 0]
the: [0, 0, 0, 0, 0, 0, 0, 1, 0]
tree.: [0, 0, 0, 0, 0, 0, 1, 0, 0]


Embedding Bow

In [44]:
from sklearn.feature_extraction.text import CountVectorizer 
""" 
CountVectorizer is a class in the scikit-learn library that is used to convert a collection of text documents to a matrix of token counts. It is important to note that CountVectorizer converts text documents to a matrix of token counts, not one-hot encoded vectors.
Different from one-hot encoding, CountVectorizer does not create a binary vector. Instead, it counts the frequency of each word in the text documents.

"""
documents = ["This is the first document.","This document is the second document.","And this is the third one.","Is this the first document?"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

print("Bag-of-words matrix:")
print(X.toarray())
print("Vocabulary(Feature name):" , feature_names)


Bag-of-words matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary(Feature name): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


Embedding-TFIDF

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
"""
TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. TF-IDF is a product of two statistics: term frequency and inverse document frequency.
"""
# Sample text
documents = ["The quick brown fox jumps over the lazy dog."," A journey of a thousand miles begins with a single step.",]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
tfidf_values = {}

for doc_index, doc in enumerate(documents):
    feature_index = tfidf_matrix[doc_index,:].nonzero()[1]
    tfidf_doc_values = zip(feature_index, [tfidf_matrix[doc_index, x] for x in feature_index])
    tfidf_values[doc_index] = {feature_names[i]: value for i, value in tfidf_doc_values}
    
for doc_index,doc in tfidf_values.items():
    print(f"Document {doc_index+1}:")
    for word, tfidf_value in tfidf_values.items():
        print(f"{word}: {tfidf_value}")
        print("\n")

Document 1:
0: {'dog': 0.30151134457776363, 'lazy': 0.30151134457776363, 'over': 0.30151134457776363, 'jumps': 0.30151134457776363, 'fox': 0.30151134457776363, 'brown': 0.30151134457776363, 'quick': 0.30151134457776363, 'the': 0.6030226891555273}


1: {'step': 0.3535533905932738, 'single': 0.3535533905932738, 'with': 0.3535533905932738, 'begins': 0.3535533905932738, 'miles': 0.3535533905932738, 'thousand': 0.3535533905932738, 'of': 0.3535533905932738, 'journey': 0.3535533905932738}


Document 2:
0: {'dog': 0.30151134457776363, 'lazy': 0.30151134457776363, 'over': 0.30151134457776363, 'jumps': 0.30151134457776363, 'fox': 0.30151134457776363, 'brown': 0.30151134457776363, 'quick': 0.30151134457776363, 'the': 0.6030226891555273}


1: {'step': 0.3535533905932738, 'single': 0.3535533905932738, 'with': 0.3535533905932738, 'begins': 0.3535533905932738, 'miles': 0.3535533905932738, 'thousand': 0.3535533905932738, 'of': 0.3535533905932738, 'journey': 0.3535533905932738}




Embedding-CBOW

In [46]:
#Continuous Bag of Words (CBOW) 

import torch
import torch.nn as nn
import torch.optim as optim

# Define the CBOW model
""" 
Expalnation:
embedding: The embedding layer is used to convert the input word indices into word embeddings. The embedding layer is initialized with random weights and is updated during the training process.
linear: The linear layer is used to convert the sum of the word embeddings into a vector of size vocab_size. The linear layer is used to predict the target word.

context_embeds: The context_embeds variable is used to sum the word embeddings of the context words. The context words are the input words that are used to predict the target word.
output: The output variable is used to predict the target word using the linear layer. The output variable is the predicted word vector.

"""
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.linear = nn.Linear(embed_size, vocab_size)
        
    def forward(self, context):
        context_embeds = self.embeddings(context).sum(dim=1)
        output = self.linear(context_embeds)
        return output
    
# Sample data
context_size = 2
raw_text = "word embedding are awesome"
tokens = raw_text.split()
vocab = set(tokens)
word_to_index = {word: i for i, word in enumerate(vocab)} #字典映射
data = []
for i in range(2,len(tokens)-2):
    context = [word_to_index[word] for word in tokens[i-2:i]] + [word_to_index[word] for word in tokens[i+1:i+3]] #上下文, 2个前面的词和2个后面的词
    target = word_to_index[tokens[i]] #目标词
    data.append((context, target)) #数据集

vocab_size = len(vocab)
embed_size = 10
learning_rate = 0.01
epochs = 100

# Initialize the CBOW model
cbow_model = CBOWModel(vocab_size, embed_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)

# Train the CBOW model
for epoch in range(epochs):
    total_loss = 0
    for context ,target in data:
        optimizer.zero_grad()
        output = cbow_model(context)
        loss = criterion(output.unsqeeze(0), target.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")
        
        

Epoch 1, Loss: 0
Epoch 2, Loss: 0
Epoch 3, Loss: 0
Epoch 4, Loss: 0
Epoch 5, Loss: 0
Epoch 6, Loss: 0
Epoch 7, Loss: 0
Epoch 8, Loss: 0
Epoch 9, Loss: 0
Epoch 10, Loss: 0
Epoch 11, Loss: 0
Epoch 12, Loss: 0
Epoch 13, Loss: 0
Epoch 14, Loss: 0
Epoch 15, Loss: 0
Epoch 16, Loss: 0
Epoch 17, Loss: 0
Epoch 18, Loss: 0
Epoch 19, Loss: 0
Epoch 20, Loss: 0
Epoch 21, Loss: 0
Epoch 22, Loss: 0
Epoch 23, Loss: 0
Epoch 24, Loss: 0
Epoch 25, Loss: 0
Epoch 26, Loss: 0
Epoch 27, Loss: 0
Epoch 28, Loss: 0
Epoch 29, Loss: 0
Epoch 30, Loss: 0
Epoch 31, Loss: 0
Epoch 32, Loss: 0
Epoch 33, Loss: 0
Epoch 34, Loss: 0
Epoch 35, Loss: 0
Epoch 36, Loss: 0
Epoch 37, Loss: 0
Epoch 38, Loss: 0
Epoch 39, Loss: 0
Epoch 40, Loss: 0
Epoch 41, Loss: 0
Epoch 42, Loss: 0
Epoch 43, Loss: 0
Epoch 44, Loss: 0
Epoch 45, Loss: 0
Epoch 46, Loss: 0
Epoch 47, Loss: 0
Epoch 48, Loss: 0
Epoch 49, Loss: 0
Epoch 50, Loss: 0
Epoch 51, Loss: 0
Epoch 52, Loss: 0
Epoch 53, Loss: 0
Epoch 54, Loss: 0
Epoch 55, Loss: 0
Epoch 56, Loss: 0
E

Skip-Gram 

In [47]:
# ! pip install gensim
# ! pip install nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt') # Download the Punkt tokenizer

# Sample text
sample = "The quick brown fox jumps over the lazy dog. A journey of a thousand miles begins with a single step."
tokenized_corpus = word_tokenize(sample.lower()) #lowercasing for consistency

skipgram_model = Word2Vec(sentences=[tokenized_corpus], 
                          vector_size=100, # Dimensionality of the word vectors
                          window=5, # The maximum distance between the current and predicted word within a sentence
                          sg=1, # Training algorithm: skip-gram
                          min_count=1, # Ignores all words with total frequency lower than this
                          workers=2)

# Training 
skipgram_model.train([tokenized_corpus], total_examples=1, epochs=10)
skipgram_model.save("skipgram.model.model")
loaded_model = Word2Vec.load("skipgram.model.model")
vector_representation = loaded_model.wv['fox']  # Get the vector representation of the word "fox"
print("Vector representation of the word 'fox':", vector_representation)

Vector representation of the word 'fox': [ 7.6557780e-03  9.1921482e-03  1.1411518e-03 -8.3091129e-03
  8.4182704e-03 -3.7443098e-03  5.7431986e-03  4.4432674e-03
  9.6431728e-03 -9.3206121e-03  9.2158644e-03 -9.3190912e-03
 -6.9693117e-03 -9.1118161e-03 -5.5355071e-03  7.3632225e-03
  9.1817779e-03 -3.3075348e-03  3.6922069e-03 -3.6953795e-03
  7.8955982e-03  5.8766557e-03  6.1806844e-05 -3.6396664e-03
 -7.2316998e-03  4.7761607e-03  1.4220346e-03 -2.5958526e-03
  7.8297537e-03 -4.0438161e-03 -9.1427332e-03 -2.3037302e-03
  1.5958906e-04 -6.6816122e-03 -5.4892530e-03 -8.4930165e-03
  9.2733549e-03  7.4490649e-03 -3.0979502e-04  7.3696864e-03
  7.9537332e-03 -8.1320369e-04  6.5710987e-03  3.8147303e-03
  5.0863335e-03  7.2565181e-03 -4.7544595e-03 -2.1836259e-03
  8.9267868e-04  4.2488538e-03  3.3067961e-03  5.0703236e-03
  4.6047554e-03 -8.4244283e-03 -3.2130813e-03 -7.2407331e-03
  9.7420076e-03  4.9901991e-03  1.6234057e-04  4.1508423e-03
 -7.6533323e-03 -6.3014315e-03  3.1065089e-0

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\28154\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Glove

In [48]:
from gensim.models import KeyedVectors
from gensim.downloader import load

glove_model = load("glove-wiki-gigaword-50") # Load the GloVe model
word_pairs = [('learn', 'learning'),('india','indian'),('fame','famous')]

for pair in word_pairs:
    similarity = glove_model.similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using Glove: {similarity:.3f}")


Similarity between 'learn' and 'learning' using Glove: 0.802
Similarity between 'india' and 'indian' using Glove: 0.865
Similarity between 'fame' and 'famous' using Glove: 0.589


Fasttext:常见

In [50]:
import gensim.downloader as api

fasttext_model = api.load("fasttext-wiki-news-subwords-300") # Load the FastText model
word_pairs = [('learn', 'learning'),('india','indian'),('fame','famous')]

for pair in word_pairs:
    similarity = fasttext_model.similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using FastText: {similarity:.3f}")

Similarity between 'learn' and 'learning' using FastText: 0.642
Similarity between 'india' and 'indian' using FastText: 0.708
Similarity between 'fame' and 'famous' using FastText: 0.519


Bert

In [52]:
! pip install transformers
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

word_pairs = [('learn', 'learning'),('india','indian'),('fame','famous')]
for pair in word_pairs:
    tokens = tokenizer(pair,retrun_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
       
        # Extract embeddings for the [CLS] token    
        cls_embedding = output.last_hidden_state[:,0,:]
        
        similarity = torch.cosine_similarity(cls_embedding[0], cls_embedding[1], dim=0)
        print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity:.3f}")

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 1.3 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp37-cp37m-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 1.4 MB/s eta 0:00:00
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.1-cp37-cp37m-win_amd64.whl (153 kB)
     -------------------------------------- 153.2/153.2 kB 1.8 MB/s eta 0:00:00
Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.3-cp37-none-win_amd64.whl (287 kB)
     -------------------------------------- 287.5/287.5 kB 8.7 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.12.2-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     -------------------------------------- 268.8/268.8 kB 1.5 MB/s eta 0:00:00
Collecting fsspec
  Downloading fsspec-2023.1.0-py3-

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

TypeError: register_buffer() got an unexpected keyword argument 'persistent'