In [3]:
!pip install gensim sentence-transformers


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-man

In [2]:
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, FastText

In [3]:
# Toy corpus: two short documents fed to the embedding cells.
# Each entry is built from adjacent string literals, one sentence per literal.
documents = [
    (
        "Natural language processing helps computers understand human language. "
        "It includes tasks like text classification, translation, and sentiment analysis."
    ),
    (
        "Word embeddings capture the meaning of words in a numerical format. "
        "They are used in applications like search engines, chatbots, and recommendation systems."
    ),
]


In [4]:
# 1. TF-IDF Document Embedding
# Fit the vectorizer on the corpus and transform it in one pass, then convert
# the resulting matrix to a dense NumPy array (one row per document).
vectorizer = TfidfVectorizer()
tfidf_embeddings = (
    vectorizer
    .fit_transform(documents)
    .toarray()
)
print("TF-IDF Embedding Shape:", tfidf_embeddings.shape)

TF-IDF Embedding Shape: (2, 36)


In [5]:
# 2. Word2Vec Document Embedding (Average of Word Embeddings)
# Tokenize once so training and averaging operate on exactly the same tokens.
# NOTE: whitespace splitting keeps case and attached punctuation, so
# "language." and "language" end up as distinct vocabulary entries.
tokenized_docs = [doc.split() for doc in documents]

# A fixed seed plus a single worker thread makes training repeatable; gensim's
# multi-threaded training is otherwise non-deterministic. (Bit-identical results
# across interpreter launches additionally require PYTHONHASHSEED to be set.)
word2vec_model = Word2Vec(
    tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    seed=1,      # gensim's default seed, spelled out explicitly
    workers=1,
)


def _mean_word_vector(tokens, keyed_vectors):
    """Return the mean vector of the in-vocabulary tokens.

    Args:
        tokens: iterable of string tokens for one document.
        keyed_vectors: the trained model's ``wv`` lookup (supports ``in`` and ``[]``).

    Returns:
        A 1-D array of length ``keyed_vectors.vector_size``. When no token is in
        the vocabulary (e.g. an empty document) a zero vector is returned, so
        every document yields a vector of the same length instead of NaN.
    """
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


word2vec_embeddings = np.array(
    [_mean_word_vector(tokens, word2vec_model.wv) for tokens in tokenized_docs]
)
print("Word2Vec Embedding Shape:", word2vec_embeddings.shape)

Word2Vec Embedding Shape: (2, 100)


In [6]:
# 3. BERT Document Embedding
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    Args:
        text: a single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array: shape ``(hidden_size,)`` for a single text, or
        ``(batch, hidden_size)`` for a batch of several texts.

    The mean is weighted by the attention mask so that padding positions
    (present whenever a batch contains texts of different lengths) do not
    dilute the embedding. For a single string the mask is all ones, so the
    result equals a plain mean over the token axis.
    """
    tokens = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**tokens)

    hidden = outputs.last_hidden_state                       # (batch, seq_len, hidden)
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # (batch, seq_len, 1)
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)                    # guard against division by zero
    # squeeze(0) only drops the batch axis when it has size 1.
    return (summed / counts).squeeze(0).numpy()

bert_embeddings = np.array([get_bert_embedding(doc) for doc in documents])
print("BERT Embedding Shape:", bert_embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embedding Shape: (2, 768)
