<a href="https://colab.research.google.com/github/Lavanya-Srinivas214/LLM/blob/main/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy scikit-learn gensim transformers torch
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec, KeyedVectors
from transformers import BertTokenizer, BertModel
import torch

# Sample Text
sentences = [
    "I love machine learning",
    "Machine learning is fun",
    "I love coding"
]

print("\n--- ORIGINAL SENTENCES ---")
for s in sentences:
    print(s)

# --------------------------------------------------
# 1. ONE-HOT ENCODING
# --------------------------------------------------
print("\n--- ONE-HOT ENCODING ---")
vectorizer = CountVectorizer(binary=True)
one_hot = vectorizer.fit_transform(sentences)
print("Vocabulary:", vectorizer.get_feature_names_out())
print(one_hot.toarray())

# --------------------------------------------------
# 2. BAG OF WORDS (BoW)
# --------------------------------------------------
print("\n--- BAG OF WORDS ---")
bow_vectorizer = CountVectorizer()
bow = bow_vectorizer.fit_transform(sentences)
print("Vocabulary:", bow_vectorizer.get_feature_names_out())
print(bow.toarray())

# --------------------------------------------------
# 3. WORD2VEC
# --------------------------------------------------
print("\n--- WORD2VEC ---")
tokenized_sentences = [s.lower().split() for s in sentences]
w2v_model = Word2Vec(tokenized_sentences, vector_size=50, window=3, min_count=1, workers=4)

word = "machine"
print(f"Vector for '{word}':")
print(w2v_model.wv[word])

# --------------------------------------------------
# 4. GLOVE (Using Pretrained File)
# --------------------------------------------------
print("\n--- GLOVE ---")
# Download: https://nlp.stanford.edu/projects/glove/
# Example file: glove.6B.50d.txt

glove_path = "glove.6B.50d.txt"  # Put file in same folder
try:
    glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)
    print("Vector for 'machine':")
    print(glove_model["machine"])
except:
    print("GloVe file not found. Please download glove.6B.50d.txt")

# --------------------------------------------------
# 5. BERT EMBEDDING
# --------------------------------------------------
print("\n--- BERT ---")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

text = "I love machine learning"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

bert_embedding = outputs.last_hidden_state
print("BERT Embedding Shape:", bert_embedding.shape)
print("First token embedding (CLS):")
print(bert_embedding[0][0][:10])  # First 10 values


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0





--- ORIGINAL SENTENCES ---
I love machine learning
Machine learning is fun
I love coding

--- ONE-HOT ENCODING ---
Vocabulary: ['coding' 'fun' 'is' 'learning' 'love' 'machine']
[[0 0 0 1 1 1]
 [0 1 1 1 0 1]
 [1 0 0 0 1 0]]

--- BAG OF WORDS ---
Vocabulary: ['coding' 'fun' 'is' 'learning' 'love' 'machine']
[[0 0 0 1 1 1]
 [0 1 1 1 0 1]
 [1 0 0 0 1 0]]

--- WORD2VEC ---
Vector for 'machine':
[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.012785

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embedding Shape: torch.Size([1, 6, 768])
First token embedding (CLS):
tensor([ 0.1335,  0.2301, -0.0360, -0.0634, -0.1529, -0.3501,  0.0989,  0.7408,
        -0.0425, -0.4008], grad_fn=<SliceBackward0>)
