In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [14]:
# Toy corpus: four short English sentences. The CountVectorizer,
# TfidfVectorizer and Word2Vec cells in this notebook are all fitted on it.
documents = [
    "Artificial intelligence is transforming technology",
    "Machine learning is a subset of artificial intelligence",
    "Deep learning is used in computer vision",
    "Natural language processing is a branch of AI"
]

# Bare last expression so the notebook echoes the corpus.
documents


['Artificial intelligence is transforming technology',
 'Machine learning is a subset of artificial intelligence',
 'Deep learning is used in computer vision',
 'Natural language processing is a branch of AI']

In [15]:
# Bag of Words: raw term counts per document.

# Learn the vocabulary from the corpus and count term occurrences in one pass.
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(documents)

# Convert to a dense array for display; one column per vocabulary term.
bow_terms = count_vectorizer.get_feature_names_out()
bow_dense = bow_counts.toarray()
bow_df = pd.DataFrame(data=bow_dense, columns=bow_terms)

bow_df


Unnamed: 0,ai,artificial,branch,computer,deep,in,intelligence,is,language,learning,machine,natural,of,processing,subset,technology,transforming,used,vision
0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0
1,0,1,0,0,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0
2,0,0,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,1,1
3,1,0,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0


In [16]:
# Normalized Count Occurrence
# Each row of counts is divided by that row's total, so every non-empty
# document row sums to 1.

bow_array = bow_counts.toarray().astype(float)

# keepdims=True keeps shape (n_docs, 1) so the division broadcasts row-wise.
row_sums = bow_array.sum(axis=1, keepdims=True)

# A document with no in-vocabulary tokens has a zero row sum; plain division
# would produce NaN (0/0) and a RuntimeWarning. Such rows are left as zeros.
bow_normalized = np.divide(
    bow_array,
    row_sums,
    out=np.zeros_like(bow_array),
    where=row_sums != 0,
)

bow_norm_df = pd.DataFrame(
    bow_normalized,
    columns=count_vectorizer.get_feature_names_out()
)

bow_norm_df


Unnamed: 0,ai,artificial,branch,computer,deep,in,intelligence,is,language,learning,machine,natural,of,processing,subset,technology,transforming,used,vision
0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0
1,0.0,0.142857,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.142857,0.142857,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857
3,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0


In [17]:
# TF-IDF Representation (TfidfVectorizer with its default settings).

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Dense copy for display, with columns labelled by vocabulary term.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=tfidf_dense, columns=tfidf_terms)

tfidf_df


Unnamed: 0,ai,artificial,branch,computer,deep,in,intelligence,is,language,learning,machine,natural,of,processing,subset,technology,transforming,used,vision
0,0.0,0.420493,0.0,0.0,0.0,0.0,0.420493,0.27832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.533343,0.533343,0.0,0.0
1,0.0,0.361418,0.0,0.0,0.0,0.0,0.361418,0.239219,0.0,0.361418,0.458412,0.0,0.361418,0.0,0.458412,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.411906,0.411906,0.411906,0.0,0.21495,0.0,0.324751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411906,0.411906
3,0.411906,0.0,0.411906,0.0,0.0,0.0,0.0,0.21495,0.411906,0.0,0.0,0.411906,0.324751,0.411906,0.0,0.0,0.0,0.0,0.0


In [18]:
# Word2Vec Embeddings

# Tokenization: lowercase each document and split on whitespace.
tokenized_docs = [doc.lower().split() for doc in documents]

# Train Word2Vec model.
# seed + workers=1 make the learned vectors repeatable across re-runs; with
# several worker threads the update order (and hence the vectors) varies from
# run to run. NOTE: per the gensim docs, bit-for-bit reproducibility across
# interpreter launches may additionally require a fixed PYTHONHASHSEED.
# min_count=1 keeps every token, which this tiny corpus needs.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Display vocabulary
list(w2v_model.wv.key_to_index)


['is',
 'of',
 'a',
 'learning',
 'intelligence',
 'artificial',
 'ai',
 'branch',
 'processing',
 'language',
 'natural',
 'vision',
 'computer',
 'in',
 'used',
 'deep',
 'subset',
 'machine',
 'technology',
 'transforming']

In [19]:
# Example: Get embedding for a word

word = "learning"

# Check membership first; the vector is printed only when the word is in the
# trained vocabulary.
if word in w2v_model.wv:
    word_vector = w2v_model.wv[word]
    print("Embedding vector for:", word)
    print(word_vector)
    print("Vector Shape:", word_vector.shape)


Embedding vector for: learning
[-8.2442407e-03  9.3042878e-03 -1.9911421e-04 -1.9678029e-03
  4.6018749e-03 -4.0988647e-03  2.7400651e-03  6.9396640e-03
  6.0632159e-03 -7.5126411e-03  9.3827257e-03  4.6737688e-03
  3.9636744e-03 -6.2461342e-03  8.4644528e-03 -2.1489759e-03
  8.8274535e-03 -5.3621302e-03 -8.1301508e-03  6.8212752e-03
  1.6720359e-03 -2.1992282e-03  9.5171109e-03  9.4946539e-03
 -9.7747138e-03  2.5069588e-03  6.1528711e-03  3.8718472e-03
  2.0245016e-03  4.3248982e-04  6.7490002e-04 -3.8206801e-03
 -7.1375156e-03 -2.0894834e-03  3.9264448e-03  8.8188834e-03
  9.2576966e-03 -5.9740138e-03 -9.4050942e-03  9.7636022e-03
  3.4281113e-03  5.1647285e-03  6.2837750e-03 -2.8009855e-03
  7.3217154e-03  2.8276283e-03  2.8711734e-03 -2.3811408e-03
 -3.1275465e-03 -2.3700953e-03  4.2765290e-03  7.5196906e-05
 -9.5861536e-03 -9.6643185e-03 -6.1500864e-03 -1.2753991e-04
  1.9990692e-03  9.4329305e-03  5.5845501e-03 -4.2885328e-03
  2.7955056e-04  4.9664448e-03  7.7003734e-03 -1.14206

In [20]:
# Document Embeddings using Average of Word Vectors

def get_document_embedding(doc: str, model) -> np.ndarray:
    """Embed a document as the mean of its in-vocabulary word vectors.

    The document is lowercased and split on whitespace; tokens absent from
    ``model.wv`` are ignored.

    Parameters
    ----------
    doc : str
        Raw document text.
    model
        Object exposing ``wv`` with ``in`` membership, ``[]`` lookup,
        ``vector_size`` and ``vectors`` (e.g. a trained gensim ``Word2Vec``).

    Returns
    -------
    np.ndarray
        Vector of shape ``(model.wv.vector_size,)``. If no token of ``doc``
        is in the vocabulary, an all-zero vector is returned.
    """
    tokens = doc.lower().split()
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Match the dtype of the model's vectors so that stacking document
    # embeddings does not silently mix float64 zeros with the model's dtype.
    return np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)

# One averaged vector per document, in corpus order.
document_embeddings = [
    get_document_embedding(doc, w2v_model) for doc in documents
]

print("Number of Documents:", len(document_embeddings))
print("Shape of Each Document Vector:", document_embeddings[0].shape)


Number of Documents: 4
Shape of Each Document Vector: (100,)


In [10]:
# Get embedding for a specific word
word = "learning"

# Indexing `w2v_model.wv` with a word that is not in the vocabulary raises
# KeyError, so check membership first and report the miss explicitly.
if word in w2v_model.wv:
    embedding = w2v_model.wv[word]

    print(f"Embedding vector for '{word}':\n")
    print(embedding)

    print("\nVector shape:", embedding.shape)
else:
    print(f"'{word}' is not in the Word2Vec vocabulary.")


Embedding vector for 'learning':

[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.08