Q2) Apply the bag-of-words approach (count occurrence and normalized count occurrence) and
TF-IDF to the data. Create embeddings using Word2Vec.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
import gensim
from gensim.models import Word2Vec
import nltk

# Fetch the NLTK tokenizer data package used by the nltk.word_tokenize
# call in the Word2Vec section of this cell.
TOKENIZER_RESOURCE = 'punkt_tab'
nltk.download(TOKENIZER_RESOURCE)

# Toy corpus: three short sentences that share a few words
# (e.g. "treasure", "hidden") so the vectorizers have overlapping terms.
documents = [
    "Alice and Bob discovered a hidden treasure in the old castle.",
    "The treasure was buried under the ancient oak tree.",
    "Legends spoke of a hidden treasure guarded by a mystical creature.",
]

# Preprocessing: lowercase every document; the originals stay untouched
# in `documents`.
processed_docs = list(map(str.lower, documents))

# 1. Bag-of-Words (BoW)
# Learn the vocabulary from the lowercased documents and count how often
# each term occurs in each document (one row per document, one column per term).
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(processed_docs)
bow_terms = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(bow_counts.toarray(), columns=bow_terms)
print("=== Bag-of-Words Count Occurrence ===")
print(bow_df, "\n")

# Normalized count occurrence: L1-normalize each row so a document's counts
# are divided by that document's total term count (each non-empty row sums
# to 1). This makes documents of different lengths comparable.
bow_normalized = normalize(bow_counts, norm="l1", axis=1)
bow_normalized_df = pd.DataFrame(bow_normalized.toarray(), columns=bow_terms)
print("=== Bag-of-Words Normalized Count Occurrence ===")
print(bow_normalized_df, "\n")

# 2. TF-IDF
# Fit a TF-IDF vectorizer on the lowercased documents, then show the
# resulting weights as a table with one column per vocabulary term.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)

tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=tfidf_dense, columns=tfidf_terms)

print("=== TF-IDF Matrix ===")
print(tfidf_df, "\n")

# 3. Word2Vec Embeddings

# Tokenize documents into words using NLTK
tokenized_docs = [nltk.word_tokenize(doc) for doc in processed_docs]
print("=== Tokenized Documents ===")
for i, tokens in enumerate(tokenized_docs, 1):
    print(f"Document {i}: {tokens}")
print()

# Train a Word2Vec model.
# Training is stochastic, so fix the seed and use a single worker thread:
# with several workers the thread scheduling order can change the result
# from run to run, which would make the printed embedding irreproducible.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches -- confirm whether that matters for this setup.
W2V_SEED = 42
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,   # dimensionality of each word vector
    window=5,          # context window size
    min_count=1,       # keep every token; the corpus is tiny
    workers=1,
    seed=W2V_SEED,
)

# Display the vocabulary from the Word2Vec model
vocab = list(w2v_model.wv.index_to_key)
print("=== Vocabulary in Word2Vec Model ===")
print(vocab, "\n")

# Example: Get the embedding for the word "treasure"
word = "treasure"
if word in w2v_model.wv:
    print(f"=== Word2Vec Embedding for '{word}' ===")
    print(w2v_model.wv[word])
else:
    print(f"Word '{word}' not found in the vocabulary.")


In [None]:
!pip install numpy==1.23.5 gensim==4.3.1

In [None]:
!pip install -U scipy==1.10.1