**Perform the bag-of-words approach (count occurrence, normalized count occurrence) and TF-IDF on
the data. Create embeddings using Word2Vec.**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [None]:
# Toy corpus: one short document per string (replace this with your dataset)
data = [
    "This is a sample sentence",
    "Another sentence for testing",
    "Machine learning is fun",
    "Bag of words and TF-IDF are useful techniques",
]

In [None]:
# Bag-of-Words (Count Vectorizer)
# Each document becomes a vector of raw word counts, one column per vocabulary term.
count_vectorizer = CountVectorizer()
count_vectorized_data = count_vectorizer.fit_transform(data)

# Build the table once so the raw and normalized views share the same columns.
count_df = pd.DataFrame(
    count_vectorized_data.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

print("Bag-of-Words Representation:")
print(count_df)

# Normalized count occurrence: divide every row by that document's total number of
# counted tokens, so each row holds relative term frequencies that sum to 1.
# A document with no counted tokens would give a zero total; dividing by 1 instead
# keeps that row at all zeros rather than producing NaN.
row_totals = count_df.sum(axis=1).replace(0, 1)
normalized_count_df = count_df.div(row_totals, axis=0)

print("\nNormalized Bag-of-Words Representation:")
print(normalized_count_df)


Bag-of-Words Representation:
   and  another  are  bag  for  fun  idf  is  learning  machine  of  sample  \
0    0        0    0    0    0    0    0   1         0        0   0       1   
1    0        1    0    0    1    0    0   0         0        0   0       0   
2    0        0    0    0    0    1    0   1         1        1   0       0   
3    1        0    1    1    0    0    1   0         0        0   1       0   

   sentence  techniques  testing  tf  this  useful  words  
0         1           0        0   0     1       0      0  
1         1           0        1   0     0       0      0  
2         0           0        0   0     0       0      0  
3         0           1        0   1     0       1      1  


In [None]:
# TF-IDF Vectorizer
# Scores each term by how often it appears in a document, down-weighted for terms
# that appear in many documents of the corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorized_data = tfidf_vectorizer.fit_transform(data)

print("\nTF-IDF Representation:")
# One row per document, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    tfidf_vectorized_data.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_df)


TF-IDF Representation:
        and   another       are       bag       for       fun       idf  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.525473  0.000000  0.000000  0.525473  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.525473  0.000000   
3  0.333333  0.000000  0.333333  0.333333  0.000000  0.000000  0.333333   

         is  learning   machine        of    sample  sentence  techniques  \
0  0.437791  0.000000  0.000000  0.000000  0.555283  0.437791    0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.414289    0.000000   
2  0.414289  0.525473  0.525473  0.000000  0.000000  0.000000    0.000000   
3  0.000000  0.000000  0.000000  0.333333  0.000000  0.000000    0.333333   

    testing        tf      this    useful     words  
0  0.000000  0.000000  0.555283  0.000000  0.000000  
1  0.525473  0.000000  0.000000  0.000000  0.000000  
2  0.000000  0.000000  0.000000  0.000000  0.000000 

In [None]:
# Word2Vec Embeddings
# Preprocess data for Word2Vec
sentences = [sentence.lower().split() for sentence in data]



In [None]:
# Instantiate and train the Word2Vec model
# Word2Vec is imported once in the imports cell at the top of the notebook, so it is
# not re-imported here. Passing `sentences` to the constructor trains the model.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word embedding
    window=5,         # context window size used during training
    min_count=1,      # keep every word; the toy corpus is tiny
    seed=1,           # explicit seed (gensim's documented default) for reproducibility
    workers=1,        # single worker thread: multi-threaded training is not deterministic
)
# NOTE(review): per the gensim docs, fully reproducible runs on Python 3 may also
# require setting the PYTHONHASHSEED environment variable - confirm if exact
# vectors matter.

In [None]:
# Display word vectors
# Print every vocabulary word followed by its learned embedding.
print("\nWord2Vec Embeddings:")
keyed_vectors = word2vec_model.wv
for token in keyed_vectors.index_to_key:
    embedding = keyed_vectors[token]
    print(f"{token}: {embedding}")


Word2Vec Embeddings:
is: [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-0

In [None]:
# Save the model
# Persist the trained model to disk, relative to the current working directory.
# NOTE(review): presumably reloaded later with Word2Vec.load(model_path) - confirm.
model_path = "word2vec_model.bin"
word2vec_model.save(model_path)