In [None]:
!pip install gensim



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample corpus: three short documents.
documents = ["I love programming", "Programming is fun", "I love coding"]

# Bag-of-Words vectorizer with default settings. Note from the printed
# vocabulary below that the single-character token "I" is not kept as a feature.
vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(documents)

# Dense copy of the count matrix so it prints readably (fine for a tiny corpus).
word_counts = X.toarray()

# Normalize each row by the number of counted words in that document.
# A document that contributes no vocabulary terms (e.g. just "I") has a row
# total of 0; a plain division would emit a RuntimeWarning and fill the row
# with NaN, so such rows are left as all zeros instead.
row_totals = word_counts.sum(axis=1, keepdims=True)
normalized_counts = np.divide(
    word_counts,
    row_totals,
    out=np.zeros(word_counts.shape, dtype=float),
    where=row_totals > 0,
)

# Vocabulary terms, aligned with the columns of the matrices above.
feature_names = vectorizer.get_feature_names_out()

# Display results
print("Words:", feature_names)
print("Word Counts:", word_counts)
print("Normalized Counts:", normalized_counts)


Words: ['coding' 'fun' 'is' 'love' 'programming']
Word Counts: [[0 0 0 1 1]
 [0 1 1 0 1]
 [1 0 0 1 0]]
Normalized Counts: [[0.         0.         0.         0.5        0.5       ]
 [0.         0.33333333 0.33333333 0.         0.33333333]
 [0.5        0.         0.         0.5        0.        ]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus for the TF-IDF demonstration.
documents = ["I love programming", "Programming is fun", "I love coding"]

# Default-configured vectorizer; a single fit_transform call both learns the
# vocabulary from the corpus and produces the weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms (words) known to the fitted vectorizer.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense copy of the scores so the whole matrix prints in full.
tfidf_array = tfidf_matrix.toarray()

# Report the vocabulary first, then the per-document scores.
for label, values in (("Words:", feature_names), ("TF-IDF Scores:", tfidf_array)):
    print(label, values)


Words: ['coding' 'fun' 'is' 'love' 'programming']
TF-IDF Scores: [[0.         0.         0.         0.70710678 0.70710678]
 [0.         0.62276601 0.62276601 0.         0.4736296 ]
 [0.79596054 0.         0.         0.60534851 0.        ]]


In [None]:
import gensim
from gensim.models import Word2Vec

# Sample corpus: three short documents.
documents = ["I love programming", "Programming is fun", "I love coding"]

# Tokenize on whitespace after lowercasing. Without the lowercasing step,
# "programming" and "Programming" would become two separate vocabulary
# entries, each with its own unrelated vector. Real text usually needs more
# sophisticated preprocessing (punctuation handling, etc.).
tokenized_documents = [doc.lower().split() for doc in documents]

# Train the Word2Vec model.
#   vector_size=100 -> each word is represented by a 100-dimensional vector
#   window=5        -> context window size passed to the model
#   min_count=1     -> keep every token, even ones that occur only once
#   seed / workers  -> an explicit seed plus a single worker thread removes the
#                      run-to-run variation caused by multi-threaded training.
#                      NOTE(review): the gensim docs indicate that exact
#                      reproducibility across interpreter launches may also
#                      require fixing PYTHONHASHSEED - confirm if needed.
model = Word2Vec(
    tokenized_documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Map every word in the trained vocabulary to its embedding vector.
embeddings = {word: model.wv[word] for word in model.wv.index_to_key}

# Display the embedding for one specific word.
print("Embedding for 'programming':", embeddings['programming'])


Embedding for 'programming': [ 8.13227147e-03 -4.45733406e-03 -1.06835726e-03  1.00636482e-03
 -1.91113955e-04  1.14817743e-03  6.11386076e-03 -2.02715401e-05
 -3.24596534e-03 -1.51072862e-03  5.89729892e-03  1.51410222e-03
 -7.24261976e-04  9.33324732e-03 -4.92128357e-03 -8.38409644e-04
  9.17541143e-03  6.74942741e-03  1.50285603e-03 -8.88256077e-03
  1.14874600e-03 -2.28825561e-03  9.36823711e-03  1.20992784e-03
  1.49006362e-03  2.40640994e-03 -1.83600665e-03 -4.99963388e-03
  2.32429506e-04 -2.01418041e-03  6.60093315e-03  8.94012302e-03
 -6.74754381e-04  2.97701475e-03 -6.10765442e-03  1.69932481e-03
 -6.92623248e-03 -8.69402662e-03 -5.90020278e-03 -8.95647518e-03
  7.27759488e-03 -5.77203138e-03  8.27635173e-03 -7.24354526e-03
  3.42167495e-03  9.67499893e-03 -7.78544787e-03 -9.94505733e-03
 -4.32914635e-03 -2.68313056e-03 -2.71289347e-04 -8.83155130e-03
 -8.61755759e-03  2.80021061e-03 -8.20640661e-03 -9.06933658e-03
 -2.34046578e-03 -8.63180775e-03 -7.05664977e-03 -8.40115082e