## One-Hot Encoding

In [1]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
# Toy corpus shared by the encoding examples in this notebook:
# three short sentences with overlapping words.
documents = [
    "I love love coding",
    "Coding in Python is fun",
    "I love Python",
]


In [3]:
# Whitespace-tokenize the first document and turn the tokens into a single
# column (one token per row), the 2-D layout OneHotEncoder expects.
words = np.array(documents[0].split())[:, np.newaxis]

In [4]:
# Display the token column built from documents[0]: one row per token,
# a single column (the array was reshaped with reshape(-1, 1)).
words

array([['I'],
       ['love'],
       ['love'],
       ['coding']], dtype='<U6')

In [5]:
# sparse_output=False asks the encoder for a dense NumPy array rather than a
# SciPy sparse matrix, which is easier to print for a small example.
# Learn the distinct tokens first, then encode the same tokens.
encoder = OneHotEncoder(sparse_output=False).fit(words)
ohe_matrix = encoder.transform(words)

In [6]:
# Flatten the single-column token array back to 1-D purely for display.
flat_words = words.ravel()
print("Words:", flat_words)
# Row i is the one-hot vector for the i-th token; the columns correspond to
# the categories the encoder learned (see encoder.categories_).
print("One-Hot Encoded Matrix:\n", ohe_matrix)

Words: ['I' 'love' 'love' 'coding']
One-Hot Encoded Matrix:
 [[1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]]


## Bag of Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Build the vocabulary from the corpus, then count term occurrences per
# document. With default settings CountVectorizer lowercases the text and
# only keeps tokens of two or more alphanumeric characters, so a
# single-letter word such as "I" does not get a column.
vectorizer = CountVectorizer()
vectorizer.fit(documents)
bow_matrix = vectorizer.transform(documents)

In [9]:
# vocabulary_ maps each term to its column index in the count matrix.
print("Vocabulary:", vectorizer.vocabulary_)
# The vectorizer returns a sparse matrix; densify it so the counts are
# readable (one row per document, one column per vocabulary term).
bow_dense = bow_matrix.toarray()
print("BOW Matrix:\n", bow_dense)

Vocabulary: {'love': 4, 'coding': 0, 'in': 2, 'python': 5, 'is': 3, 'fun': 1}
BOW Matrix:
 [[1 0 0 0 2 0]
 [1 1 1 1 0 1]
 [0 0 0 0 1 1]]


## TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus, then project the same
# documents into TF-IDF space (default TfidfVectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(documents)
tfidf_matrix = tfidf_vectorizer.transform(documents)

# Show the term -> column index mapping, followed by the dense TF-IDF
# weights (one row per document, one column per vocabulary term).
tfidf_dense = tfidf_matrix.toarray()
print("Vocabulary:", tfidf_vectorizer.vocabulary_)
print("TF-IDF Matrix:\n", tfidf_dense)

Vocabulary: {'love': 4, 'coding': 0, 'in': 2, 'python': 5, 'is': 3, 'fun': 1}
TF-IDF Matrix:
 [[0.4472136  0.         0.         0.         0.89442719 0.        ]
 [0.37302199 0.49047908 0.49047908 0.49047908 0.         0.37302199]
 [0.         0.         0.         0.         0.70710678 0.70710678]]


## Word2Vec

In [11]:
from gensim.models import Word2Vec

# Toy corpus: each sentence is already tokenized into a list of words.
# Tokens are case-sensitive here ("Python" and "python" would be different
# vocabulary entries).
sentences = [["I", "love", "coding"], ["Python", "is", "great"]]

# Train a small Word2Vec model.
#   vector_size=100 -> each word gets a 100-dimensional embedding
#   window=5        -> context window of up to 5 words on each side
#   min_count=1     -> keep every word, even those that occur only once
#   seed=1          -> gensim's default seed, stated explicitly
#   workers=1       -> single training thread; with several worker threads
#                      the OS scheduling order makes results vary run to run
# NOTE(review): per the gensim docs, exact reproducibility across interpreter
# launches may additionally require setting PYTHONHASHSEED.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Look up the learned embedding for a single word and print it.
vector = model.wv["coding"]
print(vector)

[-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-03
  4.3234206e-03 -5.81437