One-Hot Encoding

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Toy input: a short list of words that contains repeats
words = ["cat", "dog", "fish", "dog", "cat"]

# Assign every distinct word an integer id with LabelEncoder
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(words)

# Build the one-hot matrix: one row per input word, one column per distinct word.
# Start from all zeros, then set the column given by each row's integer id to 1.
n_rows = len(words)
n_cols = len(set(words))
one_hot_vectors = np.zeros((n_rows, n_cols))
one_hot_vectors[np.arange(n_rows), encoded_labels] = 1

print("One-Hot Encoding:\n", one_hot_vectors)


One-Hot Encoding:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Small corpus: four short documents
documents = [
    "I love programming",
    "Python is awesome",
    "I love Python programming",
    "Python is love",
]

# Learn the vocabulary and count the terms of every document in one step
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# The result is sparse; densify it so the counts can be printed as a table
bow_dense = X.toarray()
print("Bag of Words (BoW) with CountVectorizer:\n", bow_dense)

# Feature names (vocabulary) learned by the vectorizer
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)


Bag of Words (BoW) with CountVectorizer:
 [[0 0 1 1 0]
 [1 1 0 0 1]
 [0 0 1 1 1]
 [0 1 1 0 1]]
Vocabulary: ['awesome' 'is' 'love' 'programming' 'python']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small corpus: three short documents (rebinds `documents` for this cell)
documents = [
    "I love programming",
    "Python is awesome",
    "I love Python programming",
]

# Learn the vocabulary and compute the TF-IDF weights in one step
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(documents)

# The result is sparse; densify it so the weights can be printed as a table
tfidf_dense = X_tfidf.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

# Feature names (vocabulary) learned by the vectorizer
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print("Vocabulary:", tfidf_vocabulary)


TF-IDF Matrix:
 [[0.         0.         0.70710678 0.70710678 0.        ]
 [0.62276601 0.62276601 0.         0.         0.4736296 ]
 [0.         0.         0.57735027 0.57735027 0.57735027]]
Vocabulary: ['awesome' 'is' 'love' 'programming' 'python']


In [None]:
import gensim
from gensim.models import Word2Vec

# Sample data (list of tokenized sentences)
sentences = [["i", "love", "programming"], ["python", "is", "awesome"], ["i", "love", "python"]]

# Train Word2Vec model on the toy corpus (50-dimensional vectors)
model_w2v = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)

# Get the vector for a specific word.
# Named `vector_w2v` to match `vector_glove` / `vector_ft` used by the other
# embedding cells; the word looked up is "python", not "cat".
vector_w2v = model_w2v.wv["python"]

# Backward-compatible alias: this vector used to be stored under the
# misleading name `vector_cat`. Kept so any cell still reading it keeps working.
vector_cat = vector_w2v

print("Word2Vec - Vector for 'python':\n", vector_w2v)


Word2Vec - Vector for 'python':
 [-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]


In [None]:
import numpy as np

# Load pre-trained GloVe model
def load_glove_model(glove_file):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is taken as
    the word and the remaining fields are parsed as its float32 vector.
    Blank or whitespace-only lines are skipped.

    Args:
        glove_file: Path to a UTF-8 text file with one entry per line.

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array. If a word
        occurs on several lines, the last occurrence wins.

    Raises:
        ValueError: If the vector fields of a line cannot be parsed as floats;
            the message names the file and the 1-based line number.
        OSError: If the file cannot be opened.
    """
    model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing empty line at the end of the
                # file): indexing parts[0] would raise IndexError.
                continue
            word = parts[0]
            try:
                vector = np.array(parts[1:], dtype=np.float32)
            except ValueError as exc:
                # Same exception type as before, but say where the bad entry is.
                raise ValueError(
                    f"{glove_file}:{line_no}: cannot parse vector for {word!r}"
                ) from exc
            model[word] = vector
    return model

# Read the 50D GloVe vectors from a local file.
# The file name below is only an example: point it at your own GloVe download.
glove_model = load_glove_model(
    glove_file="glove.6B.50d.txt",
)

# Look up the embedding of one word in the loaded dictionary
vector_glove = glove_model["python"]
print("GloVe - Vector for 'python':\n", vector_glove)


In [None]:
from gensim.models import FastText

# Tokenized toy corpus used to fit FastText (rebinds `sentences` for this cell)
sentences = [
    ["i", "love", "programming"],
    ["python", "is", "awesome"],
    ["i", "love", "python"],
]

# Fit the FastText model on the corpus
model_ft = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=4,
)

# Look up the learned embedding of one word
vector_ft = model_ft.wv["python"]
print("FastText - Vector for 'python':\n", vector_ft)


FastText - Vector for 'python':
 [ 0.00767598  0.00292498 -0.00391118 -0.0019751  -0.00052945  0.00138819
  0.00105093  0.00681139 -0.0019958  -0.00189362 -0.00215858  0.00351798
 -0.00101528 -0.00108583  0.00057213  0.00188714  0.004025    0.00218857
  0.00135858 -0.0025631   0.000346   -0.00030462 -0.00225894 -0.00175163
  0.00550426  0.00148931 -0.00095485  0.00230207 -0.00182532 -0.00287812
 -0.00570289  0.00406225 -0.00442473 -0.00068925 -0.00019111  0.00309115
 -0.00348415  0.00106954 -0.00219935 -0.00321007  0.00251087 -0.00073791
 -0.00128885  0.00094972  0.00122982  0.00244141 -0.00100991  0.00203276
  0.00140259  0.00434942]
