# NLP Vectorization Techniques
This notebook covers various techniques for text vectorization:
- One-Hot Encoding
- Bag-of-Words (BoW)
- TF-IDF
- Word2Vec
- GloVe
- FastText
- Transformers
- BERT

In [4]:
!pip install --user gensim



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
nltk.download('punkt')
from gensim.models import Word2Vec, FastText
from transformers import BertTokenizer, BertModel
import torch

df = pd.read_csv("sephora_skincare_reviews.csv")

corpus = df['feedback'].tolist()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

tokenized = [nltk.word_tokenize(doc.lower()) for doc in corpus]
one_hot = MultiLabelBinarizer()
one_hot_encoded = one_hot.fit_transform(tokenized)
pd.DataFrame(one_hot_encoded, columns=one_hot.classes_)

Unnamed: 0,!,','m,'s,'ve,",",.,30,30a,30i,...,yhdration,yhgieenic,yhgienic,ym,ype,ypt,ytpe,ytppe,yy,—
0,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)
pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,30,30a,30i,30l,30n,30o,30s,30ss,30t,30u,...,ygienic,yhdration,yhgieenic,yhgienic,ym,ype,ypt,ytpe,ytppe,yy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# 3. TF-IDF
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,30,30a,30i,30l,30n,30o,30s,30ss,30t,30u,...,ygienic,yhdration,yhgieenic,yhgienic,ym,ype,ypt,ytpe,ytppe,yy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# 4. Word2Vec
w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.wv['skin']  # Examples

array([-0.5264536 ,  0.9031807 , -0.39790246,  0.9679387 ,  0.64645517,
       -1.8305812 ,  0.5255278 ,  1.838848  , -0.48883745, -0.7292031 ,
        0.4907772 , -1.7098905 , -0.53813046,  1.3565902 ,  0.12267368,
        0.25725356, -0.30844945, -0.6640486 , -0.7078212 , -1.7340521 ,
       -0.01062248,  0.98976934,  0.37457618, -0.20488672, -0.14184035,
        0.14726368, -1.1609403 , -0.15357111, -0.7920056 ,  0.5265334 ,
        1.0253956 ,  0.01821092,  0.5170474 , -1.2805319 , -1.0593853 ,
        1.4714168 ,  0.831948  ,  0.50372183, -0.56936955, -0.8939265 ,
       -0.13121895, -0.33045062, -0.7633688 , -0.23708107,  0.7844601 ,
        0.13342695, -0.8122683 ,  0.27309155,  0.9547018 ,  1.1653427 ,
        0.65606534, -0.8694375 ,  0.28171012,  0.03834676, -0.39927626,
       -0.03154077,  0.1920974 , -0.54683876, -0.39112657, -0.09955885,
       -0.1525253 ,  0.2652748 ,  0.22837262, -0.59511507, -0.6054979 ,
        0.2746234 ,  0.4237    ,  0.33080035, -0.6034783 ,  0.83

In [12]:
# 5. GloVe (Requires pre-trained GloVe file)
# Convert GloVe format to Word2Vec format using:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.word2vec.txt')
from gensim.models import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format('glove.6B.100d.word2vec.txt')
glove_model['skin']

  glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.word2vec.txt')


array([-0.56765  ,  0.15144  ,  0.08739  , -0.58177  , -0.36405  ,
        0.25263  ,  0.19216  ,  0.33921  ,  0.20811  ,  0.0036447,
       -0.36843  , -0.1946   ,  1.2074   ,  0.65129  ,  0.85408  ,
        0.61007  , -0.44654  , -0.79221  ,  0.99169  , -0.57925  ,
       -0.52479  , -0.12186  , -0.68424  , -0.24807  ,  0.79997  ,
        1.8644   ,  0.88059  , -1.1063   ,  0.0036065, -0.54901  ,
        0.4942   ,  0.70751  , -0.27179  , -0.20569  ,  0.17593  ,
        0.46356  , -0.30265  ,  0.15677  , -0.20668  ,  0.50407  ,
       -0.15696  , -1.0547   , -0.55695  , -0.66206  , -0.37376  ,
        0.7299   , -0.030611 ,  0.7426   , -0.21264  , -0.83752  ,
        0.30062  ,  0.19039  , -0.16993  ,  1.0665   , -0.29448  ,
       -1.1214   ,  0.17906  , -0.15797  ,  0.64386  ,  0.21188  ,
        0.75228  ,  1.8364   , -0.0076749,  0.53663  ,  1.1772   ,
        0.20493  ,  0.65759  , -0.62886  ,  0.1556   , -1.161    ,
        0.12446  ,  0.46445  ,  0.32222  ,  0.42523  ,  0.6410

In [13]:
# 6. FastText
fasttext_model = FastText(sentences=tokenized, vector_size=100, window=5, min_count=1)
fasttext_model.wv['processing']

array([-7.6200828e-02,  6.8052255e-02, -5.5955875e-01, -1.4974034e-01,
        2.2204624e-02, -1.9808928e-02,  2.4756396e-01,  1.9465701e-01,
        6.6889561e-04, -8.2939908e-02, -4.2132416e-01, -2.4643306e-01,
       -4.7080466e-01,  3.9787406e-01, -1.8345340e-01, -1.7572649e-01,
       -2.0411718e-01, -5.5316687e-01, -7.6164192e-01, -5.1101816e-01,
       -3.8477266e-01, -1.0967568e-01,  8.2738280e-02,  3.0954796e-04,
       -3.7623718e-01, -1.4020821e-01, -4.7636336e-01, -1.6546959e-01,
        7.9588063e-02, -3.2610258e-01, -5.3043085e-01,  3.6918426e-01,
        4.7233200e-01,  3.7894118e-01,  8.4922247e-02,  8.2085025e-01,
        1.9025105e-01,  6.8574691e-01, -4.4594359e-01, -3.6883608e-01,
       -1.1833553e-01, -7.1815825e-01, -1.4693919e-02, -1.3624063e-01,
        2.5024864e-01, -4.3491620e-01, -5.0346136e-01, -8.2395390e-02,
        3.9058828e-01, -3.1596407e-01,  3.3402443e-04,  3.0888796e-01,
        7.0949732e-03, -1.8603003e-02,  1.0836095e-01,  1.6136594e-01,
      

In [14]:
# 7. Transformers Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(corpus[0], return_tensors='pt')
tokens

{'input_ids': tensor([[  101,  4384, 28241,  2043,  1045,  2109,  2009,  2007,  2060,  3161,
          1025,  2224, 12403,  4892,  2135,  1012,  1045,  1005,  2310,  2042,
          2478,  2009,  9152, 11039,  2135,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}

In [15]:
# 8. BERT Embedding
model = BertModel.from_pretrained('bert-base-uncased')
with torch.no_grad():
    outputs = model(**tokens)
bert_embedding = outputs.last_hidden_state.mean(dim=1)
bert_embedding

tensor([[ 8.1453e-02,  1.5106e-01, -1.3260e-01, -1.6842e-01,  1.6651e-01,
         -1.5893e-01,  2.2553e-01,  4.4810e-01,  6.1181e-02, -3.6249e-01,
          1.7468e-01, -8.9736e-02, -8.5149e-02, -2.3195e-02, -3.5685e-01,
          5.2189e-01,  1.5648e-01, -1.4007e-01, -4.5794e-01,  3.3035e-01,
          3.8853e-01, -1.2630e-01, -1.2903e-01,  5.0997e-01,  4.0505e-02,
          5.4394e-02, -3.7772e-03,  2.3960e-01, -2.0056e-01, -3.8032e-01,
          5.1689e-01,  2.6453e-01, -3.2878e-01, -5.4709e-02,  9.2936e-02,
          1.1973e-01, -7.1817e-02, -8.4835e-02, -2.7961e-01, -2.0971e-02,
         -3.6669e-01, -2.2042e-01, -4.8974e-02,  1.1049e-02,  1.0900e-01,
          3.9793e-02,  2.9932e-01,  3.7605e-01, -9.1686e-02,  4.2107e-03,
         -4.6250e-01, -3.9209e-01,  5.8528e-01, -3.8894e-01, -1.4234e-02,
          7.9962e-01, -1.5294e-01, -4.0911e-01, -1.8246e-01,  2.6913e-01,
         -2.0712e-01, -4.2812e-01,  2.4456e-02,  1.0513e-01,  2.0176e-01,
          3.5867e-01, -1.0204e-01,  3.