# Text Encoding/Embedding Techniques for NLP with Generative AI

This notebook covers traditional encoding techniques (One-Hot Encoding, Bag of Words, TF-IDF, N-grams) and modern Gen AI encoding/embedding (BERT) for NLP. You'll practice encoding text and compare the results for a small dataset.

# Import Libraries

In [None]:
# Install the required packages before importing them (uncomment the next line on first run)
# !pip install pandas matplotlib numpy seaborn torch nltk transformers scikit-learn

Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting numpy
  Downloading numpy-2.3.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting torch
  Downloading torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplo

In [7]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np 

import re

import torch
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA

from nltk import word_tokenize

from transformers import BertTokenizer, BertModel

# Fetch the NLTK "punkt" tokenizer data (a no-op message is printed if it is already present).
# NOTE(review): presumably needed for the word_tokenize import above — no call to it
# is visible in this part of the notebook; confirm it is still required.
nltk.download("punkt")

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are hidden too.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naeem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Sample Dataset

In [86]:
# Toy corpus: five short sentences containing deliberate noise (stray symbols such
# as @, $ and ****, a repeated word, mixed case, irregular spacing) for the
# cleaning and encoding steps to work on.
text = [
    'hi how are you?',
    'i am fine @ what about you?',
    "i like the $ Cricket match between Pakistan and India like anything.",
    " Messi and Ronaldo always play well.",
    "I am very **** excited about about  ! the AI technology ."
]

In [87]:
# One row per sentence in a single "text" column; later cells add derived columns to this frame.
data = pd.DataFrame(text,columns=["text"])

In [88]:
# Show the raw Python list of sentences.
print(" Original data", text)

 Original data ['hi how are you?', 'i am fine @ what about you?', 'i like the $ Cricket match between Pakistan and India like anything.', ' Messi and Ronaldo always play well.', 'I am very **** excited about about  ! the AI technology .']


In [89]:
# print() renders the DataFrame as plain text; long cells are cut off with "...".
print("After COnverting to Dataframe", data)

After COnverting to Dataframe                                                 text
0                                    hi how are you?
1                        i am fine @ what about you?
2  i like the $ Cricket match between Pakistan an...
3                Messi and Ronaldo always play well.
4  I am very **** excited about about  ! the AI t...


In [90]:
# Rich table preview of the first rows (head() shows up to five, i.e. the whole frame here).
data.head()

Unnamed: 0,text
0,hi how are you?
1,i am fine @ what about you?
2,i like the $ Cricket match between Pakistan an...
3,Messi and Ronaldo always play well.
4,I am very **** excited about about ! the AI t...


# Text Preprocessing

In [91]:
# Normalise one raw sentence into lower-case, punctuation-free text
def clean_text(text):
    """Return ``text`` with URLs, mentions/hashtags, punctuation and extra spaces removed.

    The steps run in a fixed order: URLs and ``@mention`` / ``#hashtag`` tokens are
    removed first (while their punctuation is still present to anchor the
    patterns), then all remaining punctuation, then whitespace is collapsed and
    the result is lower-cased and stripped.

    Parameters
    ----------
    text : str
        The raw sentence. Any non-string value (e.g. ``None`` or a NaN from a
        missing DataFrame cell) is treated as empty.

    Returns
    -------
    str
        The cleaned sentence; ``''`` when nothing is left or the input was not a string.
    """
    # Missing cells arrive as None / float NaN when applied over a DataFrame
    # column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'[@#]\w+', '', text)
    # Remove special characters and punctuation. "_" is listed explicitly
    # because \w matches it, so [^\w\s] alone would leave underscores behind.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    return text.lower().strip()


In [92]:
# Clean every sentence; the raw text stays in "text" and the result goes into a new column.
data["Cleaned_Text"] = data['text'].apply(clean_text)

In [93]:
# Compare the raw and cleaned sentences side by side.
data.head()

Unnamed: 0,text,Cleaned_Text
0,hi how are you?,hi how are you
1,i am fine @ what about you?,i am fine what about you
2,i like the $ Cricket match between Pakistan an...,i like the cricket match between pakistan and ...
3,Messi and Ronaldo always play well.,messi and ronaldo always play well
4,I am very **** excited about about ! the AI t...,i am very excited about about the ai technology


# One Hot Encoding

In [94]:
# Tokenizing the Text
data["tokens"] = data["Cleaned_Text"].apply(lambda x: x.split())
data.head()

Unnamed: 0,text,Cleaned_Text,tokens
0,hi how are you?,hi how are you,"[hi, how, are, you]"
1,i am fine @ what about you?,i am fine what about you,"[i, am, fine, what, about, you]"
2,i like the $ Cricket match between Pakistan an...,i like the cricket match between pakistan and ...,"[i, like, the, cricket, match, between, pakist..."
3,Messi and Ronaldo always play well.,messi and ronaldo always play well,"[messi, and, ronaldo, always, play, well]"
4,I am very **** excited about about ! the AI t...,i am very excited about about the ai technology,"[i, am, very, excited, about, about, the, ai, ..."


In [95]:
# Build vocabulary
all_tokens = [token for tokens in data['tokens'] for token in tokens]
print(" All Tokens : ", all_tokens)
print("Length of all tokens is : ",len(all_tokens))
vocab = sorted(set(all_tokens))
print("Vocabulary : ", vocab)
print(" Length of the Vocan is : ", len(vocab))

 All Tokens :  ['hi', 'how', 'are', 'you', 'i', 'am', 'fine', 'what', 'about', 'you', 'i', 'like', 'the', 'cricket', 'match', 'between', 'pakistan', 'and', 'india', 'like', 'anything', 'messi', 'and', 'ronaldo', 'always', 'play', 'well', 'i', 'am', 'very', 'excited', 'about', 'about', 'the', 'ai', 'technology']
Length of all tokens is :  36
Vocabulary :  ['about', 'ai', 'always', 'am', 'and', 'anything', 'are', 'between', 'cricket', 'excited', 'fine', 'hi', 'how', 'i', 'india', 'like', 'match', 'messi', 'pakistan', 'play', 'ronaldo', 'technology', 'the', 'very', 'well', 'what', 'you']
 Length of the Vocan is :  27


In [96]:
# one hot encoding
lb = LabelBinarizer()
lb.fit(vocab)

0,1,2
,neg_label,0
,pos_label,1
,sparse_output,False


In [97]:
def one_hot_encode(tokens):
    """One-hot encode a sequence of tokens with the fitted module-level ``lb``.

    Parameters
    ----------
    tokens : iterable of str
        The tokens of one sentence.

    Returns
    -------
    list of list of int
        One vector per token, in the same order as ``tokens``; ``[]`` for an
        empty sequence.
    """
    tokens = list(tokens)
    # Nothing to encode; also avoids handing an empty array to the binarizer.
    if len(tokens) == 0:
        return []
    # A single transform() call encodes every token at once (one row per token)
    # instead of paying the estimator's input-validation cost once per token.
    return lb.transform(tokens).tolist()


# Encode every sentence: each "OHE" cell holds a list of one-hot vectors, one per token.
data['OHE'] = data['tokens'].apply(one_hot_encode)
data.head()

Unnamed: 0,text,Cleaned_Text,tokens,OHE
0,hi how are you?,hi how are you,"[hi, how, are, you]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,..."
1,i am fine @ what about you?,i am fine what about you,"[i, am, fine, what, about, you]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,..."
2,i like the $ Cricket match between Pakistan an...,i like the cricket match between pakistan and ...,"[i, like, the, cricket, match, between, pakist...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,..."
3,Messi and Ronaldo always play well.,messi and ronaldo always play well,"[messi, and, ronaldo, always, play, well]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,I am very **** excited about about ! the AI t...,i am very excited about about the ai technology,"[i, am, very, excited, about, about, the, ai, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,..."


In [98]:
# Inspect the first sentence and the one-hot vector of its first token.
print(" The sentance is : ", data['Cleaned_Text'][0])
print("The OHE for", data['tokens'][0][0], "is : ",data['OHE'][0][0])

 The sentance is :  hi how are you
The OHE for hi is :  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [99]:
# Full encoding of the first sentence: one vocabulary-length vector per token.
print(" The sentance is : ", data['Cleaned_Text'][0],". and it  OHE is : ",data['OHE'][0])

 The sentance is :  hi how are you . and it  OHE is :  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]


# Bag of Words (BoW)

In [100]:
# Bag of Words: count how often each word occurs in each sentence.
# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, so the single-letter word "i" does not become a feature here.
bow = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one step.
bow_matrix  = bow.fit_transform(data['Cleaned_Text'])

# Column labels: one per learned word.
bow_fetures = bow.get_feature_names_out()

# Convert to a dense array for display; fine for a corpus this small.
bow_df = pd.DataFrame(bow_matrix.toarray(),columns=bow_fetures)

In [111]:
# List the learned words and how many there are; single-character tokens such as
# "i" are absent under CountVectorizer's default token_pattern.
print(bow_fetures)
print(" Total Features : ",len(bow_fetures))

['about' 'ai' 'always' 'am' 'and' 'anything' 'are' 'between' 'cricket'
 'excited' 'fine' 'hi' 'how' 'india' 'like' 'match' 'messi' 'pakistan'
 'play' 'ronaldo' 'technology' 'the' 'very' 'well' 'what' 'you']
 Total Features :  26


In [102]:
# One row per sentence, one column per word; values are raw occurrence counts.
bow_df.head()

Unnamed: 0,about,ai,always,am,and,anything,are,between,cricket,excited,...,messi,pakistan,play,ronaldo,technology,the,very,well,what,you
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,1,1,0,1,1,0,...,0,1,0,0,0,1,0,0,0,0
3,0,0,1,0,1,0,0,0,0,0,...,1,0,1,1,0,0,0,1,0,0
4,2,1,0,1,0,0,0,0,0,1,...,0,0,0,0,1,1,1,0,0,0


# TF-IDF

In [105]:
# TF-IDF: like Bag of Words, but each count is re-weighted so that words occurring
# in many sentences contribute less than words specific to a few.
# With the default settings each row is L2-normalised.
tfidf = TfidfVectorizer()

# Learn the vocabulary and IDF weights, and build the weighted document-term matrix.
tf_matrix = tfidf.fit_transform(data['Cleaned_Text'])

# Column labels: one per learned word.
tf_features = tfidf.get_feature_names_out()

# Convert to a dense array for display; fine for a corpus this small.
tf_df = pd.DataFrame(tf_matrix.toarray(),columns=tf_features)

In [112]:
# List the words learned by the TF-IDF vectorizer and how many there are.
print(tf_features)
print(" Total Features : ",len(tf_features))

['about' 'ai' 'always' 'am' 'and' 'anything' 'are' 'between' 'cricket'
 'excited' 'fine' 'hi' 'how' 'india' 'like' 'match' 'messi' 'pakistan'
 'play' 'ronaldo' 'technology' 'the' 'very' 'well' 'what' 'you']
 Total Features :  26


In [107]:
# One row per sentence, one column per word; values are TF-IDF weights rather than counts.
tf_df.head()

Unnamed: 0,about,ai,always,am,and,anything,are,between,cricket,excited,...,messi,pakistan,play,ronaldo,technology,the,very,well,what,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.523358,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.422242
1,0.405801,0.0,0.0,0.405801,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50298,0.405801
2,0.0,0.0,0.0,0.0,0.239987,0.297458,0.0,0.297458,0.297458,0.0,...,0.0,0.297458,0.0,0.0,0.0,0.239987,0.0,0.0,0.0,0.0
3,0.0,0.0,0.420669,0.0,0.339393,0.0,0.0,0.0,0.0,0.0,...,0.420669,0.0,0.420669,0.420669,0.0,0.0,0.0,0.420669,0.0,0.0
4,0.573889,0.35566,0.0,0.286945,0.0,0.0,0.0,0.0,0.0,0.35566,...,0.0,0.0,0.0,0.0,0.35566,0.286945,0.35566,0.0,0.0,0.0


# N-Grams

In [133]:
# N-grams: ngram_range=(1,2) counts single words (unigrams) and pairs of
# adjacent words (bigrams).
ngram = CountVectorizer(ngram_range=(1,2))
# Learn the n-gram vocabulary and build the count matrix in one step.
X = ngram.fit_transform(data['Cleaned_Text'])
# Column labels: one per learned unigram or bigram.
x_features = ngram.get_feature_names_out()

# Convert to a dense array for display; fine for a corpus this small.
gram_df = pd.DataFrame(X.toarray(),columns=x_features)

In [134]:
# List the learned unigrams and bigrams and how many there are.
print(x_features)
print(" Total Features : ",len(x_features))

['about' 'about about' 'about the' 'about you' 'ai' 'ai technology'
 'always' 'always play' 'am' 'am fine' 'am very' 'and' 'and india'
 'and ronaldo' 'anything' 'are' 'are you' 'between' 'between pakistan'
 'cricket' 'cricket match' 'excited' 'excited about' 'fine' 'fine what'
 'hi' 'hi how' 'how' 'how are' 'india' 'india like' 'like' 'like anything'
 'like the' 'match' 'match between' 'messi' 'messi and' 'pakistan'
 'pakistan and' 'play' 'play well' 'ronaldo' 'ronaldo always' 'technology'
 'the' 'the ai' 'the cricket' 'very' 'very excited' 'well' 'what'
 'what about' 'you']
 Total Features :  54


In [135]:
# One row per sentence, one column per unigram/bigram; values are occurrence counts.
gram_df.head()

Unnamed: 0,about,about about,about the,about you,ai,ai technology,always,always play,am,am fine,...,technology,the,the ai,the cricket,very,very excited,well,what,what about,you
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,0,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2,1,1,0,1,1,0,0,1,0,...,1,1,1,0,1,1,0,0,0,0


# BERT Encoding and Embedding

In [136]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the encoder weights for the same checkpoint so token ids and embeddings match.
model = BertModel.from_pretrained("bert-base-uncased")

# Switch to evaluation mode so the model's dropout layers are inactive at inference.
# As the last expression in the cell this also displays the model architecture.
model.eval()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [137]:
def encode_Embed(text, max_length=20):
    """Tokenize ``text`` and return its token ids, attention mask and [CLS] embedding.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        A single sentence to encode.
    max_length : int, default 20
        Fixed sequence length: shorter inputs are padded and longer ones are
        truncated to exactly this many token ids.

    Returns
    -------
    dict
        ``input_ids``      - list of ``max_length`` token ids,
        ``attention_mask`` - list of ``max_length`` flags as produced by the tokenizer,
        ``cls_embedding``  - list of floats: the last hidden state at position 0.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoded)
    # squeeze(0) removes only the leading batch axis. A bare squeeze() would
    # also collapse any other size-1 axis and could return a scalar instead of
    # a list.
    return {
        "input_ids": encoded["input_ids"].squeeze(0).tolist(),
        "attention_mask": encoded["attention_mask"].squeeze(0).tolist(),
        "cls_embedding": outputs.last_hidden_state[:, 0, :].squeeze(0).tolist(),
    }

In [138]:
# Encode each cleaned sentence separately (one model forward pass per row); every
# "bert_token" cell holds the dict returned by encode_Embed.
data["bert_token"] = data['Cleaned_Text'].apply(encode_Embed)

In [141]:
# Inspect the BERT encoding of the first sentence; each list is sliced to its first 10 entries.
print("the first sentance is : ", data["Cleaned_Text"][0])
print(" Input ID's : ", data["bert_token"][0]['input_ids'][0:10])
# NOTE: despite the "Attention Embedding" label, this prints the attention mask
# (the "attention_mask" entry), not an embedding.
print(" The Attention Embedding is : ",data["bert_token"][0]["attention_mask"][0:10])
# First 10 components of the [CLS] embedding vector.
print(" The CLS is : ",data["bert_token"][0]["cls_embedding"][0:10])

the first sentance is :  hi how are you
 Input ID's :  [101, 7632, 2129, 2024, 2017, 102, 0, 0, 0, 0]
 The Attention Embedding is :  [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
 The CLS is :  [0.09286491572856903, -0.026363983750343323, -0.12239329516887665, -0.07693439722061157, -0.27782443165779114, -0.6085917949676514, 0.2803332805633545, 0.3509485125541687, 0.07150653749704361, -0.1414613425731659]
