In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
import nltk
# nltk is already imported in an earlier cell; repeating the import is harmless.
# Fetch the "punkt_tab" resource used by NLTK's word/sentence tokenizers.
# download() reports when the package is already up to date and returns True.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Single-sentence sample used for the word-tokenization demo.
sentence = "My name is Merlyn"

In [5]:
# Split the sample sentence into word-level tokens.
word_token = word_tokenize(sentence)
# Bare name as the last expression so the notebook displays the token list.
word_token

['My', 'name', 'is', 'Merlyn']

In [6]:
# Rebind `sentence` to a two-sentence sample for the sentence-tokenization demo.
# NOTE: this overwrites the single-sentence value assigned earlier.
sentence = "My name is Merlyn. I am a student."

In [7]:
# Split the text into sentences (one list element per sentence).
sent_token = sent_tokenize(sentence)
# Bare name as the last expression so the notebook displays the sentence list.
sent_token

['My name is Merlyn.', 'I am a student.']

# Embeddings

## Bag of words

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Toy corpus for the vectorizer demos: four short documents, each paired with
# a 0/1 value stored in the "output" column.
texts = [
    "people watch dswithbappy",
    "dswithbappy watch dswithbappy",
    "people write comment",
    "dswithbappy write comment",
]
labels = [1, 1, 0, 0]

# Dict insertion order fixes the column order: "text" first, then "output".
df = pd.DataFrame({"text": texts, "output": labels})
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [12]:
# Learn the vocabulary from the "text" column and count each term per document
# in a single fit_transform call.
bow = cv.fit_transform(df["text"])

In [13]:
# Convert the count matrix to a dense array for display: one row per document,
# one column per vocabulary term. The columns follow the alphabetical
# vocabulary order (comment, dswithbappy, people, watch, write), so the 2 in
# the second row is "dswithbappy" occurring twice in that document.
bow.toarray()

array([[0, 1, 1, 1, 0],
       [0, 2, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [1, 1, 0, 0, 1]])

## TF-IDF (Term Frequency - Inverse Document Frequency)

In [14]:
# Rebuild the four-document toy frame so the TF-IDF section starts from a
# known `df`. Each tuple is (text, output).
rows = [
    ("people watch dswithbappy", 1),
    ("dswithbappy watch dswithbappy", 1),
    ("people write comment", 0),
    ("dswithbappy write comment", 0),
]

# Explicit column names keep the "text", "output" order.
df = pd.DataFrame(rows, columns=["text", "output"])
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [16]:
# Learn the vocabulary and document frequencies from the "text" column, then
# compute the TF-IDF weights for each document in one call.
arr = tfidf.fit_transform(df["text"])

In [17]:
# Convert the TF-IDF matrix to a dense array for display (rows = documents,
# columns = vocabulary terms). The zeros fall in the same positions as in the
# count matrix above. Each row looks L2-normalised: the third row is three
# equal weights of 0.57735027, i.e. about 1/sqrt(3).
arr.toarray()

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

## Advanced Embedding Techniques with Transformers

In [18]:
import transformers
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Load a pre-trained tokenizer by model name.
# "uncased" model: the tokens printed below come out lower-cased ("I" -> 'i').
# NOTE(review): presumably fetched from the Hugging Face Hub (or a local cache)
# on first use, so a fresh environment needs network access -- confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [20]:
# Sample sentence fed through every tokenizer step below.
text = "I was so not happy with the service today"

In [21]:
# Break the sample text into the tokenizer's vocabulary tokens (strings only;
# no IDs are produced at this step).
tokens = tokenizer.tokenize(text)
print(f"Tokens: {tokens}")

Tokens: ['i', 'was', 'so', 'not', 'happy', 'with', 'the', 'service', 'today']


In [22]:
# Look up the vocabulary ID of each token, one ID per token and in the same
# order. Only the given tokens are converted; nothing is added around them.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Input IDs: {input_ids}")


Input IDs: [1045, 2001, 2061, 2025, 3407, 2007, 1996, 2326, 2651]


In [23]:
# encode the text (tokenisation + conversion to IDs)
# Calling the tokenizer directly does both steps at once. Compared with the
# manual conversion above, `input_ids` gains 101 at the start and 102 at the
# end (BERT's [CLS] and [SEP] markers). The result also carries
# `token_type_ids` and `attention_mask`, one entry per ID.
encoded_input = tokenizer(text)
print("Encoded Input:", encoded_input)

Encoded Input: {'input_ids': [101, 1045, 2001, 2061, 2025, 3407, 2007, 1996, 2326, 2651, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [24]:
# Map the token IDs back to a string. `input_ids` is the list from the manual
# convert_tokens_to_ids step, which has no 101/102 entries, so the decoded
# text contains no special-token markers.
decoded_output = tokenizer.decode(input_ids)
print(f"Decoded Output: {decoded_output}")

Decoded Output: i was so not happy with the service today
