In [1]:
# Sample restaurant review used as the raw input text by the cells below.
# Spelled out as adjacent literals so the three line breaks (and the single
# space that precedes the first two) are visible instead of being hidden
# inside a triple-quoted block.
review = (
    "I had the most incredible dining experience at this restaurant! "
    "The food was absolutely delicious - every dish was perfectly \n"
    "prepared and beautifully presented. Our server was attentive, "
    "knowledgeable, and made excellent recommendations. \n"
    "The atmosphere was elegant yet comfortable. I can't wait to come back "
    "and try more items from their menu. Definitely a new favorite spot!\n"
)

In [3]:
from nltk import sent_tokenize, word_tokenize
import string

# Split the review into sentences, then turn each sentence into a list of
# lowercase word tokens with punctuation removed. `all_word` ends up holding
# one token list per sentence.
token = sent_tokenize(review)

# Individual punctuation characters. The filter below has to work per
# character: `w not in string.punctuation` is a substring test against one
# long string, so multi-character punctuation tokens (such as "..." or "--")
# would not match it and would slip through.
punct_chars = set(string.punctuation)

all_word = []

for i in token:
    word = [w.lower() for w in word_tokenize(i)]
    # Drop tokens made up solely of punctuation characters; tokens that merely
    # contain one (e.g. "n't") are kept.
    word = [w for w in word if not all(ch in punct_chars for ch in w)]
    all_word.append(word)

print(all_word)

[['i', 'had', 'the', 'most', 'incredible', 'dining', 'experience', 'at', 'this', 'restaurant'], ['the', 'food', 'was', 'absolutely', 'delicious', 'every', 'dish', 'was', 'perfectly', 'prepared', 'and', 'beautifully', 'presented'], ['our', 'server', 'was', 'attentive', 'knowledgeable', 'and', 'made', 'excellent', 'recommendations'], ['the', 'atmosphere', 'was', 'elegant', 'yet', 'comfortable'], ['i', 'ca', "n't", 'wait', 'to', 'come', 'back', 'and', 'try', 'more', 'items', 'from', 'their', 'menu'], ['definitely', 'a', 'new', 'favorite', 'spot']]


In [4]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set for quick membership checks.
stopword = set(stopwords.words('english'))

# Rebuild every tokenized sentence, keeping only the tokens that are not
# stop words. Sentence order and token order are preserved.
clean = []
for sen_tokens in all_word:
    kept = [tok for tok in sen_tokens if tok not in stopword]
    clean.append(kept)

# Bare expression so the notebook displays the filtered sentences.
clean

[['incredible', 'dining', 'experience', 'restaurant'],
 ['food',
  'absolutely',
  'delicious',
  'every',
  'dish',
  'perfectly',
  'prepared',
  'beautifully',
  'presented'],
 ['server',
  'attentive',
  'knowledgeable',
  'made',
  'excellent',
  'recommendations'],
 ['atmosphere', 'elegant', 'yet', 'comfortable'],
 ['ca', "n't", 'wait', 'come', 'back', 'try', 'items', 'menu'],
 ['definitely', 'new', 'favorite', 'spot']]

In [5]:
from nltk.stem import PorterStemmer,WordNetLemmatizer

# Two alternative normalizations of the stop-word-filtered tokens in `clean`:
# WordNet lemmatization and Porter stemming. Both keep the
# one-list-per-sentence layout.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# `lemmatize` is called with the token only (no `pos` argument), so the
# library's default part of speech applies. In the recorded output only
# plural nouns change ("recommendations" -> "recommendation"); verb forms
# such as "made" and "prepared" are left as they are.
lematized = [list(map(lemmatizer.lemmatize, sentence)) for sentence in clean]

# Stemming truncates tokens to a stem, which need not be a dictionary word
# (the recorded output has "incredible" -> "incred").
stemmed = [list(map(stemmer.stem, sentence)) for sentence in clean]

# Lemmatized result on the first output line, stemmed result on the second.
print(lematized, stemmed, sep='\n')

[['incredible', 'dining', 'experience', 'restaurant'], ['food', 'absolutely', 'delicious', 'every', 'dish', 'perfectly', 'prepared', 'beautifully', 'presented'], ['server', 'attentive', 'knowledgeable', 'made', 'excellent', 'recommendation'], ['atmosphere', 'elegant', 'yet', 'comfortable'], ['ca', "n't", 'wait', 'come', 'back', 'try', 'item', 'menu'], ['definitely', 'new', 'favorite', 'spot']]
[['incred', 'dine', 'experi', 'restaur'], ['food', 'absolut', 'delici', 'everi', 'dish', 'perfectli', 'prepar', 'beauti', 'present'], ['server', 'attent', 'knowledg', 'made', 'excel', 'recommend'], ['atmospher', 'eleg', 'yet', 'comfort'], ['ca', "n't", 'wait', 'come', 'back', 'tri', 'item', 'menu'], ['definit', 'new', 'favorit', 'spot']]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# The vectorizers take strings, so each sentence's token list is joined back
# into one space-separated document.
document1 = list(map(' '.join, lematized))   # lemmatized sentences
document2 = list(map(' '.join, stemmed))     # stemmed sentences

# Bag-of-words counts over the lemmatized documents.
# Both vectorizers run with default settings and tokenize the joined strings
# again themselves; the recorded vocabulary has no entry for "n't", for example.
vectrorizer = CountVectorizer()
bow = vectrorizer.fit_transform(document1)

# TF-IDF weights over the stemmed documents.
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(document2)

In [7]:
# Show the vocabulary learned by the count vectorizer, then the dense count
# matrix (one row per sentence document, one column per vocabulary term).
bow_terms = vectrorizer.get_feature_names_out()
print(bow_terms, bow.toarray(), sep='\n')

['absolutely' 'atmosphere' 'attentive' 'back' 'beautifully' 'ca' 'come'
 'comfortable' 'definitely' 'delicious' 'dining' 'dish' 'elegant' 'every'
 'excellent' 'experience' 'favorite' 'food' 'incredible' 'item'
 'knowledgeable' 'made' 'menu' 'new' 'perfectly' 'prepared' 'presented'
 'recommendation' 'restaurant' 'server' 'spot' 'try' 'wait' 'yet']
[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]]


In [8]:
# Show the stemmed vocabulary learned by the TF-IDF vectorizer, then the dense
# weight matrix (one row per sentence document, one column per term).
tfidf_terms = tfidf.get_feature_names_out()
print(tfidf_terms, matrix.toarray(), sep='\n')

['absolut' 'atmospher' 'attent' 'back' 'beauti' 'ca' 'come' 'comfort'
 'definit' 'delici' 'dine' 'dish' 'eleg' 'everi' 'excel' 'experi'
 'favorit' 'food' 'incred' 'item' 'knowledg' 'made' 'menu' 'new'
 'perfectli' 'prepar' 'present' 'recommend' 'restaur' 'server' 'spot'
 'tri' 'wait' 'yet']
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5        0.
  0.         0.         0.         0.5        0.         0.
  0.5        0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5        0.
  0.         0.         0.         0.        ]
 [0.33333333 0.         0.         0.         0.33333333 0.
  0.         0.         0.         0.33333333 0.         0.33333333
  0.         0.33333333 0.         0.         0.         0.33333333
  0.         0.         0.         0.         0.         0.
  0.33333333 0.33333333 0.33333333 0.         0.         0.
  0.         0.         0.         0.        