In [None]:
import pandas as pd
import nltk
import string

Download NLTK Resources and Import NLTK Helpers

In [None]:
# Fetch the NLTK data packages this notebook asks for, in this order. Both the
# older package names and their `_tab` / `_eng` counterparts are requested --
# presumably so the notebook runs on older and newer NLTK releases alike.
for nltk_package in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",
    "words",
):
    nltk.download(nltk_package)

# NLTK helpers used by the cells below: named-entity chunker, POS tagger,
# stop-word corpus and word tokenizer.
from nltk import ne_chunk, pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package ma

Load the Dataset

In [None]:
# Load the review data with the Python parsing engine. Malformed rows are
# skipped instead of raising, so the frame may hold fewer rows than the file.
df = pd.read_csv(
    "Reviews.csv",
    engine="python",
    on_bad_lines="skip",
)

In [None]:
# Work on the review text only: drop missing entries first, then keep the
# first 10,000 of the remaining reviews.
reviews = df["Text"].dropna().head(10000)

Preprocessing

In [None]:
# Maps every ASCII punctuation character to a single space. Built once here
# rather than on every call to `preprocess`.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess(text):
    """Lower-case a review and remove ASCII punctuation.

    Each punctuation character is replaced by a space instead of being
    deleted, so words separated only by punctuation remain separate words
    (e.g. "well-made" -> "well made", not "wellmade"; "good.I" -> "good i",
    not "goodi"). Runs of whitespace are then collapsed to a single space and
    leading/trailing whitespace is removed.

    Args:
        text: The review text as a ``str``.

    Returns:
        The normalised text: lower-case, free of ``string.punctuation``
        characters, with words separated by single spaces.
    """
    text = text.lower()
    # Replace (not delete) punctuation so adjacent words are not fused.
    text = text.translate(_PUNCT_TO_SPACE)
    # split()/join collapses the extra spaces the replacement introduced.
    return " ".join(text.split())

# Replace each review with its normalised form (element-wise).
reviews = reviews.map(preprocess)

Tokenization and Cleaning

In [None]:
# English stop words, held in a set for membership tests in the filter below.
stop_words = set(stopwords.words("english"))


def tokenize_and_clean(text):
    """Tokenize *text* and keep only alphabetic tokens that are not stop words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens, in their original order, for which
        ``str.isalpha()`` is true and which do not appear in ``stop_words``.
        The stop-word comparison is case-sensitive.
    """
    return [
        word
        for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    ]

# One cleaned token list per review, aligned with the index of `reviews`.
tokens_list = reviews.map(tokenize_and_clean)

POS Tagging

In [None]:
# Tag every review in a single batch. Per NLTK's tagging API, `pos_tag` sets
# up its tagger on each call, whereas `pos_tag_sents` does so once for the
# whole list of token lists; the per-review tags are the same.
# The result is wrapped back into a Series that keeps the index and name of
# `tokens_list`, so it lines up row-for-row with the earlier stages.
pos_tagged = pd.Series(
    nltk.pos_tag_sents(tokens_list.tolist()),
    index=tokens_list.index,
    name=tokens_list.name,
    dtype=object,  # each element is a list of (token, tag) tuples
)

Named Entity Recognition (NER)

In [None]:
# Chunk every tagged review in a single batch with `ne_chunk_sents`, so the
# chunker is set up once for all reviews rather than once per `ne_chunk` call;
# the per-review trees are the same.
# NOTE(review): the tokens reaching this step were lower-cased and had
# punctuation and stop words removed upstream. NE chunkers typically lean on
# capitalization and sentence context, so few entities may be found --
# consider chunking the tagged raw text instead. TODO confirm.
ner_results = pd.Series(
    list(nltk.ne_chunk_sents(pos_tagged.tolist())),
    index=pos_tagged.index,
    name=pos_tagged.name,
    dtype=object,  # each element is an NLTK chunk tree
)

Example Output

In [None]:
# Print each pipeline stage for the first review, from the normalised text
# through tokens and POS tags to the NER tree.
first_row = 0
for label, stage in (
    ("Sample Review:", reviews),
    ("\nTokens:", tokens_list),
    ("\nPOS Tags:", pos_tagged),
    ("\nNER Tree:", ner_results),
):
    print(label, stage.iloc[first_row])

Sample Review: i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than  most

Tokens: ['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']

POS Tags: [('bought', 'VBD'), ('several', 'JJ'), ('vitality', 'NN'), ('canned', 'VBD'), ('dog', 'JJ'), ('food', 'NN'), ('products', 'NNS'), ('found', 'VBD'), ('good', 'JJ'), ('quality', 'NN'), ('product', 'NN'), ('looks', 'VBZ'), ('like', 'IN'), ('stew', 'NN'), ('processed', 'VBN'), ('meat', 'NN'), ('smells', 'NNS'), ('better', 'RBR'), ('labrador', 'NN'), ('finicky', 'JJ'), ('appreciates', 'VBZ'), ('product', 'NN'), ('better', 'RBR')]

NER Tree: (S
  bought/VBD
  several/JJ
