## Rules-based Model
A very simple rules-based model compares the tokens in the document against sets of "good" and "bad" tokens chosen by the programmer. The sentiment is positive if the number of "good" tokens is greater than the number of "bad" tokens, negative if the number of "bad" tokens is greater than the number of "good" tokens, and neutral if the two counts are equal.

In [None]:
from typing import Callable, Iterable


def predict(
    text,
    preprocessFunction: Callable = str.split,
    goodWords: Iterable | None = None,
    badWords: Iterable | None = None,
) -> int:
    defaultGoodWords = {"good", "like", "love", "great", "amazing"}
    defaultBadWords = {"bad", "hate", "horrible", "terrible", "awful"}

    goodWords = goodWords or defaultGoodWords
    badWords = badWords or defaultBadWords

    tokens = preprocessFunction(text)

    # sentiment = 0 => neutral
    # sentiment > 0 => positive
    # sentiment < 0 => negative

    sentiment = 0  # neutral sentiment by default
    for token in tokens:
        if token in goodWords:
            sentiment += 1
        elif token in badWords:
            sentiment -= 1

    return sentiment


In [None]:
# Model 1 (Basic Tokenizer, NO POS Tagging, NO Lemmatization)
# POS stands for Part of Speech like Noun, Verb, Adjective, Adverb
# Lemmatization is the process of finding the base form of a word.
# For example, the word "like" is the base form of "likes" and "liked".

# Toy documents shared by the model demos in this notebook.
corpus = [
    "A. R. Rahman is a good film composer and songwriter.",
    "Pineapple on pizzas tastes very bad.",
    "He likes anime. Steins;Gate is his favourite",
    "My introvert friend is TERRIBLE at communicating.",
]

# Score each document with predict()'s defaults and print a label
# chosen by the sign of the score, followed by a blank line.
for doc in corpus:
    print(doc)
    sentiment = predict(doc)
    label = (
        "Positive" if sentiment > 0
        else "Negative" if sentiment < 0
        else "Neutral"
    )
    print(label, end="\n\n")


In [None]:
# Model 2 (NLTK punkt tokenizer, NO POS Tagging, NO Lemmatization)

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the "punkt_tab" package rather than the pickled "punkt" one, so fetch
# both to keep the notebook working across NLTK versions.
nltk.download("punkt")
nltk.download("punkt_tab")


def preprocessPunkt(text):
    """Lower-case ``text`` and tokenize it with NLTK's ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)


# Show the tokens for every document, then the label chosen by the sign of
# the score that predict() returns with the Punkt-based preprocessing.
for doc in corpus:
    print(doc)
    print("Tokenized:", preprocessPunkt(doc))
    sentiment = predict(doc, preprocessPunkt)
    label = (
        "Positive" if sentiment > 0
        else "Negative" if sentiment < 0
        else "Neutral"
    )
    print(label, end="\n\n")


In [None]:
# Model 3 (NLTK punkt tokenizer, POS Tagging, Lemmatization using WordNet)
# WordNet is a lexical database of English. It is used to find the base form of a word.

from nltk.stem import WordNetLemmatizer

# Data for the WordNet lemmatizer.
nltk.download("omw-1.4")
nltk.download("wordnet")
# Data for nltk.pos_tag. Recent NLTK releases look for the language-specific
# "averaged_perceptron_tagger_eng" package, older ones for the unsuffixed
# name, so fetch both.
nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")

# Module-level lemmatizer instance used by the preprocessing function below.
lemmatizer = WordNetLemmatizer()


def preprocessLemmatize(text) -> list[str]:
    """Lower-case, tokenize, POS-tag and lemmatize ``text``.

    The first character of each tag returned by ``nltk.pos_tag`` selects the
    part-of-speech code passed to the lemmatizer ("N" -> "n", "V" -> "v",
    "J" -> "a", "R" -> "r"); any other tag falls back to "n".
    """
    wordnetPos = {"N": "n", "V": "v", "J": "a", "R": "r"}
    tagged = nltk.pos_tag(word_tokenize(text.lower()))
    return [
        lemmatizer.lemmatize(word, wordnetPos.get(posTag[0], "n"))
        for word, posTag in tagged
    ]


# Show the lemmatized tokens for every document, then the label chosen by the
# sign of the score that predict() returns with the lemmatizing preprocessing.
for doc in corpus:
    print(doc)
    print("Lemmatized Tokens:", preprocessLemmatize(doc))
    sentiment = predict(doc, preprocessLemmatize)
    label = (
        "Positive" if sentiment > 0
        else "Negative" if sentiment < 0
        else "Neutral"
    )
    print(label, end="\n\n")


## Working with Real-World Data (Amazon reviews)

Download from: https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih


In [None]:
import pandas as pd

# Use the fully qualified option name: the bare "max_colwidth" shorthand
# relies on pattern matching against all registered options and can stop
# resolving if another option with a similar name is added.
pd.set_option("display.max_colwidth", 800)

df = pd.read_csv("data.csv")
# Keep rows whose Score is greater than 1, then the first 1000 of those.
# NOTE(review): this drops 1-star reviews even though predictStars can
# return 1 — confirm the filter is intended.
df = df[df["Score"] > 1][:1000]
# Notebook cell output: preview the first 20 rows.
df[["Summary", "Text", "Score"]].head(20)

In [None]:
def predictStars(
    text,
    preprocessFunction=preprocessLemmatize,
    goodWords: Iterable | None = None,
    badWords: Iterable | None = None,
):
    """Convert the score from ``predict`` into a star rating from 1 to 5.

    Scores above 1 give 5 stars; 1, 0 and -1 give 4, 3 and 2 stars; every
    other score gives 1 star.
    """
    sentiment = predict(text, preprocessFunction, goodWords, badWords)
    if sentiment > 1:
        return 5
    # Exact scores near neutral map to the middle ratings; anything else
    # (i.e. below -1) falls through to the lowest rating.
    starsByScore = {1: 4, 0: 3, -1: 2}
    return starsByScore.get(sentiment, 1)


# Predict a star rating for every review text.
df["Prediction"] = df["Text"].apply(predictStars)
# Accuracy is the fraction of rows where the prediction equals the score.
# Taking the mean of the boolean mask is vectorised (no Python-level sum()
# over the Series) and gives NaN instead of ZeroDivisionError when the frame
# is empty.
score = (df["Prediction"] == df["Score"]).mean()
print("Accuracy:", round(100 * score, 3), "%")
# Notebook cell output: preview the first 20 rows with their predictions.
df[["Summary", "Text", "Score", "Prediction"]].head(20)