Download dataset from: https://drive.google.com/uc?id=1-WZKE5xHw-3m_SL_PtOgwkzdFROIWqih

In [None]:
import pandas as pd

# Let long review texts be shown (almost) in full when a frame is displayed.
pd.set_option("max_colwidth", 800)

# Load the reviews and keep only the rows whose Score is greater than zero.
raw_df = pd.read_csv("data.csv")
hasPositiveScore = raw_df["Score"].gt(0)
raw_df = raw_df[hasPositiveScore]

# Small working sample: the first 1000 rows that survived the filter.
df = raw_df.iloc[:1000]


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the NLTK data packages used by the tokenizer, POS tagger and WordNet
# lemmatizer imported above.
# NLTK 3.9+ looks up "punkt_tab" and "averaged_perceptron_tagger_eng" in place
# of the older pickled "punkt" / "averaged_perceptron_tagger" packages, so both
# generations are requested. With its default arguments nltk.download reports
# a package it does not know and returns False instead of raising, so the
# extra names should be harmless on older releases.
for resource in (
    "punkt",
    "punkt_tab",
    "omw-1.4",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)


In [None]:
from collections import defaultdict

# One lemmatizer instance shared by every preprocessLemmatize call.
lemmatizer = WordNetLemmatizer()


def preprocessLemmatize(text):
    """Lower-case, tokenize and lemmatize ``text``.

    Every token is POS-tagged; the first letter of its tag picks the WordNet
    part of speech handed to the lemmatizer (N -> noun, V -> verb,
    J -> adjective, R -> adverb). Any other tag is treated as a noun.

    Returns the lemmas as a list, in token order.
    """
    posByTagInitial = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }
    taggedWords = nltk.pos_tag(word_tokenize(text.lower()))
    return [
        lemmatizer.lemmatize(word, posByTagInitial.get(posTag[0], wordnet.NOUN))
        for word, posTag in taggedWords
    ]


In [None]:
from typing import Callable, Iterable


def predict(
    text: str,
    preprocessFunction: Callable = preprocessLemmatize,
    goodWords: Iterable | None = None,
    badWords: Iterable | None = None,
) -> int:
    """Score ``text`` by counting "good" and "bad" tokens.

    Args:
        text: The document to score.
        preprocessFunction: Callable that turns ``text`` into an iterable of
            tokens.
        goodWords: Tokens that each add 1 to the score. ``None`` or an empty
            collection falls back to a small built-in list.
        badWords: Tokens that each subtract 1 from the score. ``None`` or an
            empty collection falls back to a small built-in list.

    Returns:
        The number of good-token hits minus the number of bad-token hits;
        0 means neutral. A token present in both collections counts as good.
    """
    defaultGoodWords = {"good", "like", "love", "great", "amazing"}
    defaultBadWords = {"bad", "hate", "horrible", "terrible", "awful"}

    # Materialize the word collections as sets once, up front. Testing
    # `token in goodWords` directly would exhaust a one-shot iterator after
    # the first token and costs a linear scan per token for a list.
    goodSet = set(goodWords) if goodWords is not None else set()
    badSet = set(badWords) if badWords is not None else set()
    # Empty input keeps the original behaviour of falling back to defaults.
    goodSet = goodSet or defaultGoodWords
    badSet = badSet or defaultBadWords

    tokens = preprocessFunction(text)

    sentiment = 0  # neutral sentiment by default
    for token in tokens:
        if token in goodSet:
            sentiment += 1
        elif token in badSet:
            sentiment -= 1

    return sentiment


def predictStars(
    text,
    preprocessFunction: Callable = preprocessLemmatize,
    goodWords: Iterable | None = None,
    badWords: Iterable | None = None,
):
    """Turn the sentiment score of ``text`` into a 1-5 star rating.

    The arguments are forwarded unchanged to ``predict``. A score of 0 maps
    to 3 stars, +1 to 4, -1 to 2; anything above +1 gives 5 stars and
    anything below -1 gives 1 star.
    """
    sentiment = predict(text, preprocessFunction, goodWords, badWords)
    # Centre the scale on 3 stars and clamp the result into [1, 5].
    return min(5, max(1, sentiment + 3))


## Machine Learning based Model
Instead of the programmer picking the "good" and "bad" tokens, we let the program build its own sets of "good" and "bad" tokens by training on the reviews. The tokens that appear most often in 5-star reviews are likely to be "good" tokens, and similarly the tokens that appear most often in 1-star reviews are likely to be "bad" tokens. Of course, it is not that simple, but we'll get to the challenging part in a bit.

In [None]:
from collections import Counter

# Token frequencies in 5-star reviews, in 1-star reviews, and in all reviews.
fiveStarTokenCounter, oneStarTokenCounter, allTokenCounter = Counter(), Counter(), Counter()
# Smoothing constant added to a token's overall count when normalizing later.
OFFSET = 10

# Which per-rating counter (if any) a review with a given score feeds.
countersByScore = {5: fiveStarTokenCounter, 1: oneStarTokenCounter}

for text, score in zip(df["Text"], df["Score"]):
    tokens = preprocessLemmatize(text)
    allTokenCounter.update(tokens)
    starCounter = countersByScore.get(score)
    if starCounter is not None:
        starCounter.update(tokens)


In [None]:
# Show the ten most frequent tokens of each rating group, heading first.
for heading, counter in (
    ("Good tokens:", fiveStarTokenCounter),
    ("\nBad tokens:", oneStarTokenCounter),
):
    print(heading)
    for token, count in counter.most_common(10):
        print(f"{token} ({count})", end=", ")


## The Challenging Part
The "good" and "bad" tokens found by taking the most common words in 5-star and 1-star reviews happen to be the most common words you'd find in any review (Duh!!).<br>
To overcome this issue, we must come up with a way to penalize the tokens that occur in all reviews and not only in the 5-star and 1-star reviews we're concerned about. We can do so by dividing the frequency of the token in 5-star or 1-star reviews by its total frequency in all reviews.<br>
But this also means that tokens that are really rare, like an ingredient's name that may appear only a few times, will be ranked high (the denominator will be low). We definitely don't want that. To fix this, we'll assume that we've seen every word at least `OFFSET` times, i.e. we add `OFFSET` to the denominator.

In [None]:
def _normalizedFrequencies(starCounter):
    """Divide each token's count in ``starCounter`` by its overall count plus OFFSET."""
    return {
        token: count / (allTokenCounter[token] + OFFSET)
        for token, count in starCounter.items()
    }


def _byScoreDescending(scores):
    """Return the (token, score) pairs of ``scores`` ordered from highest to lowest score."""
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


# "Good" words: the ten tokens with the highest normalized 5-star frequency.
fiveStarNormalized = _normalizedFrequencies(fiveStarTokenCounter)
sortedFiveStarNormalized = _byScoreDescending(fiveStarNormalized)
goodWords = [pair[0] for pair in sortedFiveStarNormalized[:10]]

# "Bad" words: the ten tokens with the highest normalized 1-star frequency.
oneStarNormalized = _normalizedFrequencies(oneStarTokenCounter)
sortedOneStarNormalized = _byScoreDescending(oneStarNormalized)
badWords = [pair[0] for pair in sortedOneStarNormalized[:10]]

print("Good words:", ", ".join(map(str, goodWords)))
print("Bad words:", ", ".join(map(str, badWords)))

The "good" and "bad" tokens now are promising. Let's go ahead and predict the number of stars.

In [None]:
# Attach the predictions with assign(), which returns a new frame, rather than
# writing a column into `df` in place: `df` was obtained by slicing a filtered
# frame, and an in-place column write on such a slice can raise pandas'
# SettingWithCopyWarning.
df = df.assign(
    Prediction=df["Text"].apply(
        lambda doc: predictStars(
            doc,
            goodWords=goodWords,
            badWords=badWords,
        )
    )
)
# Accuracy: the fraction of reviews whose predicted stars equal the true score.
score = (df["Prediction"] == df["Score"]).mean()
print("Accuracy:", round(100 * score, 3), "%")
df[["Summary", "Text", "Score", "Prediction"]].head(20)


## Vectorization
Vectorization is a methodology in NLP to map words or phrases from vocabulary to a corresponding vector of real numbers. By vectorizing the documents, we can convert the document from a sequence of tokens (strings) to an array of numbers which will be used in regression models.<br>
Note that vectorizing a document discards the order in which its tokens occurred. Since we didn't care about order to begin with, that is fine.

In [None]:
# Toy corpus used to illustrate vectorization.
corpus = [
    "You like dogs",
    "You like cats",
    "You love dogs",
    "You love cats",
]

# Vocabulary: every distinct lemma found in the corpus, in sorted order.
vocab = sorted({lemma for doc in corpus for lemma in preprocessLemmatize(doc)})
print(vocab)

In [None]:
# Position of each vocabulary word inside a count vector.
wordToIndex = dict(zip(vocab, range(len(vocab))))


def _countVector(doc):
    """Return how often each vocabulary word occurs in ``doc``, in vocab order."""
    counts = [0 for _ in vocab]
    for token in preprocessLemmatize(doc):
        counts[wordToIndex[token]] += 1
    return counts


vectors = [_countVector(doc) for doc in corpus]

print(*vectors, sep="\n")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Same bag-of-words idea, this time using scikit-learn's CountVectorizer:
# learn the vocabulary from the corpus, then turn the corpus into counts.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
vectors = vectorizer.transform(corpus)
print(vectors.toarray())

In [None]:
# Work with the first 10000 filtered reviews from here on.
df = raw_df.iloc[:10000]

# The first 90% of the rows train the model; the remaining 10% are held out.
splitIndex = int(len(df) * 0.9)
train_df, test_df = df.iloc[:splitIndex], df.iloc[splitIndex:]

# Learn the vocabulary on the training texts only, then reuse it unchanged
# to vectorize the held-out texts.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_df["Text"])
test_features = vectorizer.transform(test_df["Text"])

## Logistic Regression
Oooh, finally to the machine learning part. Logistic regression is one of the most popular Machine Learning algorithms, which comes under the Supervised Learning technique. It is used for predicting the categorical dependent variable using a given set of independent variables.<br>
Logistic regression predicts the output of a categorical dependent variable, so the outcome must be a categorical or discrete value such as Yes or No, 0 or 1, True or False. Rather than giving an exact 0 or 1, however, it gives probabilistic values that lie between 0 and 1.

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the bag-of-words features.
# max_iter is raised above scikit-learn's default of 100: on raw, unscaled
# count features the default solver is likely to stop before converging
# (ConvergenceWarning), leaving a less well-fitted model.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(train_features, train_df["Score"])

# score() reports the mean accuracy on the held-out reviews.
score = lr_classifier.score(test_features, test_df["Score"])
print("Accuracy:", round(100 * score, 3), "%")