# Naive Bayes Classifier

In [None]:
import nltk
from nltk.corpus import movie_reviews
# Download the movie_reviews corpus data read by the cells below.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [None]:
from collections import defaultdict, Counter
import math
import random

# Split the corpus into train and test sets: each review is held out for
# testing when a seeded random draw from range(5) comes up 0.
train_X, train_Y = [], []
test_X, test_Y = [], []

random.seed(0)
for polarity in movie_reviews.categories():
    for fid in movie_reviews.fileids(polarity):
        held_out = random.randrange(5) == 0
        # Pick the destination lists once instead of duplicating the appends.
        split_X, split_Y = (test_X, test_Y) if held_out else (train_X, train_Y)
        split_X.append(list(movie_reviews.words(fid)))
        split_Y.append(polarity)

print(train_X[0], train_Y[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

## Model Construction

$\bar{y} = \text{arg}\max_{y \in \mathbf{y}} P(y|x) = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n \frac{P(x_i|y)}{P(x_i)} = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n P(x_i|y)$

$P(x_i|y)=\frac{C(x_i, y) + k}{C(y) + |\mathbf{y}| \times k}$

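$\bar{y} = \textrm{arg} \max_{y \in \mathbf{y}} \log P(y) + \sum_{i=1}^n \log \frac{C(x_i, y) + k}{C(y) + k|\mathbf{y}|}$

Note: the `NaiveBayesClassifier` implemented below uses a slightly different smoothing from the formula above. It estimates $P(x_i|y) = \frac{C(x_i, y) + k \, P(x_i)}{N_y + k}$, where $P(x_i)$ is the share of all feature counts (summed over classes) that belong to $x_i$, and $N_y$ is the sum of all feature counts in class $y$.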

     

In [None]:
# Download the NLTK resources used by the cells below, in a fixed order.
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(resource)

class NaiveBayesClassifier:
    """Naive Bayes text classifier over the presence of lemmatised n-grams.

    Each document is reduced to the set of n-grams it contains (after removing
    punctuation, digit-only tokens and stop words, then lemmatising).
    Training keeps the `n` most frequent n-grams of the training corpus as
    features and counts, per class, how many documents contain each of them.

    Attributes:
        k: smoothing strength used by `probabilities`.
        ngram_range: inclusive ``(min_n, max_n)`` n-gram lengths to generate.
        features: set of n-grams kept as features by `train`.
        class_feature_counts: label -> Counter mapping each feature to the
            number of training documents of that label containing it.
        class_counts: label -> number of training documents.
        total: total number of training documents.
    """

    # Corpus share assumed for a token that has no training occurrences, so
    # that its smoothed probability stays strictly positive (log-safe).
    _UNSEEN_TOKEN_SHARE = 1e-6

    def __init__(self, k=0.3, ngram_range=(1, 2)):
        """Create an untrained classifier.

        Args:
            k: smoothing strength (see `probabilities`).
            ngram_range: inclusive ``(min_n, max_n)`` n-gram lengths.
        """
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer
        self.k = k
        self.features = set()
        self.class_feature_counts = defaultdict(Counter)
        self.class_counts = Counter()
        self.total = 0
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.ngram_range = ngram_range

    def get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to the matching WordNet POS constant.

        Tags starting with J/V/N/R map to adjective/verb/noun/adverb; any
        other tag falls back to noun.
        """
        from nltk.corpus import wordnet
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return by_initial.get(treebank_tag[:1], wordnet.NOUN)

    def _filter_tokens(self, tokens):
        """Return lower-cased tokens with punctuation, digits and stop words removed.

        A token is dropped when it is empty or made up entirely of punctuation
        characters, when `str.isdigit` is true for it, or when its lower-cased
        form is in `self.stop_words`.
        """
        import string

        punctuation = set(string.punctuation)
        kept = []
        for token in tokens:
            # Character-wise test: a substring test against string.punctuation
            # would let multi-character tokens such as '--' or '...' through.
            # all() over an empty token is True, so empty tokens are dropped too.
            if all(ch in punctuation for ch in token):
                continue
            if token.isdigit():
                continue
            lowered = token.lower()
            if lowered in self.stop_words:
                continue
            kept.append(lowered)
        return kept

    def Text_Cleaning(self, tokens):
        """Turn raw tokens into a list of lemmatised n-gram strings.

        Args:
            tokens: iterable of token strings for one document.

        Returns:
            A list of n-grams, each joined with single spaces, for every n in
            `self.ngram_range` (inclusive), built from the filtered and
            lemmatised tokens. Duplicates are preserved.
        """
        from nltk import pos_tag
        from nltk.util import ngrams

        filtered = self._filter_tokens(tokens)

        # Lemmatise each token using the POS tag assigned by the tagger.
        lemmas = [
            self.lemmatizer.lemmatize(token, self.get_wordnet_pos(pos))
            for token, pos in pos_tag(filtered)
        ]

        min_n, max_n = self.ngram_range[0], self.ngram_range[1]
        return [
            ' '.join(gram)
            for size in range(min_n, max_n + 1)
            for gram in ngrams(lemmas, size)
        ]

    def train(self, raw_train_X, train_Y, n=4000):
        """Fit the model on tokenised documents and their labels.

        Counts accumulate on the instance, so calling `train` again adds to
        the existing counts.

        Args:
            raw_train_X: iterable of documents, each an iterable of tokens.
            train_Y: iterable of labels, one per document.
            n: number of most frequent n-grams to keep as features.

        Raises:
            ValueError: if the number of documents and labels differ.
        """
        documents = list(raw_train_X)
        labels = list(train_Y)
        if len(documents) != len(labels):
            # zip() would otherwise silently drop the unmatched tail.
            raise ValueError(
                "got %d documents but %d labels" % (len(documents), len(labels))
            )

        cleaned_X = [self.Text_Cleaning(tokens) for tokens in documents]

        # Select the n most frequent n-grams over the whole training corpus.
        token_counts = Counter()
        for cleaned in cleaned_X:
            token_counts.update(cleaned)
        top_tokens = {token for token, _ in token_counts.most_common(n)}

        # Keep only the selected n-grams and count document-level presence.
        for tokens, label in zip(cleaned_X, labels):
            self.class_counts[label] += 1
            self.total += 1

            present = set(tokens) & top_tokens
            if present:
                self.features.update(present)
                self.class_feature_counts[label].update(present)

    def _class_totals(self):
        """Return ({label: sum of its feature counts}, sum over all labels)."""
        totals = {
            cls: sum(counts.values())
            for cls, counts in self.class_feature_counts.items()
        }
        return totals, sum(totals.values())

    def _smoothed_probabilities(self, token, class_totals, grand_total):
        """Compute per-class smoothed probabilities from precomputed totals."""
        token_freq = sum(
            counts.get(token, 0) for counts in self.class_feature_counts.values()
        )
        # Share of all feature counts that belong to this token. Tokens with
        # no training occurrences get a tiny positive share instead of 0 so
        # callers can safely take logs or normalise the result.
        if token_freq > 0:
            pw = token_freq / grand_total
        else:
            pw = self._UNSEEN_TOKEN_SHARE

        alpha = self.k
        return {
            cls: (counts.get(token, 0) + alpha * pw) / (class_totals[cls] + alpha)
            for cls, counts in self.class_feature_counts.items()
        }

    def probabilities(self, token):
        """Return the smoothed estimate of P(token | class) for every class.

        The estimate is ``(count + k * pw) / (Nc + k)`` where ``count`` is the
        number of documents of the class containing `token`, ``Nc`` is the sum
        of all feature counts of the class, and ``pw`` is the token's share of
        all feature counts across classes.

        Args:
            token: an n-gram string.

        Returns:
            dict mapping each class with recorded feature counts to a float.
        """
        class_totals, grand_total = self._class_totals()
        return self._smoothed_probabilities(token, class_totals, grand_total)

    def predict(self, tokens):
        """Classify one tokenised document.

        Only n-grams that are both present in the document and known features
        contribute to the score.

        Args:
            tokens: iterable of raw token strings.

        Returns:
            ``(label, log_probs)`` where `log_probs` is a Counter mapping each
            class to its unnormalised log score and `label` is the
            highest-scoring class.

        Raises:
            ValueError: if the model has not been trained.
        """
        if self.total == 0:
            raise ValueError("predict() called before train(): no training data")

        observed = set(self.Text_Cleaning(tokens)) & self.features

        # Start from the log prior of each class.
        log_probs = Counter()
        for cls, cls_cnt in self.class_counts.items():
            log_probs[cls] = math.log(cls_cnt / self.total)

        # The per-class totals do not depend on the token: compute them once.
        class_totals, grand_total = self._class_totals()
        for token in observed:
            probs = self._smoothed_probabilities(token, class_totals, grand_total)
            for cls, prob in probs.items():
                log_probs[cls] += math.log(prob)
        return max(log_probs, key=log_probs.get), log_probs



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Show the raw tokens of the first training review.
print(train_X[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

In [None]:
# Show the n-grams that Text_Cleaning produces for the first training review.
model = NaiveBayesClassifier()
print(model.Text_Cleaning(train_X[0]))

['plot', 'two', 'teen', 'couple', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guy', 'die', 'girlfriend', 'continue', 'see', 'life', 'nightmare', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touch', 'cool', 'idea', 'present', 'bad', 'package', 'make', 'review', 'even', 'hard', 'one', 'write', 'since', 'generally', 'applaud', 'film', 'attempt', 'break', 'mold', 'mess', 'head', 'lose', 'highway', 'memento', 'good', 'bad', 'way', 'make', 'type', 'film', 'folk', 'snag', 'one', 'correctly', 'seem', 'take', 'pretty', 'neat', 'concept', 'execute', 'terribly', 'problem', 'movie', 'well', 'main', 'problem', 'simply', 'jumble', 'start', 'normal', 'downshift', 'fantasy', 'world', 'audience', 'member', 'idea', 'go', 'dreams', 'character', 'come', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparition', 'disappearance', 'looooot', 'chase', 'scene', 'ton', 'weird', 'thing', 'happen', 'simply', 'expla

## Using the Model

In [None]:
# Build a fresh classifier with the default settings and fit it on the training split.
model = NaiveBayesClassifier()
model.train(train_X, train_Y)

In [None]:
from nltk.tokenize import word_tokenize
# Download the 'punkt' resource (presumably needed by word_tokenize below; TODO confirm).
nltk.download('punkt')

# Taken from https://www.imdb.com/review/rw0990793/?ref_=tt_urv
review = """A whimsical, often spectacular view of a future in which advances in technology dominate the world. It is well shot and although slow-moving it is intense and enjoyable throughout. The featuring of classical music to establish atmosphere works brilliantly; it provides a feeling of awe, mystery and intrigue  the same aura that Walt Disney worked in creating 'Fantasia'. The special effects, both sound and visual, are still spellbinding by the standards of today's technology. Aside from the technical pluses of the film, it stands strong as it is one of not many films out there that has something important to say about humankind, and where the human race is heading in terms of our increasing reliance on machines and our unquenchable thirst to discover. Despite an ending that is hard to understand, it is even harder to overlook this film a true cinema classic."""

# Tokenise the review and classify it; predict returns (label, per-class log scores).
model.predict(word_tokenize(review))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('pos', Counter({'neg': -478.63064522402584, 'pos': -466.34256029901144}))

In [None]:
# Accuracy on the held-out reviews: `total` tracks how many documents were
# scored, `correct` how many predictions matched the gold label.
correct = total = 0

for total, (x, y) in enumerate(zip(test_X, test_Y), start=1):
    prediction, _ = model.predict(x)
    correct += int(prediction == y)

print("%d / %d = %g" % (correct, total, correct / total))

359 / 422 = 0.850711


## Exploring important features

In [None]:
def prob_class_given_feature(feature, cls, model):
    """Return the share of `cls` among the per-class probabilities of `feature`.

    Args:
        feature: feature (n-gram) to look up.
        cls: class label whose share is wanted.
        model: object whose ``probabilities(feature)`` returns a dict mapping
            class label to a probability.

    Returns:
        ``probs[cls] / sum(probs.values())``, or 0.0 when every class
        probability is zero (the ratio is undefined in that case).

    Raises:
        KeyError: if `cls` is not one of the classes returned by the model.
    """
    probs = model.probabilities(feature)
    share = probs[cls]
    normaliser = sum(probs.values())
    if normaliser == 0:
        # Avoid ZeroDivisionError when the model has no evidence for the feature.
        return 0.0
    return share / normaliser

# For each class, list the 30 features whose normalised probability for that
# class is highest ("pos" first, then "neg").
for cls in ("pos", "neg"):
    ranked = sorted(
        model.features,
        key=lambda t: prob_class_given_feature(t, cls, model),
        reverse=True,
    )
    print(ranked[:30])

['argento', 'mallory', 'gattaca', 'shrek', 'sweetback', 'ghost dog', 'mulan', 'leila', 'guido', 'ordell', 'outstanding', 'dark city', 'fairy tale', 'refresh', 'wonderfully', 'jackie brown', 'qui gon', 'anakin', 'qui', 'gon', 'coen', 'jude', 'matt damon', 'big lebowski', 'jedi', 'lebowski', 'damon', 'german', 'marvelous', 'larry flynt']
['sphere', 'brenner', 'krippendorf', 'nbsp', 'schumacher', 'welles', 'henstridge', 'end day', 'video game', 'mission mar', 'ludicrous', 'one bad', 'turkey', 'bad movie', 'wild west', 'stupidity', 'seagal', 'worst', 'waste time', 'poorly', 'palmetto', 'idiotic', 'uninspired', 'inept', 'batman robin', 'uninteresting', 'whole thing', 'movie bad', 'natasha', 'judith']
