In [None]:
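# Setup (run once if needed): %pip install nltk
# nltk.word_tokenize also needs tokenizer data: nltk.download("punkt")
# (newer NLTK releases name this resource "punkt_tab").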

In [99]:
import nltk

In [100]:
def extract_words(document: str) -> set:
    """Return the distinct lowercased tokens of `document` that contain a letter.

    The text is tokenized with `nltk.word_tokenize`. Tokens without a single
    alphabetic character (for example pure punctuation or digits) are skipped.

    Args:
        document: Raw text to tokenize.

    Returns:
        A set of lowercased tokens.
    """
    unique_words = set()
    for token in nltk.word_tokenize(document):
        # Keep a token only if at least one of its characters is alphabetic.
        has_letter = any(map(str.isalpha, token))
        if has_letter:
            unique_words.add(token.lower())
    return unique_words

In [101]:
# Parse each corpus file into a list of per-line word sets.
# result[0] comes from positive.txt and result[1] from negative.txt;
# every line of a file is treated as one document.
result = []
for filename in ["../data/sentiment/positive.txt", "../data/sentiment/negative.txt"]:
    # Decode explicitly instead of relying on the platform's default locale
    # encoding, so the extracted vocabulary is the same on every machine.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    with open(filename, encoding="utf-8") as f:
        result.append([extract_words(line) for line in f.read().splitlines()])

In [107]:
# Peek at the first parsed document of each corpus (positive first, then negative).
tuple(documents[0] for documents in result)

({'great', 'it', 'was'}, {'it', 'not', 'worth'})

In [106]:
# Vocabulary: every distinct word that occurs in any positive or negative document.
words = set().union(*result[0], *result[1])

In [112]:
def generate_features(documents: list, words: set, label: str) -> list:
    """Build labeled bag-of-words feature sets for a group of documents.

    Args:
        documents: Collections of words, one per document.
        words: The vocabulary; every word becomes one boolean feature.
        label: The label attached to every document in `documents`.

    Returns:
        A list with one `(features, label)` tuple per document, where
        `features` maps each vocabulary word to whether the document
        contains it.
    """

    def presence(document):
        # One boolean entry per vocabulary word.
        return {word: word in document for word in words}

    return [(presence(document), label) for document in documents]

In [113]:
# Labeled training examples: positive documents first, then negative ones.
training = [
    *generate_features(result[0], words, "Positive"),
    *generate_features(result[1], words, "Negative"),
]

In [114]:
def classify(classifier: nltk.NaiveBayesClassifier, query: str, words: set):
    """Score `query` with `classifier` using the same features as training.

    Args:
        classifier: The classifier to query.
        query: Raw text to classify.
        words: The vocabulary; each word becomes one boolean feature.

    Returns:
        The result of `classifier.prob_classify` for the query's features.
    """
    tokens = extract_words(query)
    # One boolean per vocabulary word: does the query contain it?
    return classifier.prob_classify({word: word in tokens for word in words})

In [115]:
# Fit a Naive Bayes classifier on the labeled (features, label) examples in `training`.
classifier = nltk.NaiveBayesClassifier.train(training)

In [116]:
def check_sentiment(query: str):
    """Print the probability of each sentiment label for `query`.

    Uses the notebook-level `classifier` and `words`. Each label is printed
    on its own line with its probability formatted to four decimal places.

    Args:
        query: Raw text to score.
    """
    distribution = classify(classifier, query, words)
    for key in distribution.samples():
        probability = distribution.prob(key)
        print(f"{key}: {probability:.4f}")

In [117]:
# Sanity check: an upbeat phrase.
check_sentiment(query="I had a great time")

Positive: 0.9488
Negative: 0.0512


In [118]:
# Sanity check: a critical phrase.
check_sentiment(query="kind of overpriced")

Positive: 0.0438
Negative: 0.9562
