In [2]:
import random
import nltk
from nltk.corpus import reuters
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the corpora this cell relies on (Reuters articles and the stop-word lists).
for corpus_name in ("reuters", "stopwords"):
    nltk.download(corpus_name)


# The two Reuters topics the classifier learns to tell apart.
categories = ['acq', 'earn']

# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracy further down, is the same on every run.
SHUFFLE_SEED = 42

# One (token list, label) pair per file listed under each category.
# NOTE(review): Reuters files can carry several topic labels; a file tagged
# with both 'acq' and 'earn' would appear here twice with conflicting labels
# and could land in both the training and the test split -- confirm whether
# such files should be filtered out.
documents = [(list(reuters.words(fileid)), category)
             for category in categories
             for fileid in reuters.fileids(category)]

# Shuffle with a dedicated, seeded generator rather than the global one so the
# order is reproducible without touching the module-wide random state.
random.Random(SHUFFLE_SEED).shuffle(documents)

# Lazily filled cache for the preprocessing helpers shared by every call.
_FEATURE_TOOLS = {}


def _feature_tools():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _FEATURE_TOOLS:
        _FEATURE_TOOLS["stop_words"] = frozenset(stopwords.words("english"))
        _FEATURE_TOOLS["stemmer"] = PorterStemmer()
    return _FEATURE_TOOLS["stop_words"], _FEATURE_TOOLS["stemmer"]


def document_features(document):
    """Build a stem-frequency feature mapping for one document.

    Tokens that are not purely alphabetic are dropped, the rest are
    lower-cased, English stop words are removed, and the survivors are
    reduced to their Porter stems.

    Args:
        document: Iterable of token strings.

    Returns:
        A ``FreqDist`` mapping each stem to the number of times it occurs in
        the document.
    """
    # Reuse one stop-word set and one stemmer across calls instead of
    # rebuilding both for every document.
    stop_words, stemmer = _feature_tools()

    # Lower-case each alphabetic token once, then filter and stem it.
    lowered = (word.lower() for word in document if word.isalpha())
    stems = [stemmer.stem(token) for token in lowered if token not in stop_words]

    return FreqDist(stems)

# Turn every (tokens, label) pair into a (feature mapping, label) pair.
featuresets = [
    (document_features(tokens), label)
    for tokens, label in documents
]

# The first 80% of the examples are used for training, the rest for testing.
train_size = int(len(featuresets) * 0.8)
train_set = featuresets[:train_size]
test_set = featuresets[train_size:]

# Fit the Naive Bayes model on the training portion only.
classifier = NaiveBayesClassifier.train(train_set)

# Score the model on the held-out examples and report the result.
accuracy = classify.accuracy(classifier, test_set)
print(f"Classifier Accuracy: {accuracy:.2%}")

print("\nMost Informative Features:")
classifier.show_most_informative_features(n=10)


[nltk_data] Downloading package reuters to C:\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classifier Accuracy: 90.53%

Most Informative Features:
Most Informative Features
                     qtr = 1                earn : acq    =    216.1 : 1.0
                     net = 3                earn : acq    =    174.1 : 1.0
                  intent = 1                 acq : earn   =    119.9 : 1.0
                     avg = 2                earn : acq    =    115.4 : 1.0
                     shr = 4                earn : acq    =    114.6 : 1.0
                      vs = 8                earn : acq    =    105.3 : 1.0
                      ct = 4                earn : acq    =    103.2 : 1.0
                     shr = 1                earn : acq    =     86.4 : 1.0
                  tender = 2                 acq : earn   =     80.8 : 1.0
                  payout = 1                earn : acq    =     75.1 : 1.0
