In [1]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of the 20 Newsgroups corpus (documents in .data, labels in .target).
twenty_train = fetch_20newsgroups(shuffle=True, subset="train")

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and turn them into a
# document-by-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Shape is (number of documents, vocabulary size).
print("Count Vector : ", X_train_counts.shape)

Count Vector :  (11314, 130107)


In [3]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF; the matrix keeps the same shape
# as the count matrix it is derived from.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print("Tfidf : ", X_train_tfidf.shape)

Tfidf :  (11314, 130107)


In [4]:
print(" ---- Naive Bayes algorithm ---- ")

from sklearn.naive_bayes import MultinomialNB  # classification algorithm

# Stand-alone classifier trained directly on the pre-computed TF-IDF matrix.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

from sklearn.pipeline import Pipeline

# Same three stages as the manual steps above (counts -> TF-IDF -> Naive Bayes),
# bundled so that raw text can be passed straight to fit/predict.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used to address
# each stage's parameters, e.g. 'clf__alpha'.
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
).fit(twenty_train.data, twenty_train.target)

import numpy as np

# Evaluate the fitted pipeline on the held-out test split.
twenty_test = fetch_20newsgroups(shuffle=True, subset="test")
predicted = text_clf.predict(twenty_test.data)

# Accuracy = fraction of test documents whose predicted label equals the true label.
print("Accuracy : ", (predicted == twenty_test.target).mean())





 ---- Naive Bayes algorithm ---- 
Accuracy :  0.7738980350504514


In [12]:
print(' ---- Grid serach : Naive Bayes algorithm ---- ')  # search for the best parameters


from sklearn.model_selection import GridSearchCV

# Candidate values per pipeline stage; keys are '<step name>__<parameter>'.
parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf__alpha": (1e-2, 1e-3),
}

# n_jobs=-1: use every available CPU.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)

# best_score_ is the cross-validated score on the training data,
# not an accuracy measured on twenty_test.
print(gs_clf.best_score_)
print(gs_clf.best_params_)  # the best parameter combination found

# NOTE: `predicted` still holds the predictions of the un-tuned `text_clf`
# pipeline, so printing np.mean(predicted == twenty_test.target) here would
# not reflect the tuned model; predict with `gs_clf` first to evaluate it.

 ---- Grid serach : Naive Bayes algorithm ---- 




0.9067526957751458
{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [13]:
print("---- Naive bayes with Stemming ----")  # reduce each word to its root form

from nltk.stem.snowball import SnowballStemmer

# Shared English stemmer used by the stemming vectorizer defined below.
stemmer = SnowballStemmer(language="english")

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token produced by its analyzer.

    Stemming is delegated to the module-level ``stemmer`` object, which must
    be defined before the analyzer is called.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed tokens.

        The parent class's analyzer is built first; each token it yields is
        then passed through ``stemmer.stem``.
        """
        # Zero-argument super() binds to this class object itself rather than
        # looking the class up by its global name, so instances keep working
        # even if this cell is re-executed and the name is rebound.
        analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in analyzer(doc)]

        return stemmed_analyzer

# Besides stemming, this run also differs from the baseline pipeline in two
# other settings: English stop words are removed and the classifier is built
# with fit_prior=False.
stemmed_count_vect = StemmedCountVectorizer(stop_words="english")

text_mnb_stemmed = Pipeline(
    steps=[
        ("vect", stemmed_count_vect),
        ("tfidf", TfidfTransformer()),
        ("mnb", MultinomialNB(fit_prior=False)),
    ]
).fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

# Fraction of test documents classified correctly.
print("Accuracy : ", (predicted_mnb_stemmed == twenty_test.target).mean())

---- Naive bayes with Stemming ----
Accuracy :  0.8167817312798725
