In [207]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

In [221]:
# Load the train and test splits of 20 Newsgroups, asking the loader to strip
# headers, footers and quoted replies from every post in both splits.
_STRIPPED_PARTS = ('headers', 'footers', 'quotes')
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, remove=_STRIPPED_PARTS)
    for split_name in ('train', 'test')
)

In [228]:
# Build TF-IDF features: learn the vocabulary and IDF weights on the training
# posts only, then reuse that fitted vectorizer to transform the test posts.
#
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same set as ENGLISH_STOP_WORDS). Passing the frozenset object itself is
# rejected by parameter validation in recent scikit-learn releases, which only
# accept 'english', a list, or None.
# max_df=0.08 drops terms that occur in more than 8% of the training documents.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.08)
X_train_vectors = vectorizer.fit_transform(newsgroups_train.data)
X_test_vectors = vectorizer.transform(newsgroups_test.data)

In [229]:
# Train a Complement Naive Bayes classifier (smoothing alpha=0.46) on the
# training TF-IDF matrix, then label every test document with it.
clf = ComplementNB(alpha=0.46)
clf.fit(X_train_vectors, newsgroups_train.target)
predicts = clf.predict(X_test_vectors)

In [230]:
# Weighted-average F1 of the test-set predictions (shown as the cell output).
f1_score(y_true=newsgroups_test.target, y_pred=predicts, average='weighted')

0.7138332457790261

In [231]:
# Macro-average F1 of the test-set predictions (shown as the cell output).
f1_score(y_true=newsgroups_test.target, y_pred=predicts, average='macro')

0.6996082212185294

In [151]:
# Sweep TF-IDF `max_df` (10 values in [0.06, 0.09]) against ComplementNB
# `alpha` (15 values in [0.35, 0.5]) and print one line per combination:
#     <weighted F1 on the test split> <alpha> <max_df>
#
# NOTE(review): every candidate is scored on newsgroups_test, so the best
# printed score is an optimistic estimate. Consider selecting hyperparameters
# on a validation split or with cross-validation over the training data.
# NOTE: this loop rebinds vectorizer, X_train_vectors, X_test_vectors, clf and
# predicts; once it finishes they hold the last grid point, so re-run the
# fitting cells before re-scoring the chosen model.
for max_df in np.linspace(0.06, 0.09, 10):
    # The TF-IDF matrices depend only on max_df, so build them once per max_df
    # value instead of once per (max_df, alpha) pair.
    # stop_words='english' is the built-in list (same set as
    # ENGLISH_STOP_WORDS); recent scikit-learn rejects a frozenset here.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=max_df)
    X_train_vectors = vectorizer.fit_transform(newsgroups_train.data)
    X_test_vectors = vectorizer.transform(newsgroups_test.data)
    for alpha in np.linspace(0.35, 0.5, 15):
        clf = ComplementNB(alpha=alpha).fit(X_train_vectors, newsgroups_train.target)
        predicts = clf.predict(X_test_vectors)
        print(f1_score(newsgroups_test.target, predicts, average='weighted'), alpha, max_df)
        
        


0.7137246978257326 0.35 0.06
0.7131880755422063 0.3607142857142857 0.06
0.7135551271306998 0.3714285714285714 0.06
0.7134260661397482 0.3821428571428571 0.06
0.7128147340707082 0.39285714285714285 0.06
0.712639380833793 0.4035714285714286 0.06
0.7123353664376769 0.41428571428571426 0.06
0.7125581544584413 0.425 0.06
0.712547816893407 0.4357142857142857 0.06
0.7133298128493906 0.4464285714285714 0.06
0.7133009589712676 0.45714285714285713 0.06
0.7133813076506276 0.46785714285714286 0.06
0.7136775786981622 0.47857142857142854 0.06
0.7135226480762317 0.4892857142857143 0.06
0.7134331928410167 0.5 0.06
0.7135370549810064 0.35 0.06333333333333332
0.7129347125957205 0.3607142857142857 0.06333333333333332
0.7129942478513603 0.3714285714285714 0.06333333333333332
0.7128836316169027 0.3821428571428571 0.06333333333333332
0.7124316050558308 0.39285714285714285 0.06333333333333332
0.7127130014669112 0.4035714285714286 0.06333333333333332
0.7128261258485635 0.41428571428571426 0.06333333333333332
