In [1]:
from sklearn import datasets
import matplotlib.pyplot as plt

# Fetch the whole 20 Newsgroups corpus (subset='all'), restricted to the two
# topics the classifier has to tell apart.
selected_categories = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=selected_categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.svm import SVC
import numpy as np

# Learn the TF-IDF vocabulary and IDF weights from every document in the corpus.
# Only the fitted state is needed here: the feature matrix is produced later by
# `vectorizer.transform`, so `fit` avoids building a matrix that is thrown away.
# NOTE: because the vectorizer is fitted on the full corpus, the IDF statistics
# are shared by all cross-validation folds.
vectorizer = TfidfVectorizer()
vectorizer.fit(newsgroups['data'])

# Shuffled 5-fold splitter and a linear-kernel SVM, both with a fixed seed so
# that repeated runs give the same folds and the same model.
splitter = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)

In [17]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Exhaustive search over the candidates, scored by accuracy on the CV folds.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=splitter,
)

In [20]:
# TF-IDF feature matrix for every document, and the matching class labels.
x, y = vectorizer.transform(newsgroups.data), newsgroups.target

In [21]:
# Run the cross-validated search over every candidate value of C.
gs.fit(X=x, y=y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [23]:
# C = 1.0 is the smallest candidate that reaches the best accuracy (see output).
# `grid_scores_` was removed from GridSearchCV in scikit-learn 0.20;
# `cv_results_` exposes the same per-candidate mean score and parameters.
cv_results = gs.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
    print(mean_score)
    print(params, end='\n\n')

0.5526315789473685
{'C': 1e-05}

0.5526315789473685
{'C': 0.0001}

0.5526315789473685
{'C': 0.001}

0.5526315789473685
{'C': 0.01}

0.9501679731243001
{'C': 0.1}

0.9932810750279956
{'C': 1.0}

0.9932810750279956
{'C': 10.0}

0.9932810750279956
{'C': 100.0}

0.9932810750279956
{'C': 1000.0}

0.9932810750279956
{'C': 10000.0}

0.9932810750279956
{'C': 100000.0}





In [24]:
# Retrain on the full data set with the C chosen by the grid search, instead of
# a value copied by hand from the printed scores. When several candidates tie,
# `best_params_` reports the first of them in grid order.
best_clf = SVC(C=gs.best_params_['C'], kernel='linear', random_state=241)
best_clf.fit(x, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [96]:
# Absolute weight of every feature as a flat 1-D array. `ravel()` takes the
# length from the fitted model, so no vocabulary size is hard-coded here.
abs_coefs = np.abs(best_clf.coef_.toarray()).ravel()

In [104]:
# Indices of the ten largest absolute weights, in ascending order of weight.
mvw = abs_coefs.argsort()[-10:]

In [107]:
# Map the selected feature indices back to their vocabulary terms.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on releases without it.
try:
    vocab = vectorizer.get_feature_names_out()
except AttributeError:
    vocab = vectorizer.get_feature_names()

# Keep the terms in the order given by `mvw` and echo each one.
words = [vocab[index] for index in mvw]
for word in words:
    print(word)

sci
keith
bible
religion
sky
moon
atheists
atheism
god
space


In [110]:
# Show the top words alphabetically. Sorting here, rather than relying on an
# earlier in-place sort, keeps the displayed order reproducible on a fresh run.
sorted(words)

['atheism',
 'atheists',
 'bible',
 'god',
 'keith',
 'moon',
 'religion',
 'sci',
 'sky',
 'space']