In [126]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV

In [3]:
# Load every post (subset='all') from the two newsgroups used as the two
# classes of this task. The first call downloads and caches the corpus.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [19]:
# Turn the raw posts into a sparse TF-IDF matrix; the vocabulary is learned
# from the same texts that are transformed.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Column index -> term lookup used later to name the heaviest coefficients.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = vectorizer.get_feature_names_out()
else:
    feature_mapping = vectorizer.get_feature_names()

In [30]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split with a fixed seed so the search is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Linear-kernel SVM; C is the only hyper-parameter being tuned.
clf = SVC(kernel='linear', random_state=241)

# Exhaustive search over the grid, scored by cross-validated accuracy.
gs = GridSearchCV(estimator=clf, param_grid=grid, cv=cv, scoring='accuracy')
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=241,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [42]:
# Refit a fresh linear SVM on the full data set using the C value that won
# the grid search.
best_c = gs.best_params_['C']
best_clf = SVC(kernel='linear', C=best_c, random_state=241)
best_clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=241,
    shrinking=True, tol=0.001, verbose=False)

In [123]:
# Absolute weight of every vocabulary term in the fitted linear SVM.
# `coef_` is a sparse matrix here (hence `.toarray()`); row 0 is the weight
# vector — presumably the only row, since the task has two classes.
coefs = np.abs(best_clf.coef_.toarray()[0])

# Indices of the 10 largest absolute weights, largest first.
# Fixed: this previously read `coef`, a name that is only defined in a later
# cell, so the cell raised NameError on a fresh kernel (Restart & Run All).
ind = coefs.argsort()[-10:][::-1]

# Map the indices back to terms and print them in alphabetical order.
words = [feature_mapping[i] for i in ind]
print(' '.join(sorted(words)))

atheism atheists bible god keith moon religion sci sky space


In [146]:
# Tabular view of the ranking: one row per vocabulary term with the absolute
# value of its SVM weight.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when it exists and fall back on older releases.
vocabulary = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
word = pd.DataFrame(vocabulary)

# `coef_` is sparse; densify it and flatten to a 1-D vector of |weights|.
abs_weights = np.abs(np.asarray(best_clf.coef_.todense()).reshape(-1))
coef = pd.DataFrame(abs_weights)

data = pd.concat([word, coef], axis=1)
data.columns = ['Word', 'Coef']

# Ascending sort by weight, keep the last 10 rows (the 10 heaviest terms),
# then order those alphabetically for display.
top_by_weight = data.sort_values(by=['Coef'])[-10:]
answ = top_by_weight.sort_values(by=['Word'])
answ

Unnamed: 0,Word,Coef
5088,atheism,1.25469
5093,atheists,1.24918
5776,bible,1.130612
12871,god,1.920379
15606,keith,1.097094
17802,moon,1.201611
21850,religion,1.139081
22936,sci,1.029307
23673,sky,1.180132
24019,space,2.663165


In [147]:
# Space-separated answer string built from the alphabetically ordered words.
' '.join(answ['Word'].tolist())

'atheism atheists bible god keith moon religion sci sky space'