In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [2]:
data = fetch_20newsgroups(subset='all',
                          categories=['alt.atheism', 'sci.space'])

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [10]:
X, y = data['data'], data['target']

In [13]:
vectorizer = TfidfVectorizer()

In [14]:
%%time
X_vectorized = vectorizer.fit_transform(X)

CPU times: user 445 ms, sys: 23.3 ms, total: 468 ms
Wall time: 526 ms


In [16]:
svm = SVC(kernel='linear', random_state=241)
kfold = KFold(n_splits=5, random_state=241)

In [17]:
params = {'C': [10 ** i for i in range(-5, 6)]}

In [19]:
grid = GridSearchCV(svm, params, scoring='accuracy',
                    cv=kfold, n_jobs=-1, verbose=1)

In [20]:
%%time
grid.fit(X_vectorized, y)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   47.3s finished


CPU times: user 1.95 s, sys: 107 ms, total: 2.06 s
Wall time: 49 s


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=False),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=241,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000, 10000, 100000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [37]:
weights = grid.best_estimator_.coef_.toarray()

In [27]:
sort = np.sort(np.abs(weights))

In [33]:
sort[0][-10:]

array([1.08992368, 1.10711424, 1.12672512, 1.1557961 , 1.19595386,
       1.2483784 , 1.25727196, 1.2999352 , 1.97192381, 2.72030446])

In [38]:
weights = pd.Series(np.abs(weights[0]))

In [43]:
top = weights.sort_values(ascending=False)[:10].index

In [46]:
features = vectorizer.get_feature_names()

In [48]:
words = []
for i in top:
    words.append(features[i])
    print(features[i])

space
god
atheism
atheists
moon
sky
religion
bible
keith
nick


In [52]:
for i in sorted(words):
    print(i, end=' ')

atheism atheists bible god keith moon nick religion sky space 