In [30]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter('ignore')

In [2]:
# Fetch the posts of the two newsgroups that form the binary task.
# subset='all' requests the complete dataset rather than a single split.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


In [22]:
# Class labels come straight from the dataset object.
y = newsgroups.target

# Learn the vocabulary and IDF weights on the raw posts and encode them
# as a TF-IDF document-term matrix in a single pass.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)

In [23]:
# Bare expression: lets the notebook display the repr of the TF-IDF matrix
# returned by vectorizer.fit_transform (its shape and storage format).
X

<1786x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [20]:
# Column index -> token lookup for the TF-IDF matrix.
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer its replacement when available and fall back to the
# old method on older installations. Either way the result is a plain list
# of Python strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = vectorizer.get_feature_names_out().tolist()
else:
    feature_mapping = vectorizer.get_feature_names()


In [27]:
# Cross-validated search for the SVM regularisation strength:
# shuffled 5-fold splits, candidate values C = 10^-5, 10^-4, ..., 10^5,
# scored by accuracy. Seeds are fixed so the run is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': 10.0 ** np.arange(-5, 6)}
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [31]:
# Print the mean cross-validated accuracy followed by the parameter setting,
# one candidate after another in grid order.
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` (available since 0.18) exposes the same numbers.
results = gs.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(mean_score)
    print(params)

0.552631578947
{'C': 1.0000000000000001e-05}
0.552631578947
{'C': 0.0001}
0.552631578947
{'C': 0.001}
0.552631578947
{'C': 0.01}
0.950167973124
{'C': 0.10000000000000001}
0.993281075028
{'C': 1.0}
0.993281075028
{'C': 10.0}
0.993281075028
{'C': 100.0}
0.993281075028
{'C': 1000.0}
0.993281075028
{'C': 10000.0}
0.993281075028
{'C': 100000.0}


In [33]:
# Train the final linear SVM on the whole data set with the C chosen by the
# grid search. Reading it from `gs.best_params_` (instead of copying the value
# by hand from the printed scores) keeps this cell correct if the data, grid
# or library version changes. On ties GridSearchCV keeps the first
# best-scoring candidate in grid order.
clf = SVC(C=gs.best_params_['C'], kernel='linear', random_state=241)
clf.fit(X, y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
# Rank features by the magnitude of their weight in the fitted linear SVM.
# The model was fitted on sparse input, so coef_ is converted to a dense
# array and flattened to one dimension first. argsort is ascending, so the
# last ten positions hold the indices of the ten largest absolute weights.
weights = np.abs(clf.coef_.toarray()).ravel()
A = np.argsort(weights)[-10:]
A

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871, 24019], dtype=int64)

In [53]:
# Translate the selected feature indices in A into their tokens and print them.
# The vocabulary list is built once, outside the loop: the original rebuilt
# the full list on every iteration. get_feature_names() was removed in
# scikit-learn 1.2, so its replacement is preferred when present; .tolist()
# yields plain Python strings so that B prints and sorts like before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Iterate over A itself rather than a hard-coded range(10), so the loop
# stays in step with however many indices were selected.
B = [feature_names[idx] for idx in A]
for word in B:
    print(word)

sci
keith
bible
religion
sky
moon
atheists
atheism
god
space


In [58]:
# Put the selected words into alphabetical order and show them as one list.
B = sorted(B)
print(B)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']
