In [7]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

In [2]:
# Fetch the 20 Newsgroups posts (subset='all'), restricted to the two topics
# that the classifier below has to tell apart.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [3]:
# Turn the raw posts into a sparse TF-IDF matrix; the class labels are taken
# directly from the dataset.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Column index -> term lookup. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of `get_feature_names_out()`,
# so use the new accessor when it exists and fall back to the old one otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_mapping = vectorizer.get_feature_names_out()
else:
    feature_mapping = vectorizer.get_feature_names()
print(X)

  (0, 12328)	0.0137045393774
  (0, 3083)	0.183545829049
  (0, 16212)	0.130870716213
  (0, 26748)	0.358641401851
  (0, 10446)	0.0877895428146
  (0, 5173)	0.147007441189
  (0, 25602)	0.113127021266
  (0, 9436)	0.160322175265
  (0, 6206)	0.183545829049
  (0, 24745)	0.0137045393774
  (0, 21441)	0.0319711500148
  (0, 15606)	0.179738178431
  (0, 22911)	0.105890061373
  (0, 24461)	0.0896603504628
  (0, 20381)	0.065773758988
  (0, 19110)	0.0143081539732
  (0, 8823)	0.0896603504628
  (0, 9768)	0.0973296270647
  (0, 16346)	0.0137275657721
  (0, 1668)	0.0532164165719
  (0, 14361)	0.0444677816001
  (0, 4890)	0.0212421598026
  (0, 1191)	0.106828889895
  (0, 12512)	0.0593325757435
  (0, 6741)	0.0958731456593
  :	:
  (1785, 8616)	0.0961857077738
  (1785, 11782)	0.055635809034
  (1785, 10058)	0.0742115230561
  (1785, 970)	0.041884777141
  (1785, 16405)	0.0559553567536
  (1785, 28298)	0.0641466708717
  (1785, 8301)	0.0655664206707
  (1785, 13477)	0.0690845681599
  (1785, 11783)	0.0462980883169
  (1785,

In [5]:
# Cross-validated search for the SVM regularisation strength C over the powers
# of ten 10^-5 .. 10^5, scored by accuracy on 5 shuffled folds.
clf = SVC(kernel='linear', random_state=241)
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': 10.0 ** np.arange(-5, 6)}
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [8]:
# Print the mean cross-validated accuracy followed by the parameter setting
# for every candidate C.
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` exposes the same per-candidate means and parameter dicts.
for score, params in zip(gs.cv_results_['mean_test_score'],
                         gs.cv_results_['params']):
    print(score)
    print(params)

0.552631578947
{'C': 1.0000000000000001e-05}
0.552631578947
{'C': 0.0001}
0.552631578947
{'C': 0.001}
0.552631578947
{'C': 0.01}
0.950167973124
{'C': 0.10000000000000001}
0.993281075028
{'C': 1.0}
0.993281075028
{'C': 10.0}
0.993281075028
{'C': 100.0}
0.993281075028
{'C': 1000.0}
0.993281075028
{'C': 10000.0}
0.993281075028
{'C': 100000.0}


In [9]:
# Refit a linear SVM on the whole dataset using the C selected by the grid
# search, rather than copying the winning value in by hand (which silently
# goes stale if the grid, the data or the folds change).
clf = SVC(kernel='linear', random_state=241, **gs.best_params_)
clf.fit(X, y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Indices of the ten coefficients with the largest absolute value in the
# fitted linear SVM, ordered by increasing |weight|.
abs_weights = np.abs(clf.coef_.toarray()).ravel()
A = np.argsort(abs_weights)[-10:]
print(A)

[22936 15606  5776 21850 23673 17802  5093  5088 12871 24019]


In [11]:
# Translate the selected column indices into their terms. The vocabulary is
# read from `feature_mapping` (computed once, right after vectorisation)
# instead of being rebuilt by `get_feature_names()` on every iteration, and
# the loop follows `A` itself rather than a hard-coded count of 10.
B = [feature_mapping[idx] for idx in A]
for word in B:
    print(word)

sci
keith
bible
religion
sky
moon
atheists
atheism
god
space


In [12]:
# Alphabetical listing of the selected terms.
B = sorted(B)
print(B)

['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'religion', 'sci', 'sky', 'space']
