In [1]:
import os

import numpy as np
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

# Fetch the 20 Newsgroups corpus, keeping only the two categories that are
# classified against each other below. subset='all' requests every available
# document rather than a single train or test split.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [3]:
# Target vector: the category label of each document.
y = newsgroups.target

# Fit the TF-IDF vectorizer on every document text and keep the resulting
# feature matrix. Only the raw texts are passed in; the labels are not used
# at this step.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(newsgroups.data)

In [5]:
# Linear-kernel SVM; its regularisation parameter C is what the search tunes.
cls = SVC(kernel='linear', random_state=241)

# Candidate C values: the eleven powers of ten from 1e-5 up to 1e5.
grid = {'C': np.power(10.0, np.arange(-5, 6))}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=241)

# Exhaustive search over the grid, scored by accuracy, using all CPU cores.
gridCV = GridSearchCV(
    estimator=cls,
    param_grid=grid,
    scoring='accuracy',
    cv=kfold,
    n_jobs=-1,
)

In [6]:
# Run the cross-validated grid search: every candidate C is evaluated on the
# folds configured above. As the last expression of the cell, the fitted
# search object is also what gets displayed.
gridCV.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.00000e-05,   1.00000e-04,   1.00000e-03,   1.00000e-02,
         1.00000e-01,   1.00000e+00,   1.00000e+01,   1.00000e+02,
         1.00000e+03,   1.00000e+04,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [8]:
# Print the mean cross-validated accuracy obtained for each candidate C.
# `cv_results_` is read instead of `grid_scores_`: the latter was deprecated
# in scikit-learn 0.18 and removed in 0.20, so it fails on current releases.
cv_results = gridCV.cv_results_
for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    print('C = {c}, score = {a}'.format(a=mean_score, c=params['C']))

C = 1e-05, score = 0.5526315789473685
C = 0.0001, score = 0.5526315789473685
C = 0.001, score = 0.5526315789473685
C = 0.01, score = 0.5526315789473685
C = 0.1, score = 0.9501679731243001
C = 1.0, score = 0.9932810750279956
C = 10.0, score = 0.9932810750279956
C = 100.0, score = 0.9932810750279956
C = 1000.0, score = 0.9932810750279956
C = 10000.0, score = 0.9932810750279956
C = 100000.0, score = 0.9932810750279956




In [9]:
# Take the best C straight from the fitted search instead of copying it by
# hand from the printed scores, so this cell stays correct if the data, grid
# or folds change. For the run recorded in this notebook this is expected to
# be 1.0 (several C values tie; the search appears to keep the first one --
# confirm against the scikit-learn version in use).
C = gridCV.best_params_['C']

# Refit a linear SVM with that C on the full data set; the fitted estimator
# is the cell's displayed value.
best_cls = SVC(C=C, random_state=241, kernel='linear')
best_cls.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Vocabulary terms of the fitted vectorizer, as an array so it can be indexed
# with an array of positions. `get_feature_names()` was removed in
# scikit-learn 1.2; use its replacement when available and fall back to the
# old method on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = np.array(tfidf.get_feature_names_out())
else:
    feature_names = np.array(tfidf.get_feature_names())

# Densify the coefficient matrix into a flat 1-D array of absolute weights,
# then keep the positions of the ten largest (argsort is ascending, so they
# are the last ten entries).
abs_weights = np.abs(best_cls.coef_.toarray()).ravel()
best_weights = np.argsort(abs_weights)[-10:]

# Write the ten corresponding words, alphabetically sorted and separated by
# spaces. The target directory is created first so that a fresh checkout
# does not fail on a missing folder.
output_path = 'output/week3/task2.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savetxt(output_path, sorted(feature_names[best_weights]), newline=' ', fmt='%s')

[17469 17422 16746 16935 17536  9015 17045 17449 18092 16693]
