In [5]:
import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [6]:
# Fetch the 20 Newsgroups data restricted to the two categories that the
# classifier will separate, requesting subset='all'.
selected_categories = ['alt.atheism', 'sci.space']
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=selected_categories)

# Display the fields available on the returned dataset object.
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
# Fit a TF-IDF vectorizer on the raw newsgroup texts and transform them in the
# same call. X is a sparse document-by-term matrix (1786 x 28382 in the
# recorded output).
# NOTE(review): the vectorizer is fitted on all documents before the
# cross-validation below, so its statistics also see the validation folds;
# confirm this is intended (only texts are used here, not labels).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
X

<1786x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [8]:
# Class labels, taken from the dataset's `target` field; the recorded output
# shows an integer array of 0s and 1s.
y = newsgroups.target
y

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [11]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=241)

# Candidate values for the SVC `C` parameter: the powers of ten 10^-5 ... 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Linear-kernel SVM with a fixed seed; displayed as the cell's output.
clf = SVC(kernel='linear', random_state=241)
clf

SVC(kernel='linear', random_state=241)

In [12]:
# Score every candidate C by cross-validated accuracy over the folds defined
# by `cv`; n_jobs=-1 requests parallel execution.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
gs.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             estimator=SVC(kernel='linear', random_state=241), n_jobs=-1,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             scoring='accuracy')

In [14]:
# Show the full per-candidate cross-validation results together with the
# parameter setting the grid search selected as best.
gs.cv_results_, gs.best_params_

({'mean_fit_time': array([3.08244991, 3.12326212, 3.24821668, 3.12564502, 2.64679885,
         1.62092957, 1.6146584 , 1.63785725, 1.63765383, 1.54781899,
         1.37443728]),
  'std_fit_time': array([0.02710916, 0.04318844, 0.04862886, 0.04265586, 0.03753966,
         0.02518633, 0.02408809, 0.0328495 , 0.02596519, 0.08893224,
         0.0423318 ]),
  'mean_score_time': array([0.72770343, 0.72138925, 0.72187824, 0.72894588, 0.6064992 ,
         0.35367451, 0.36341219, 0.36688194, 0.35966768, 0.31978855,
         0.27174053]),
  'std_score_time': array([0.01003025, 0.00884059, 0.0082745 , 0.01330377, 0.00645101,
         0.0090596 , 0.00947888, 0.02019623, 0.01468391, 0.02278867,
         0.01428819]),
  'param_C': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                     1000.0, 10000.0, 100000.0],
               mask=[False, False, False, False, False, False, False, False,
                     False, False, False],
         fill_value='?',
         

In [15]:
# Inspect the parameters currently set on `clf`; the recorded output shows
# that C is 1.0 at this point.
clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': 241,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
# Train the final model on the whole data set with the C selected by the grid
# search. `clf` itself still carries its constructor settings after the search
# (get_params() reports C=1.0), so the winning parameters are applied
# explicitly before fitting; otherwise the search result would go unused.
clf.set_params(**gs.best_params_)
clf.fit(X, y)

SVC(kernel='linear', random_state=241)

In [20]:
# Feature weights of the fitted linear SVM; the recorded output shows a
# 1 x 28382 sparse matrix, i.e. one column per column of X.
res_sparse = clf.coef_
res_sparse

<1x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 18404 stored elements in Compressed Sparse Row format>

In [26]:
# Convert the sparse weight matrix to dense form so it can be sorted below;
# display the values together with the shape (1 x 28382 in the recorded output).
res_dense = res_sparse.todense()
res_dense, res_dense.shape

(matrix([[ 0.29258057, -0.12314757,  0.        , ...,  0.01972862,
           0.05831336, -0.00297347]]),
 (1, 28382))

In [40]:
# Work on a plain ndarray view of the dense weights.
weights = np.asarray(res_dense)

# Rank features by absolute weight and keep the indices of the ten largest
# (argsort is ascending, so they are the last ten entries).
flat_magnitudes = np.abs(weights).ravel()
max_weights = flat_magnitudes.argsort()[-10:]
max_weights

array([22936, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871,
       24019], dtype=int64)

In [68]:
# Map the selected feature indices back to their vocabulary terms, print them
# in alphabetical order, and save them as one comma-separated line.

# Look the vocabulary up once instead of rebuilding it for every word.
# get_feature_names_out() is preferred; the fallback keeps the cell working on
# scikit-learn versions that only provide get_feature_names().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# str() normalizes numpy string scalars to plain Python strings.
result_list = sorted(str(feature_names[weight]) for weight in max_weights)

for word in result_list:
    print("%s"%word)

# NOTE(review): absolute, machine-specific output path; consider moving it to
# a configurable location. Kept unchanged here so the file lands where it did.
with open('D:/Work/Data_files/working_dir/task_2.txt', 'w') as f:
    # join() separates the words with commas without leaving a trailing comma
    # after the last one.
    f.write(",".join(result_list))

atheism
atheists
bible
god
keith
moon
religion
sci
sky
space
