In [1]:
#importing basic libraries
import pandas as pd
import numpy as np

In [2]:
#importing necessary libraries
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [3]:
# Fetch the training subset of the 20 newsgroups dataset (all categories,
# no `remove` filtering) so its contents can be inspected in the next cells.
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [4]:
# Pretty-print the category names exposed by the loaded training set.
from pprint import pprint
target_names = list(newsgroups_train.target_names)
pprint(target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
# Inspect the shape of the training set's `filenames` array.
newsgroups_train.filenames.shape

(11314,)

In [6]:
# Inspect the shape of the training set's `target` array (passed as the labels to `fit` later on).
newsgroups_train.target.shape

(11314,)

In [7]:
# Subset of eight newsgroups that the filtered train/test splits are restricted to.
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "talk.religion.misc",
]

In [8]:
# Reload the train and test splits restricted to `categories`, asking the
# loader to strip headers, footers and quotes through its `remove` option.
# Both splits share the same options so they are filtered identically.
fetch_options = dict(
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

In [9]:
# Report how many documents and categories the current training split holds.
n_documents = len(newsgroups_train.filenames)
n_categories = len(newsgroups_train.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

4378 documents
8 categories


In [10]:
# Two-step pipeline: hash raw text into a fixed 1000-feature vector, then
# classify with an SVC. The step names ('vectorizer', 'clf') are the prefixes
# used by the `clf__*` keys of the hyper-parameter grid.
text_vectorizer = HashingVectorizer(n_features=1000)
svm_classifier = SVC()
pipeline = Pipeline(steps=[
    ('vectorizer', text_vectorizer),
    ('clf', svm_classifier),
])

In [14]:
# Hyper-parameter grid for the 'clf' pipeline step, as two sub-grids:
#   * RBF kernel    - every (C, gamma) combination
#   * linear kernel - C only (no gamma is searched for this kernel)
c_values = [0.0001, 0.001, 0.01, 0.1, 1, 10]
gamma_values = [0.0001, 0.001, 0.01, 0.1, 1, 10]

rbf_grid = {
    'clf__kernel': ['rbf'],
    'clf__C': list(c_values),
    'clf__gamma': list(gamma_values),
}
linear_grid = {
    'clf__kernel': ['linear'],
    'clf__C': list(c_values),
}
param_grid = [rbf_grid, linear_grid]


In [15]:
# Exhaustive search over `param_grid` for the text-classification pipeline.
#   cv=3      - pinned explicitly. Leaving `cv` unset relies on a default that
#               differs between scikit-learn releases (3 folds with a
#               FutureWarning, later 5 folds), which would change both the
#               number of fits and the reported best score.
#   n_jobs=-1 - run the fits in parallel on all available cores.
#   verbose=1 - print progress while fitting.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

In [16]:
# Run the grid search on the raw training documents and their labels.
# The fit call is kept as the last expression so the cell still displays
# the fitted search object.
train_docs = newsgroups_train.data
train_labels = newsgroups_train.target
grid_search.fit(train_docs, train_labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed: 18.1min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1000, ngram_range=(1, 1), non_negative=False,
         norm='l2',...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'clf__kernel': ['rbf'], 'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10], 'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]}, {'clf__kernel': ['linear'], 'clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [18]:
# Report the search's best cross-validation score, its score on the training
# and test splits, and the macro-averaged F1 on the test split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

print("Best score: %0.3f" % grid_search.best_score_)
print("Training score: %0.3f" % grid_search.score(X_train, y_train))
print("Testing score: %0.3f" % grid_search.score(X_test, y_test))

test_predictions = grid_search.predict(X_test)
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("fscore: %0.3f" % macro_f1)

Best score: 0.621
Training score: 0.979
Testing score: 0.578
fscore: 0.571
