In [0]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn import metrics

*Selecting oinly a set among the target labels to reduce computational time*

In [3]:
categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
#remove = ('headers', 'footers', 'quotes') uncomment and use this to remove metadata, which will be a realistic f1 score, less overfitting 
data_train = datasets.fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
data_test = datasets.fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
y_train, y_test = data_train.target, data_test.target
#print(data_train.DESCR)
print(data_train.target_names)
print(data_test.target_names)
print(data_train.data)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [0]:
target_labels =  data_train.target_names

*TFidfVectorizer : Equivalent to CountVectorizer followed by TfidfTransformer.*

In [5]:
vectorizer = TfidfVectorizer(analyzer='word', strip_accents=ascii,min_df=0.001, max_df=0.2, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)
feature_names = vectorizer.get_feature_names()
print(len(feature_names))
print(feature_names)

13448


*selecting k best features froim chi squared with k = 1000*

In [6]:
chi_2 = SelectKBest(chi2, 1000)
X_train = chi_2.fit_transform(X_train, y_train)
X_test = chi_2.transform(X_test)
if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in chi_2.get_support(indices=True)]
print(feature_names)

['1993apr3', '2000', '24', '249', '256', '2d', '3539', '386', '3d', '3do', '3ds', '42', '575', '602', '621', '666', '68070', '9615', '9630', '__', '___', '_____', 'abortion', 'ac', 'acad3', 'acc', 'accelerations', 'accept', 'access', 'acpub', 'act', 'actions', 'adams', 'added', 'advance', 'advertising', 'aerospace', 'agents', 'agr00', 'agree', 'air', 'aircraft', 'alaska', 'alexia', 'algorithm', 'algorithms', 'alicea', 'alink', 'alizard', 'allah', 'allan', 'allen', 'alphacdc', 'alt', 'amail', 'amdahl', 'amiga', 'amorc', 'andrew', 'anecdote', 'angels', 'animals', 'animation', 'anthony', 'anybody', 'ap', 'apple', 'appreciated', 'april', 'argue', 'argument', 'arguments', 'arizona', 'arromdee', 'aspects', 'ass', 'astro', 'astronaut', 'astronomy', 'atheism', 'atheist', 'atheists', 'ati', 'atmosphere', 'aucun', 'aurora', 'authority', 'aws', 'b645zaw', 'b64635', 'baalke', 'backing', 'bailey', 'bake', 'baptist', 'batf', 'batman', 'baube', 'bcci', 'bcstec', 'bd', 'beam', 'bears', 'beast', 'beauc

*Selecting k best features on Mutual information with k = 1000*

In [7]:
mu = SelectKBest(mutual_info_classif, 1000)
X_train = mu.fit_transform(X_train, y_train)
X_test = mu.transform(X_test)
if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in mu.get_support(indices=True)]
print(feature_names)

['1993apr3', '2000', '24', '249', '256', '2d', '3539', '386', '3d', '3do', '3ds', '42', '575', '602', '621', '666', '68070', '9615', '9630', '__', '___', '_____', 'abortion', 'ac', 'acad3', 'acc', 'accelerations', 'accept', 'access', 'acpub', 'act', 'actions', 'adams', 'added', 'advance', 'advertising', 'aerospace', 'agents', 'agr00', 'agree', 'air', 'aircraft', 'alaska', 'alexia', 'algorithm', 'algorithms', 'alicea', 'alink', 'alizard', 'allah', 'allan', 'allen', 'alphacdc', 'alt', 'amail', 'amdahl', 'amiga', 'amorc', 'andrew', 'anecdote', 'angels', 'animals', 'animation', 'anthony', 'anybody', 'ap', 'apple', 'appreciated', 'april', 'argue', 'argument', 'arguments', 'arizona', 'arromdee', 'aspects', 'ass', 'astro', 'astronaut', 'astronomy', 'atheism', 'atheist', 'atheists', 'ati', 'atmosphere', 'aucun', 'aurora', 'authority', 'aws', 'b645zaw', 'b64635', 'baalke', 'backing', 'bailey', 'bake', 'baptist', 'batf', 'batman', 'baube', 'bcci', 'bcstec', 'bd', 'beam', 'bears', 'beast', 'beauc

Truncated SVD

SkLearn Used Truncated SVD for performing LSI on sparse matrices


In [8]:
sc = MaxAbsScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
tsvd = TruncatedSVD(n_components=150)
X_train = tsvd.fit_transform(X_train)
X_test = tsvd.transform(X_test)
print(tsvd.explained_variance_ratio_)

[0.01787292 0.02543487 0.02298616 0.02123373 0.01823537 0.01754264
 0.01565367 0.01435762 0.01361761 0.01301996 0.0123856  0.01113323
 0.01002537 0.00972781 0.00882557 0.00854569 0.00842922 0.0082082
 0.00770861 0.00741217 0.00734608 0.00700608 0.00678543 0.0066479
 0.00622815 0.00621471 0.00597438 0.00580935 0.00571814 0.00565194
 0.00551084 0.00538846 0.00527728 0.00524181 0.00511803 0.00506992
 0.00494436 0.00477749 0.00472496 0.00462356 0.00447138 0.00441957
 0.00435839 0.00430242 0.0042652  0.00424652 0.00414769 0.00406382
 0.00402973 0.00392692 0.00388417 0.00387562 0.00380718 0.00379451
 0.00376261 0.00369022 0.00366383 0.00357562 0.00350171 0.00347701
 0.00341461 0.00337538 0.00332012 0.0032729  0.00326316 0.00318751
 0.00316694 0.00311692 0.00309395 0.00307103 0.00301512 0.002988
 0.00296463 0.00292299 0.00288181 0.0028623  0.00282283 0.00280071
 0.00278485 0.00273249 0.00272357 0.00268663 0.00266564 0.00265377
 0.00262515 0.00259869 0.00258446 0.00256538 0.00252912 0.00251571

A Classify Method

In [0]:
def classify(clf):
  print("-----------------")
  print(clf)
  clf.fit(X_train, y_train)
  pred = clf.predict(X_test)
  score = metrics.accuracy_score(y_test, pred)
  score1 = metrics.classification_report(y_test, pred)
  print(score1)
  print("accuracy:   %0.5f" % score)

In [10]:
clf0 = Perceptron(max_iter=100)
clf1 = svm.SVC(verbose=1)
clf1_1 = svm.NuSVC(verbose=1)
clf2 = RandomForestClassifier(verbose=1)

classify(clf0)
classify(clf1)
classify(clf2)


-----------------
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=100, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.84      0.71      0.77       319
           1       0.92      0.90      0.91       389
           2       0.91      0.91      0.91       394
           3       0.66      0.80      0.72       251

    accuracy                           0.84      1353
   macro avg       0.83      0.83      0.83      1353
weighted avg       0.85      0.84      0.84      1353

accuracy:   0.84109
-----------------
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           0       0.79      0.75      0.77       319
           1       0.88      0.90      0.89       389
           2       0.89      0.90      0.89       394
           3       0.71      0.72      0.71       251

    accuracy                           0.83      1353
   macro avg       0.82      0.82      0.82      1353
weighted avg       0.83      0.83      0.83      1353

accuracy:   0.83075


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
