<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/naive_bayes_20_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# text classification reuters dataset
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True)
print(dir(twenty_train))
pprint(twenty_train.target_names) #prints all the categories
pprint(twenty_train.data[0])

['DESCR', 'data', 'filenames', 'target', 'target_names']
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
('I was wondering if anyone out there could enlighten me on this car I saw\n'
 'the other day. It was a 2-door sports car, looked to be from the late 60s/\n'
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition,\n'
 'the front bumper was separate from the rest of the body. This is \n'
 'all I know. If anyone can tellme a model name, engine specs, years\n'
 'of production, where this car is made, history, or whatever info you\n'
 'have on this funky looking car, please e-mail.')


In [None]:
print(twenty_train.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [2]:
import numpy as np

unique, counts = np.unique(twenty_train.target, return_counts=True)
print(unique)
print(counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[480 584 591 590 578 593 585 594 598 597 600 595 591 594 593 599 546 564
 465 377]


In [7]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def class_results(predicted, true, labels):

  print("Precision, recall, f_score")
  pp.pprint(precision_recall_fscore_support(true,predicted, average='macro'))
  pp.pprint(precision_recall_fscore_support(true, predicted, average='micro'))

  # confusion matrix

  #pp.pprint(confusion_matrix(predicted_train_counts, twenty_train.target))
  #pp.pprint(confusion_matrix(predicted_test_counts, twenty_test.target))

  # classification report
  print('Classification report')
  pp.pprint(classification_report(true, predicted,  target_names = labels))

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

count_vect = CountVectorizer(stop_words = 'english',lowercase = True)# min_df = 30, max_df = 0.5)
train_counts = count_vect.fit_transform(twenty_train.data)
print(train_counts.shape)

twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True)
test_counts = count_vect.transform(twenty_test.data)
print(test_counts.shape)

(11314, 101322)
(7532, 101322)


In [62]:
# MultinomialNB = Naive Bayes for more than 2 classes.
import pprint as pp
from sklearn.naive_bayes import MultinomialNB
kb_class_counts = MultinomialNB(alpha = 0.01).fit(train_counts, twenty_train.target)  # uses tf-idf
#print(dir(kb_class))
# test it
predicted_train = kb_class_counts.predict(train_counts)
predicted_test = kb_class_counts.predict(test_counts)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))

class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.8962347534028637
0.6526818906001062
Precision, recall, f_score
(0.6537765899741197, 0.6420815207074401, 0.6268816896587931, None)
(0.6526818906001062, 0.6526818906001062, 0.6526818906001062, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.50      0.52      0.51       319\n'
 '           comp.graphics       0.57      0.71      0.63       389\n'
 ' comp.os.ms-windows.misc       0.71      0.01      0.02       394\n'
 'comp.sys.ibm.pc.hardware       0.52      0.71      0.60       392\n'
 '   comp.sys.mac.hardware       0.59      0.70      0.64       385\n'
 '          comp.windows.x       0.75      0.73      0.74       395\n'
 '            misc.forsale       0.82      0.69      0.75       390\n'
 '               rec.autos       0.69      0.72      0.71       396\n'
 '         rec.motorcycles       0.72      0.72      0.72       398\n'
 '      rec.sport.baseball       0.90      0.79      0.84      

In [63]:
# with or without this
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(norm = 'l2', sublinear_tf = True)
train_tfidf = tfidf_transformer.fit_transform(train_counts)
print(train_tfidf.shape)

test_tfidf = tfidf_transformer.transform(test_counts)
print(test_tfidf.shape)

(11314, 101322)
(7532, 101322)


In [64]:
# MultinomialNB = Naive Bayes for more than 2 classes.
import pprint as pp
from sklearn.naive_bayes import MultinomialNB
# alpha = smoothing coefficient (we talked about alpha = 1 or add one smoothing)
kb_class_tfidf = MultinomialNB(alpha = 0.2).fit(train_tfidf, twenty_train.target)  # uses tf-idf
#kb_class = MultinomialNB().fit(train_counts, twenty_train.target) # uses counts
#print(dir(kb_class))
predicted_train = kb_class_tfidf.predict(train_counts)
predicted_test  = kb_class_tfidf.predict(test_counts)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))

class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)


0.9074597843379884
0.674057355284121
Precision, recall, f_score
(0.7099654944796506, 0.6547136522853936, 0.6471363909025288, None)
(0.674057355284121, 0.674057355284121, 0.674057355284121, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.70      0.22      0.33       319\n'
 '           comp.graphics       0.65      0.68      0.66       389\n'
 ' comp.os.ms-windows.misc       0.69      0.53      0.60       394\n'
 'comp.sys.ibm.pc.hardware       0.58      0.74      0.65       392\n'
 '   comp.sys.mac.hardware       0.76      0.66      0.71       385\n'
 '          comp.windows.x       0.78      0.76      0.77       395\n'
 '            misc.forsale       0.80      0.76      0.78       390\n'
 '               rec.autos       0.81      0.72      0.76       396\n'
 '         rec.motorcycles       0.86      0.70      0.77       398\n'
 '      rec.sport.baseball       0.93      0.79      0.85       397

In [None]:
# if your dataset is not balanced (a class with a lot more points than another) then look into the
# ComplementNB classifier

In [65]:
# MultinomialNB = Naive Bayes for more than 2 classes.
import pprint as pp
from sklearn.naive_bayes import ComplementNB
kb_complement_counts = ComplementNB(alpha = 0.5).fit(train_counts, twenty_train.target)  # uses tf-idf
#print(dir(kb_class))
# test it
predicted_train = kb_complement_counts.predict(train_counts)
predicted_test = kb_complement_counts.predict(test_counts)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))

class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.876789817924695
0.6745884227296867
Precision, recall, f_score
(0.6720382562599662, 0.6594521328127686, 0.6453624273921665, None)
(0.6745884227296867, 0.6745884227296867, 0.6745884227296867, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.32      0.44      0.37       319\n'
 '           comp.graphics       0.69      0.72      0.70       389\n'
 ' comp.os.ms-windows.misc       0.67      0.10      0.18       394\n'
 'comp.sys.ibm.pc.hardware       0.55      0.72      0.62       392\n'
 '   comp.sys.mac.hardware       0.78      0.68      0.73       385\n'
 '          comp.windows.x       0.60      0.84      0.70       395\n'
 '            misc.forsale       0.77      0.65      0.71       390\n'
 '               rec.autos       0.84      0.73      0.78       396\n'
 '         rec.motorcycles       0.86      0.73      0.79       398\n'
 '      rec.sport.baseball       0.92      0.82      0.87       

In [66]:
# MultinomialNB = Naive Bayes for more than 2 classes.
from sklearn.naive_bayes import ComplementNB
# alpha = smoothing coefficient (we talked about alpha = 1 or add one smoothing)
kb_complement_tfidf = ComplementNB(alpha = 0.2).fit(train_tfidf, twenty_train.target)  # uses tf-idf
#kb_class = MultinomialNB().fit(train_counts, twenty_train.target) # uses counts
#print(dir(kb_class))

predicted_train = kb_complement_tfidf.predict(train_counts)
predicted_test = kb_complement_tfidf.predict(test_counts)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))

class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)


0.9267279476754463
0.7177376526818906
Precision, recall, f_score
(0.7176054123310702, 0.701438580726102, 0.6991196663416589, None)
(0.7177376526818906, 0.7177376526818906, 0.7177376526818906, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.31      0.43      0.36       319\n'
 '           comp.graphics       0.73      0.72      0.72       389\n'
 ' comp.os.ms-windows.misc       0.71      0.60      0.65       394\n'
 'comp.sys.ibm.pc.hardware       0.62      0.71      0.66       392\n'
 '   comp.sys.mac.hardware       0.77      0.73      0.75       385\n'
 '          comp.windows.x       0.82      0.80      0.81       395\n'
 '            misc.forsale       0.75      0.74      0.74       390\n'
 '               rec.autos       0.81      0.75      0.78       396\n'
 '         rec.motorcycles       0.83      0.76      0.79       398\n'
 '      rec.sport.baseball       0.92      0.84      0.88       

In [44]:
# simple knn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors= 13, algorithm='auto', p=2, metric='cosine')
knn.fit(train_tfidf, twenty_train.target)

predicted_test = knn.predict(test_tfidf)
predicted_train = knn.predict(train_tfidf)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.7852218490365919
0.6480350504514073
Precision, recall, f_score
(0.6522904910494797, 0.6358496206893856, 0.6356622670747122, None)
(0.6480350504514073, 0.6480350504514073, 0.6480350504514073, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.43      0.50      0.46       319\n'
 '           comp.graphics       0.56      0.69      0.62       389\n'
 ' comp.os.ms-windows.misc       0.54      0.66      0.59       394\n'
 'comp.sys.ibm.pc.hardware       0.56      0.68      0.62       392\n'
 '   comp.sys.mac.hardware       0.63      0.63      0.63       385\n'
 '          comp.windows.x       0.76      0.72      0.74       395\n'
 '            misc.forsale       0.46      0.71      0.56       390\n'
 '               rec.autos       0.63      0.70      0.67       396\n'
 '         rec.motorcycles       0.68      0.70      0.69       398\n'
 '      rec.sport.baseball       0.79      0.81      0.80      

In [None]:
from sklearn.SVM import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors= 13, algorithm='auto', p=2, metric='cosine')
knn.fit(train_tfidf, twenty_train.target)

predicted_test = knn.predict(test_tfidf)
predicted_train = knn.predict(train_tfidf)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

In [5]:
!pip install faiss-cpu --no-cache
#!pip install faiss-cpu -c pytorch #!pip install swigfaiss

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m212.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [67]:
# knn with faiss
import faiss                   # make faiss available
#from faiss import swigfaiss
dim = train_tfidf.shape[1]
print(dim)
index = faiss.IndexFlatIP(dim) # indexFlat = pairwise comparison, no speed up
print(train_tfidf.toarray().dtype) # it has to be float32
index.add(train_tfidf.toarray().astype(np.float32))

101322
float64


In [68]:
k = 5                          # we want 5 similar vectors to the test vectors
dist_knn, index_knn = index.search(test_tfidf.toarray().astype(np.float32), k)     # actual search
print(type(dist_knn), dist_knn.shape)
print(type(index_knn), index_knn.shape)


KeyboardInterrupt: 

In [12]:
print(dist_knn[:5,:]) # shortest k = 5 distances to each test case
print(index_knn[:5,:]) # indexes to closest k = 5 training tf-idf vectors

[[0.2457444  0.17160115 0.16329828 0.15972638 0.14375842]
 [0.34550226 0.21668756 0.20922291 0.19734934 0.19083253]
 [0.7409506  0.40241027 0.39349103 0.39083683 0.33061358]
 [0.18521333 0.1715529  0.17121649 0.1706531  0.16718757]
 [0.26313308 0.23956817 0.23420517 0.2257334  0.21672462]]


In [None]:
ind = 1 # test the search on a test index - see the results
cat_names = twenty_test.target_names
print(twenty_test.data[ind])
print(cat_names[twenty_test.target[ind]])
for i in index_knn[ind,:]:
  print(cat_names[twenty_train.target[i]])
for i in index_knn[ind,:]:
  print(i, twenty_train.data[i])

I'm not familiar at all with the format of these "X-Face:" thingies, but
after seeing them in some folks' headers, I've *got* to *see* them (and
maybe make one of my own)!

I've got "dpg-view" on my Linux box (which displays "uncompressed X-Faces")
and I've managed to compile [un]compface too... but now that I'm *looking*
for them, I can't seem to find any X-Face:'s in anyones news headers!  :-(

Could you, would you, please send me your "X-Face:" header?

I *know* I'll probably get a little swamped, but I can handle it.

	...I hope.
comp.windows.x
comp.sys.mac.hardware
rec.sport.baseball
comp.graphics
sci.space
comp.sys.mac.hardware
155 
164 
171 
205 
12 --



In [None]:
twenty_train.data[100]

"1.  Software publishing SuperBase 4 windows v.1.3           --->$80\n\n2.  OCR System ReadRight v.3.1 for Windows                  --->$65\n\n3.  OCR System ReadRight  v.2.01 for DOS                    --->$65\n\n4.  Unregistered Zortech 32 bit C++ Compiler v.3.1          --->$ 250\n     with Multiscope windows Debugger,\n     WhiteWater Resource Toolkit, Library Source Code\n\n5.  Glockenspiel/ImageSoft Commonview 2 Windows\n     Applications Framework for Borland C++                 --->$70\n\n6.  Spontaneous Assembly Library With Source Code           --->$50\n\n7.  Microsoft Macro Assembly 6.0                            --->$50\n\n8.  Microsoft Windows v.3.1 SDK Documentation               --->$125\n\n9.  Microsoft FoxPro V.2.0                                  --->$75\n\n10.  WordPerfect 5.0 Developer's Toolkit                    --->$20\n\n11.  Kedwell Software DataBoss v.3.5 C Code Generator       --->$100\n\n12.  Kedwell InstallBoss v.2.0 Installation Generator       --->$35\n\

In [49]:
# do feature selection using Chi-square
# eliminate features which are independent of the target class
# those which have a large chi-square value
from sklearn.feature_selection import chi2
chi_val, p_val = chi2(train_counts, twenty_train.target)

In [27]:
chi_val[:20]

array([5739.50881601, 1017.63016886,  839.18581504, 1041.13858313,
       1125.1865885 , 1130.64644404,  308.56118279,  521.45555446,
        126.8534625 ,  135.32908213,  146.74191104, 1637.18283841,
        183.24045195,   41.47965983,  123.63172214,  163.19352815,
        139.78935595,  170.82728135,  108.65450313,  116.23851667])

In [28]:
p_val[:20]

array([0.00000000e+000, 9.13829195e-204, 9.98638292e-166, 8.71404202e-209,
       9.45098586e-227, 6.42276191e-228, 3.50965061e-054, 1.74856175e-098,
       5.72977024e-018, 1.42037999e-019, 9.29753578e-022, 0.00000000e+000,
       7.11081252e-029, 2.08233860e-003, 2.31445338e-017, 6.06299377e-025,
       2.00289560e-020, 1.95710068e-026, 1.41005329e-014, 5.57792884e-016])

In [50]:
ind_small_pval = p_val < 0.05
print(sum(ind_small_pval))

34457


In [51]:
# extract only those terms and do the classification
#chi2_terms = train_tfidf(ind_small_pval)
import pprint as pp
from sklearn.naive_bayes import MultinomialNB

kb_chi2 = MultinomialNB().fit(train_counts[:,ind_small_pval], twenty_train.target)


predicted = kb_chi2.predict(test_counts[:,ind_small_pval])
print(np.mean(predicted == twenty_test.target))
predicted_test = kb_chi2.predict(test_counts[:,ind_small_pval])
predicted_train = kb_chi2.predict(train_counts[:,ind_small_pval])
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)


0.6510886882634095
0.808202227328973
0.6510886882634095
Precision, recall, f_score
(0.6396050095588381, 0.6373058432657016, 0.6185782156458735, None)
(0.6510886882634095, 0.6510886882634095, 0.6510886882634095, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.56      0.46      0.51       319\n'
 '           comp.graphics       0.54      0.72      0.62       389\n'
 ' comp.os.ms-windows.misc       0.33      0.00      0.01       394\n'
 'comp.sys.ibm.pc.hardware       0.51      0.73      0.60       392\n'
 '   comp.sys.mac.hardware       0.65      0.63      0.64       385\n'
 '          comp.windows.x       0.66      0.75      0.70       395\n'
 '            misc.forsale       0.83      0.72      0.77       390\n'
 '               rec.autos       0.78      0.72      0.75       396\n'
 '         rec.motorcycles       0.86      0.70      0.78       398\n'
 '      rec.sport.baseball       0.92      0.

In [69]:
from sklearn.feature_selection import f_classif
f_val, p_val = f_classif(train_counts, twenty_train.target)
ind_small_pval = p_val < 0.05
print(sum(ind_small_pval))

23672


In [70]:
# extract only those terms and do the classification
#chi2_terms = train_tfidf(ind_small_pval)
import pprint as pp
from sklearn.naive_bayes import MultinomialNB

kb_anova = MultinomialNB().fit(train_counts[:,ind_small_pval], twenty_train.target)


predicted = kb_anova.predict(test_counts[:,ind_small_pval])
print(np.mean(predicted == twenty_test.target))
predicted_test = kb_anova.predict(test_counts[:,ind_small_pval])
predicted_train = kb_anova.predict(train_counts[:,ind_small_pval])
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.6500265533722783
0.7994520063637971
0.6500265533722783
Precision, recall, f_score
(0.6395324413538426, 0.6371040933822892, 0.6203169222997431, None)
(0.6500265533722783, 0.6500265533722783, 0.6500265533722783, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.51      0.47      0.49       319\n'
 '           comp.graphics       0.52      0.72      0.60       389\n'
 ' comp.os.ms-windows.misc       0.33      0.00      0.01       394\n'
 'comp.sys.ibm.pc.hardware       0.49      0.72      0.58       392\n'
 '   comp.sys.mac.hardware       0.66      0.64      0.65       385\n'
 '          comp.windows.x       0.70      0.72      0.71       395\n'
 '            misc.forsale       0.83      0.72      0.77       390\n'
 '               rec.autos       0.79      0.74      0.76       396\n'
 '         rec.motorcycles       0.84      0.72      0.78       398\n'
 '      rec.sport.baseball       0.90      0

In [25]:
# use knn - not better than NB
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors= 13, algorithm='auto', p=2, metric='cosine')
knn.fit(train_tfidf[:,ind_small_pval], twenty_train.target)
print(knn.score(test_tfidf[:,ind_small_pval], twenty_test.target))

0.5665161975570897


In [73]:
# try complement NB - for conditional probabilities
# uses the counts in all classes but one - less bias with # of training
# samples
from sklearn.naive_bayes import ComplementNB
cnb = ComplementNB().fit(train_tfidf[:,ind_small_pval], twenty_train.target)


predicted_test = cnb.predict(test_counts[:,ind_small_pval])
predicted_train= cnb.predict(train_counts[:,ind_small_pval])
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.8548700724765776
0.7052575677110993
Precision, recall, f_score
(0.7111216249299093, 0.6884398453291108, 0.6864621645659623, None)
(0.7052575677110993, 0.7052575677110993, 0.7052575677110993, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.28      0.40      0.33       319\n'
 '           comp.graphics       0.70      0.69      0.70       389\n'
 ' comp.os.ms-windows.misc       0.67      0.62      0.65       394\n'
 'comp.sys.ibm.pc.hardware       0.62      0.73      0.67       392\n'
 '   comp.sys.mac.hardware       0.77      0.71      0.74       385\n'
 '          comp.windows.x       0.80      0.77      0.79       395\n'
 '            misc.forsale       0.77      0.75      0.76       390\n'
 '               rec.autos       0.81      0.72      0.77       396\n'
 '         rec.motorcycles       0.86      0.77      0.81       398\n'
 '      rec.sport.baseball       0.90      0.82      0.86      

In [74]:
# SVM
from sklearn.svm import LinearSVC
svm = LinearSVC()
svm.fit(train_tfidf, twenty_train.target)


predicted_test = svm.predict(test_counts)
predicted_train = svm.predict(train_counts)
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.9650875022096518
0.6700743494423792
Precision, recall, f_score
(0.6687035142339159, 0.6581336204408201, 0.6586289009657954, None)
(0.6700743494423792, 0.6700743494423792, 0.6700743494423792, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.48      0.45      0.46       319\n'
 '           comp.graphics       0.64      0.68      0.66       389\n'
 ' comp.os.ms-windows.misc       0.60      0.64      0.62       394\n'
 'comp.sys.ibm.pc.hardware       0.56      0.64      0.60       392\n'
 '   comp.sys.mac.hardware       0.71      0.66      0.68       385\n'
 '          comp.windows.x       0.78      0.68      0.73       395\n'
 '            misc.forsale       0.72      0.79      0.76       390\n'
 '               rec.autos       0.76      0.69      0.72       396\n'
 '         rec.motorcycles       0.81      0.68      0.74       398\n'
 '      rec.sport.baseball       0.55      0.81      0.65      

In [75]:
from sklearn.svm import LinearSVC
svm = LinearSVC()
svm.fit(train_tfidf[:,ind_small_pval], twenty_train.target)


predicted_test = svm.predict(test_counts[:,ind_small_pval])
predicted_train= svm.predict(train_counts[:,ind_small_pval])
print(np.mean(predicted_train == twenty_train.target))
print(np.mean(predicted_test == twenty_test.target))
class_results(predicted = predicted_test, true = twenty_test.target, labels = twenty_train.target_names)

0.9553650344705674
0.6623738714816781
Precision, recall, f_score
(0.6602789519042865, 0.6503744645297267, 0.6508950185010659, None)
(0.6623738714816781, 0.6623738714816781, 0.6623738714816781, None)
Classification report
('                          precision    recall  f1-score   support\n'
 '\n'
 '             alt.atheism       0.48      0.45      0.46       319\n'
 '           comp.graphics       0.64      0.68      0.66       389\n'
 ' comp.os.ms-windows.misc       0.59      0.63      0.61       394\n'
 'comp.sys.ibm.pc.hardware       0.57      0.64      0.60       392\n'
 '   comp.sys.mac.hardware       0.70      0.67      0.68       385\n'
 '          comp.windows.x       0.77      0.66      0.71       395\n'
 '            misc.forsale       0.73      0.78      0.75       390\n'
 '               rec.autos       0.75      0.69      0.72       396\n'
 '         rec.motorcycles       0.81      0.68      0.73       398\n'
 '      rec.sport.baseball       0.54      0.80      0.64      