In [33]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = 'all'

Notes following the scikit-learn tutorial "Working With Text Data": https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

## Extracting features from texts

### Bag of words

- `X[i, j]`: `i` indexes the document, `j` indexes the feature (word)
- Most entries are zero, so `X` is stored as a sparse matrix (`scipy.sparse`)

e.g. 100,000 documents × 10,000 distinct words, stored densely as float32 (4 bytes per entry):

100,000 × 10,000 × 4 bytes = 4 × 10⁹ bytes ≈ 3.7 GiB

In [43]:
# Dense size in GiB: 100,000 documents x 10,000 words x 4 bytes (float32).
1e5 * 1e4 * 4 / 2 ** 30

3.725290298461914

---

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [8]:
# Work with four of the twenty newsgroups to keep the example small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only; a fixed seed keeps the shuffled order reproducible.
news = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [19]:
# List the attributes exposed by the returned dataset object.
dir(news)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [20]:
# Check the concrete type of the dataset container.
news.__class__

sklearn.utils.Bunch

In [26]:
# print(news.DESCR)

In [34]:
# Paths of the raw message files on disk, followed by how many there are.
news.filenames
news.filenames.shape

array(['/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38440',
       '/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38479',
       '/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20737',
       ...,
       '/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58112',
       '/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58578',
       '/Users/jiaru2014/scikit_learn_data/20news_home/20news-bydate-train/sci.med/58895'],
      dtype='<U97')

(2257,)

In [35]:
# Category names; the integer labels in `news.target` index into this list.
news.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [37]:
# Integer class label of each document, and the number of labels.
news.target
news.target.shape

array([1, 1, 3, ..., 2, 2, 2])

(2257,)

In [39]:
# Number of documents, then the raw text of the first one (mail headers included).
len(news.data)
news.data[0]

2257

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [40]:
# print() renders the embedded newlines, so the message reads as it was posted.
print(news.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



### CountVectorizer

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

In [124]:
# Learn the vocabulary from the training texts and build the document-by-word
# count matrix in a single pass.
cm = CountVectorizer()
doc_word_mtx = cm.fit_transform(news.data)

In [125]:
# Inspect the count matrix: its type, repr, shape, and a small dense corner.
doc_word_mtx.__class__   # scipy.sparse
doc_word_mtx
doc_word_mtx.shape
# Only a 3x5 slice is densified; converting the full matrix would be wasteful.
doc_word_mtx[:3, :5].toarray()

scipy.sparse.csr.csr_matrix

<2257x35788 sparse matrix of type '<class 'numpy.int64'>'
	with 365886 stored elements in Compressed Sparse Row format>

(2257, 35788)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

### TfidfVectorizer: from occurrences to frequencies

tf-idf:  **Term Frequency** times inverse-**Document Frequency**

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [127]:
# NOTE: this rebinds `doc_word_mtx`; from here on it holds the TfidfVectorizer
# output rather than the raw counts produced by CountVectorizer.
tm = TfidfVectorizer()
doc_word_mtx = tm.fit_transform(news.data)

In [128]:
# Inspect the tf-idf matrix: its type, repr, shape, and the bottom-right 5x5 corner.
doc_word_mtx.__class__
doc_word_mtx
doc_word_mtx.shape
doc_word_mtx[-5:, -5:].toarray()

scipy.sparse.csr.csr_matrix

<2257x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 365886 stored elements in Compressed Sparse Row format>

(2257, 35788)

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [129]:
# Show the last five vocabulary entries (the words behind the last five columns).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`, so use whichever this installation has.
if hasattr(tm, 'get_feature_names_out'):
    # The newer API returns an ndarray; convert it so the output is still a list of str.
    feature_names = tm.get_feature_names_out().tolist()
else:
    feature_names = tm.get_feature_names()
feature_names[-5:]

['zyxel', 'zz', 'zzz', 'ªl', 'íålittin']

In [130]:
# Two unseen sentences used to sanity-check the fitted vectorizer.
new_corpus = ['God is love', 'OpenGL on the GPU is fast']

# transform (not fit_transform): reuse what `tm` already learned from the training texts.
new_X = tm.transform(new_corpus)

In [131]:
# `new_corpus` holds only two documents, so `[:10]` simply returns both rows.
new_X[:10].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Classification: Multinomial Naive Bayes vs. SGDClassifier

In [170]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [171]:
# Features: the current `doc_word_mtx`; targets: the integer newsgroup labels.
X, y = doc_word_mtx, news.target

In [175]:
# Only one estimator is bound to `clf` at a time. Uncomment the next line (and
# comment out the SGDClassifier line) to train Multinomial Naive Bayes instead.
# clf = MultinomialNB()
clf = SGDClassifier(tol=None)
clf.fit(X, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [176]:
# Predicted class indices for the two new sentences (indices into `news.target_names`).
clf.predict(new_X)

array([3, 1])

## Using a Pipeline

In [177]:
from sklearn.pipeline import Pipeline

In [212]:
# The cells above import only TfidfVectorizer; TfidfTransformer has to be
# imported as well, otherwise this cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfTransformer

# Raw text -> token counts -> tf-idf weights -> linear classifier.
# The step names double as prefixes for parameters in grid search
# (e.g. 'TFIDF__use_idf').
steps = [
    ('CountVector', CountVectorizer()),
    ('TFIDF', TfidfTransformer(use_idf=True)),
    #('naive bayes classification', MultinomialNB(alpha=1.0)),
    ('SGDClf', SGDClassifier(tol=None)),
]

text_clf = Pipeline(steps)

In [213]:
# Fit the whole pipeline on the raw training texts; vectorisation happens inside it.
text_clf.fit(news.data, news.target)



Pipeline(memory=None,
     steps=[('CountVector', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [214]:
# Held-out split, restricted to the same four categories as the training data.
news_test = fetch_20newsgroups(categories=categories, subset='test')

# The pipeline takes raw text, so the test documents are passed in unvectorised.
y_hat = text_clf.predict(news_test.data)

In [215]:
import numpy as np

# Accuracy: the share of test documents whose prediction matches the true label.
y_true = news_test.target
np.mean(np.equal(y_true, y_hat))

0.9267643142476698

In [216]:
from sklearn.metrics import classification_report, confusion_matrix

In [217]:
# Per-class precision / recall / F1, labelled with the newsgroup names.
report_txt = classification_report(y_true, y_hat, target_names=news_test.target_names)
# The report is a pre-formatted string, so print it to keep the column alignment.
print(report_txt)

                        precision    recall  f1-score   support

           alt.atheism       0.94      0.85      0.89       319
         comp.graphics       0.91      0.97      0.94       389
               sci.med       0.96      0.91      0.93       396
soc.religion.christian       0.91      0.96      0.93       398

             micro avg       0.93      0.93      0.93      1502
             macro avg       0.93      0.92      0.92      1502
          weighted avg       0.93      0.93      0.93      1502



In [218]:
# Rows are true classes and columns are predicted classes; each row of the
# recorded output sums to that class's support in the report above.
confusion_matrix(y_true, y_hat)

array([[271,   6,  10,  32],
       [  5, 376,   4,   4],
       [  6,  24, 362,   4],
       [  5,   7,   3, 383]])

In [219]:
from sklearn.model_selection import GridSearchCV

In [222]:
# Show the (name, estimator) pairs; the names are the prefixes used in the
# grid-search parameter keys below.
text_clf.steps

[('CountVector',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('TFIDF',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('SGDClf', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
         early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
         l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
         n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
         power_t=0.5, random_state=None, shuffle=True, tol=None,
         validation_fraction=0.1, verbose=0, warm_start=False))]

In [228]:
# Keys use the `<step name>__<parameter>` form so that each candidate value is
# applied to the matching step of the pipeline.
grid_params = {
    'SGDClf__alpha': [0.0001, 0.0005],
    'TFIDF__use_idf': [True, False],
}

# 2 x 2 candidates, each scored with 5-fold cross-validation.
grid = GridSearchCV(estimator=text_clf, param_grid=grid_params, cv=5)

In [229]:
# Run the cross-validated search over the raw training texts and labels.
grid.fit(news.data, news.target)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('CountVector', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'SGDClf__alpha': [0.0001, 0.0005], 'TFIDF__use_idf': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [232]:
import pandas as pd
# Tabulate the cross-validation results: one row per parameter combination.
pd.DataFrame(grid.cv_results_)

  return f(*args, **kwds)
  return f(*args, **kwds)


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_SGDClf__alpha,param_TFIDF__use_idf,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.56899,0.125732,0.970758,0.999668,0.0001,True,"{'SGDClf__alpha': 0.0001, 'TFIDF__use_idf': True}",2,0.971239,0.999446,...,0.964602,1.0,0.969027,0.999446,0.975501,1.0,0.008634,0.00995,0.003764,0.000271
1,0.534806,0.119443,0.942844,0.995347,0.0001,False,"{'SGDClf__alpha': 0.0001, 'TFIDF__use_idf': Fa...",3,0.931416,0.997784,...,0.946903,0.997784,0.95354,0.998892,0.953229,0.998341,0.006565,0.006472,0.01054,0.005721
2,0.571616,0.133649,0.972973,0.999114,0.0005,True,"{'SGDClf__alpha': 0.0005, 'TFIDF__use_idf': True}",1,0.969027,0.998892,...,0.969027,0.999446,0.971239,0.998892,0.977728,0.998894,0.027394,0.005027,0.004019,0.000271
3,0.569152,0.131276,0.9393,0.985713,0.0005,False,"{'SGDClf__alpha': 0.0005, 'TFIDF__use_idf': Fa...",4,0.955752,0.992798,...,0.929204,0.987812,0.940265,0.988366,0.937639,0.979535,0.010671,0.006698,0.009044,0.005134
