This notebook will be collected automatically at **6pm on Monday** from `/home/data_scientist/assignments/Week9` directory on the course JupyterHub server. If you work on this assignment on the course Jupyterhub server, just make sure that you save your work and instructors will pull your notebooks automatically after the deadline. If you work on this assignment locally, the only way to submit assignments is via Jupyterhub, and you have to place the notebook file in the correct directory with the correct file name before the deadline.

1. Make sure everything runs as expected. First, restart the kernel (in the menubar, select `Kernel` → `Restart`) and then run all cells (in the menubar, select `Cell` → `Run All`).
2. Make sure you fill in any place that says `YOUR CODE HERE`. Do not write your answer in anywhere else other than where it says `YOUR CODE HERE`. Anything you write anywhere else will be removed by the autograder.
3. Do not change the file path or the file name of this notebook.
4. Make sure that you save your work (in the menubar, select `File` → `Save and CheckPoint`)

# Problem 9.2. NLP: Topic Modeling.

In this problem, we explore the concept of topic modeling.

In [1]:
import numpy as np
import pandas as pd

from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import check_random_state
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from gensim.matutils import Sparse2Corpus
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

from nose.tools import assert_equal, assert_is_instance, assert_true
from numpy.testing import assert_array_equal, assert_array_almost_equal

We use the twenty newsgroup data.

In [2]:
from sklearn.datasets import fetch_20newsgroups

train = fetch_20newsgroups(
    data_home='/home/data_scientist/data/textdm', 
    subset='train',
    shuffle=True,
    random_state=check_random_state(0),
    remove=('headers', 'footers', 'quotes')
    )

test = fetch_20newsgroups(
    data_home='/home/data_scientist/data/textdm', 
    subset='test',
    shuffle=True,
    random_state=check_random_state(0),
    remove=('headers', 'footers', 'quotes')
    )

## Document term matrix

- Use TfidfVectorizer to create a document term matrix for both `train['data']` and `test['data']`.
- Use English stop words.
- Use unigrams and bigrams.
- Ignore terms that have a document frequency strictly lower than 2.
- Build a vocabulary that only consider the top 20,000 features ordered by term frequency across the corpus.

In [3]:
def get_document_term_matrix(train_data, test_data):
    '''
    Uses TfidfVectorizer to create a document term matrix for "train_data" and "test_data".
    
    Paramters
    ---------
    train_data: A list of strings
    test_data:A list of strings
    
    Returns
    -------
    A 3-tuple of (model, train_matrix, test_matrix).
    model: A TfidfVectorizer instance
    train_matrix: A scipy.csr_matrix
    test_matrix: A scipy.csr_matrix
    '''
    
    # YOUR CODE HERE
    model = TfidfVectorizer(stop_words = 'english',
                            ngram_range=(1, 2),
                            min_df=2,
                            max_features=20000)
                     
    train_matrix = model.fit_transform(train_data)
    test_matrix = model.transform(test_data)
    
    return model, train_matrix, test_matrix

In [4]:
cv, train_data, test_data = get_document_term_matrix(train['data'], test['data'])

In [5]:
assert_is_instance(cv, TfidfVectorizer)
assert_is_instance(train_data, csr_matrix)
assert_is_instance(test_data, csr_matrix)
assert_equal(cv.stop_words, 'english')
assert_equal(cv.ngram_range, (1, 2))
assert_equal(cv.min_df, 2)
assert_equal(cv.max_features, 20000)
assert_equal(train_data.data.size, 680499)
assert_array_almost_equal(
    train_data.data[:5],
    [0.04590546,  0.05614672,  0.05849851,  0.05614672,  0.06487626]
    )
assert_equal(test_data.data.size, 415292)
assert_array_almost_equal(
    test_data.data[:5],
    [0.16046961,  0.3429567 ,  0.2124038 ,  0.28698678,  0.22300288]
    )

## Non-negative matrix factorization

- Apply non-negative matrix factorization (NMF) to compute topics in `train_data`.
- Use 60 topics.
- Normalize the transformed data to have unit probability.

In [6]:
def apply_nmf(data, random_state):
    '''
    Applies non-negative matrix factorization (NMF) to compute topics.
    
    Parameters
    ----------
    data: A csr_matrix
    random_state: A RandomState instance for NMF
    
    Returns
    -------
    A tuple of (nmf, transformed_data)
    nmf: An sklearn.NMF instance
    transformed_data: A numpy.ndarray
    '''
    
    # YOUR CODE HERE

    nmf = NMF(n_components=60, max_iter=200, random_state=random_state).fit(data)
    td = nmf.transform(data)
    transformed_data = normalize(td, norm='l1', axis=1)
    
    return nmf, transformed_data

In [7]:
nmf, td_norm = apply_nmf(train_data, random_state=check_random_state(0))

# We use a DataFrame to simplify the collecting of the data for display.
df = pd.DataFrame(td_norm)
df.fillna(value=0, inplace=True)
df['label'] = pd.Series(train['target_names'], dtype="category")

df.groupby('label').mean()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alt.atheism,0.0,0.004216,0.0,0.120597,0.001416,0.010014,0.000491,0.0,0.0,0.0,...,0.0,0.0,0.001455,0.0,0.0,0.0,0.434272,0.0,0.003139,0.0
comp.graphics,0.037898,0.0,0.0,0.209269,0.0,0.0,0.0,0.00367,0.0,0.012231,...,0.0,0.0,0.0,0.0,0.034581,0.0,0.005427,0.003008,0.033312,0.019869
comp.os.ms-windows.misc,0.012951,0.207267,0.0,0.0,0.0,0.0,0.0,0.002893,0.057188,0.007003,...,0.0,0.015903,0.0,0.0,0.0,0.0,0.0,0.059286,0.0,0.0
comp.sys.ibm.pc.hardware,0.253397,0.015348,0.003114,0.0,0.241721,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.109778,0.0,0.0,0.03412,0.0,0.0,0.017412,0.0
comp.sys.mac.hardware,0.0,0.0,0.0,0.641944,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp.windows.x,0.002888,0.00413,0.0,0.0,0.048298,0.0,0.0,0.0,0.006685,0.0,...,0.003866,0.0,0.0,0.002701,0.0,0.013505,0.0,0.014806,0.367371,0.019263
misc.forsale,0.0,0.0,0.0,0.0,0.0,0.0,0.086419,0.0,0.007352,0.0,...,0.00567,0.04885,0.579381,0.0,0.0,0.007527,0.00942,0.0,0.083633,0.001053
rec.autos,0.044051,0.0,0.0,0.0,0.149592,0.0,0.0,0.03026,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rec.motorcycles,0.00088,0.0,0.004938,0.002977,0.001808,0.0,0.00366,0.0,0.0,0.0,...,0.0,0.0,0.067102,0.0,0.0,0.0,0.002896,0.0,0.104846,0.0
rec.sport.baseball,0.0,0.0,0.0,0.004005,0.0,0.0,0.009396,0.0,0.042552,0.012623,...,0.0,0.0,0.0,0.0,0.0,0.675968,0.0,0.0,0.0,0.001574


In [8]:
assert_is_instance(nmf, NMF)
assert_is_instance(td_norm, np.ndarray)
assert_equal(nmf.n_components, 60)
assert_equal(nmf.max_iter, 200)
assert_equal(td_norm.shape, (11314, 60))
assert_array_almost_equal(
    td_norm[0, :5],
    [0.        ,  0.00421649,  0.        ,  0.120597  ,  0.00141566]
    )
assert_array_almost_equal(
    td_norm[-1, -5:],
    [ 0.05955216,  0.        ,  0.00094186,  0.        ,  0.06290102]
    )

## Topic-based Classification

- Train a Random Forest classifier on the topics in the training data sample of the twenty newsgroup data set.
- Use default parameters for the random forest classifier. Don't forget to set the `random_state` parameter.
- Compute the topics, by using the previously created NMF model, for the test data and compute classifications from these topic models. 

In [9]:
def classify_topics(nmf, X_train, y_train, X_test, random_state):
    '''
    
    Paramters
    ---------
    nmf: An sklearn.NMF model.
    X_train: A numpy array.
    y_train: A numpy array.
    X_test: A scipy csr_matrix.
    random_state: A RandomState instance for Random Forest Classifier.
    
    Returns
    -------
    A tuple of (clf, y_pred)
    clf: A RandomForestClassifier instance.
    y_pred: A numpy array.
    '''
    
    # YOUR CODE HERE
    clf = RandomForestClassifier(random_state=random_state).fit(X_train, y_train)
    y_pred = clf.predict(nmf.transform(X_test))
    
    return clf, y_pred

The resulting classification report and confusion matrix are shown to demonstrate the quality of this classification method.

In [10]:
clf, ts_preds = classify_topics(
    nmf, nmf.transform(train_data), train['target'], test_data, check_random_state(0)
    )
print(classification_report(test['target'], ts_preds, target_names=test['target_names']))

                          precision    recall  f1-score   support

             alt.atheism       0.26      0.34      0.29       319
           comp.graphics       0.38      0.52      0.44       389
 comp.os.ms-windows.misc       0.45      0.47      0.46       394
comp.sys.ibm.pc.hardware       0.47      0.49      0.48       392
   comp.sys.mac.hardware       0.52      0.51      0.52       385
          comp.windows.x       0.62      0.57      0.59       395
            misc.forsale       0.73      0.69      0.71       390
               rec.autos       0.42      0.66      0.52       396
         rec.motorcycles       0.66      0.62      0.64       398
      rec.sport.baseball       0.52      0.52      0.52       397
        rec.sport.hockey       0.65      0.61      0.63       399
               sci.crypt       0.66      0.59      0.62       396
         sci.electronics       0.38      0.31      0.34       393
                 sci.med       0.61      0.56      0.58       396
         

In [11]:
assert_is_instance(clf, RandomForestClassifier)
assert_is_instance(ts_preds, np.ndarray)
assert_equal(len(ts_preds), len(test['target']))
assert_array_equal(ts_preds[:5], [8, 1, 16, 15, 6])
assert_array_equal(ts_preds[-5:], [7, 9, 3, 1, 9])

## Topic Modeling with Gensim

- Use the gensim library to perform topic modeling of the twenty newsgroup data. First transform a sparse matrix into a gensim corpus, and then construct a vocabulary dictionary. Finally, create a  Latent Dirichlet allocation (LDA) model with 20 topics for the newsgroup text, and return 5 most significant words for each topic.
- You should specify three parameters in `LdaModel()`: `corpus`, `id2word`, and `num_topics`. Use default values for all other paramters. Ignore any warnings about `passes` or `iterations`.

In [12]:
def get_topics(cv, train_data):
    '''
    Uses gensim to perform topic modeling.
    
    Paramters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.
    
    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    '''
    
    # YOUR CODE HERE
    td_gensim = Sparse2Corpus(train_data)
    
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)
    
    lda_gs = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    
    topics = lda_gs.top_topics(corpus=td_gensim, num_words=5)
    
    return topics

In [13]:
topics = get_topics(cv, train_data)

for idx, (lst, val) in enumerate(topics):
    print('Topic {0}'.format(idx))
    print(35*('-'))
    for i, z in lst:
        print('    {0:20s}: {1:5.4f}'.format(z, i))
    print(35*('-'))

Topic 0
-----------------------------------
    franchises          : 0.0009
    386sx               : 0.0009
    373                 : 0.0008
    clutch              : 0.0008
    announcing          : 0.0008
-----------------------------------
Topic 1
-----------------------------------
    macx                : 0.0034
    health insurance    : 0.0033
    graeme              : 0.0033
    jpwu45              : 0.0032
    coptic church       : 0.0032
-----------------------------------
Topic 2
-----------------------------------
    lineup              : 0.0073
    loosely             : 0.0057
    11th                : 0.0055
    0t                  : 0.0054
    cpsr                : 0.0052
-----------------------------------
Topic 3
-----------------------------------
    kingdom heaven      : 0.0017
    bread               : 0.0017
    9y                  : 0.0017
    anti semitic        : 0.0016
    claremont           : 0.0016
-----------------------------------
Topic 4
------------

In [14]:
assert_is_instance(topics, list)
assert_equal(len(topics), 20)

for topic, score in topics:
    assert_is_instance(topic, list)
    assert_is_instance(score, float)
    assert_equal(len(topic), 5)
    for v, k in topic:
        assert_is_instance(k, str)
        assert_is_instance(v, float)