In [4]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Number of posts kept from the fetched corpus (used as dataset.data[:n_samples]).
n_samples = 2000
# Vocabulary size cap; passed as max_features to both vectorizers.
n_features = 1000
# Not referenced in the cells visible here; presumably the number of topics
# for the NMF / LDA models -- TODO confirm against the fitting cells.
n_components = 10
# Shares its name with the print_top_words parameter; presumably the value
# passed to it when topics are printed -- TODO confirm.
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of per-topic weight
        vectors that support ``argsort()``.
    feature_names : sequence
        Maps a column index of a weight vector to the term it stands for.
    n_top_words : int
        How many terms to show per topic.

    Each topic is printed on its own line as ``Topic #<idx>: t1 t2 ...``
    with terms ordered from largest to smallest weight; one empty line is
    printed after the last topic. Nothing is returned.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: walk it backwards from the end and stop
        # after n_top_words entries.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[column] for column in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()


In [5]:
# Fetch the 20 newsgroups corpus and keep its first n_samples posts.
# Useless text is filtered out early with a few heuristics: this cell asks
# for posts stripped of headers, footers and quoted replies; the removal of
# common English words and of words occurring in only one document or in at
# least 95% of the documents is configured on the vectorizers further down.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
    shuffle=True,
)
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))


Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


Loading dataset...
done in 40.464s.


In [7]:
# NMF is fed tf-idf weighted features.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',      # drop common English words
    min_df=2,                  # drop terms seen in fewer than 2 documents
    max_df=0.95,               # drop terms seen in more than 95% of documents
    max_features=n_features,   # cap the vocabulary size
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.684s.


In [14]:

# LDA is fed tf features, i.e. raw term counts.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',      # drop common English words
    min_df=2,                  # drop terms seen in fewer than 2 documents
    max_df=0.95,               # drop terms seen in more than 95% of documents
    max_features=n_features,   # cap the vocabulary size
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.611s.



In [17]:
# Two paired samples of integers, parsed from whitespace-separated literals.
x = [int(value) for value in "15  12  8   8   7   7   7   6   5   3".split()]
y = [int(value) for value in "10  25  17  11  13  17  20  13  9   15".split()]
# Materialise the (x, y) pairs as a list. A bare zip() is a one-shot iterator
# in Python 3: the first cell that consumes it (e.g. tuple(z)) would leave it
# empty, so re-running that cell or reusing z later would silently see no data.
z = list(zip(x, y))

In [18]:
# Display the (x, y) pairs held by z as a tuple of 2-tuples.
tuple(z)

((15, 10),
 (12, 25),
 (8, 17),
 (8, 11),
 (7, 13),
 (7, 17),
 (7, 20),
 (6, 13),
 (5, 9),
 (3, 15))

In [23]:
from scipy.stats import pearsonr
# pearsonr reports a correlation coefficient and a p-value; only the first
# item (the coefficient) is kept, rounded to three decimals for display.
print(round(pearsonr(x,y)[0],3))


0.145


In [21]:
# Interactive lookup: print the docstring of pearsonr.
help(pearsonr)

Help on function pearsonr in module scipy.stats.stats:

pearsonr(x, y)
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.
    
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed, and not necessarily zero-mean.
    Like other correlation coefficients, this one varies between -1 and +1
    with 0 implying no correlation. Correlations of -1 or +1 imply an exact
    linear relationship. Positive correlations imply that as x increases, so
    does y. Negative correlations imply that as x increases, y decreases.
    
    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

In [4]:
import nltk

def print_tree(tree, filename):
    '''
        Save a parsed NLTK tree to a PS file.

        A canvas frame is created, a tree widget for `tree` is placed on
        it at offset (10, 10), and the canvas is written to `filename`
        via print_to_file.

        :param tree: the tree object to draw (handed to nltk.draw.TreeWidget)
        :param filename: path of the file to write
        :return: None
    '''
    # create the canvas
    canvasFrame = nltk.draw.util.CanvasFrame()

    try:
        # create tree widget
        widget = nltk.draw.TreeWidget(canvasFrame.canvas(), tree)

        # add the widget to canvas
        canvasFrame.add_widget(widget, 10, 10)

        # save the file
        canvasFrame.print_to_file(filename)
    finally:
        # release the object even when drawing or saving fails, so a
        # failed call does not leave the frame alive
        canvasFrame.destroy()

# two sentences from the article
sentences = ['Washington state voters last fall passed Initiative 594', 'The White House also said it planned to ask Congress for $500 million to improve mental health care, and Obama issued a memorandum directing federal agencies to conduct or sponsor research into smart gun technology that reduces the risk of accidental gun discharges.']

# the simplest possible word tokenizer
# (a plain whitespace split, so punctuation stays glued to the neighbouring
# word, e.g. 'care,' and 'discharges.'); `sentences` is rebound from a list
# of strings to a list of token lists
sentences = [s.split() for s in sentences]

# part-of-speech tagging
# (`sentences` is rebound once more, now to the per-sentence output of
# nltk.pos_tag)
sentences = [nltk.pos_tag(s) for s in sentences]

# pattern for recognizing structures of the sentence
# NOTE(review): the VP rule accepts <NP|PP> but is listed before the PP rule.
# If the parser applies the rules once, in the order written (believed to be
# the RegexpParser default -- confirm), no PP chunk exists yet when VP runs
# and the PP alternative can never match. Verify this is intended.
pattern = '''
  NP: {<DT|JJ|NN.*|CD>+}   # Chunk sequences of DT, JJ, NN
  VP: {<VB.*><NP|PP>+}     # Chunk verbs and their arguments
  PP: {<IN><NP>}           # Chunk prepositions followed by NP
'''

# identify the chunks
# (one parse result per tagged sentence, in the same order)
NPChunker = nltk.RegexpParser(pattern)
chunks = [NPChunker.parse(s) for s in sentences]

# save to file
# (paths are relative to the working directory; print_tree writes one file
# per sentence)
print_tree(chunks[0], '../../Data/Chapter09/charts/sent1.ps')
print_tree(chunks[1], '../../Data/Chapter09/charts/sent2.ps')

In [5]:
import nltk

def _write_tokens(path, tokenized_sentences):
    '''
        Write every token of every sentence to `path`, one token per
        line, in the order given. An existing file is overwritten.
    '''
    with open(path, 'w') as out:
        for tokens in tokenized_sentences:
            for token in tokens:
                out.write(token + '\n')


# read the text
guns_laws = '../../Data/Chapter09/ST_gunLaws.txt'

with open(guns_laws, 'r') as f:
    article = f.read()

# load NLTK modules
sentencer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.RegexpTokenizer(r'\w+')
stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

# split the text into sentences
sentences = sentencer.tokenize(article)

# split each sentence into words: one list of tokens per sentence
words = [tokenizer.tokenize(sentence) for sentence in sentences]

# stem the words, keeping the per-sentence grouping
stemmed_words = [[stemmer.stem(word) for word in tokens]
                 for tokens in words]

# and lemmatize them, keeping the same grouping
lemmatized_words = [[lemmatizer.lemmatize(word) for word in tokens]
                    for tokens in words]

# and save the results to files (one token per line); the three outputs
# share a single writer instead of three copies of the same loop
file_words  = '../../Data/Chapter09/ST_gunLaws_words.txt'
file_stems  = '../../Data/Chapter09/ST_gunLaws_stems.txt'
file_lemmas = '../../Data/Chapter09/ST_gunLaws_lemmas.txt'

_write_tokens(file_words, words)
_write_tokens(file_stems, stemmed_words)
_write_tokens(file_lemmas, lemmatized_words)

FileNotFoundError: [Errno 2] No such file or directory: '../../Data/Chapter09/ST_gunLaws.txt'