In [0]:
!apt update

In [0]:
!apt install -y cmake

In [0]:
!pip install -U spacy

In [0]:
!pip install textacy

In [0]:
!pip install textacy[viz]

In [0]:
!pip install textacy[lang]

In [0]:
import textacy

In [0]:
import textacy.datasets

In [0]:
cw = textacy.datasets.CapitolWords()

In [0]:
cw.download()

In [0]:
records = cw.records(speaker_name={'Hillary Clinton', 'Barack Obama'})

In [0]:
import spacy

In [117]:
!python -m spacy download es

Collecting https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.0.0/es_core_news_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.0.0/es_core_news_sm-2.0.0.tar.gz (36.7MB)
[K    100% |████████████████████████████████| 36.7MB 59.3MB/s 

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/es_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/es

    You can now load the model via spacy.load('es')



In [118]:
spacy.load('es')

<spacy.lang.es.Spanish at 0x7f82f40789e8>

In [0]:
text_stream, metadata_stream = textacy.io.split_record_fields(
    records, 'text')

In [0]:
corpus = textacy.Corpus('es', texts=text_stream, metadatas=metadata_stream)

In [121]:
corpus

Corpus(1241 docs; 858604 tokens)

In [0]:
vectorizer = textacy.Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth', min_df=2, max_df=0.95)

In [0]:
doc_term_matrix = vectorizer.fit_transform((doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)for doc in corpus))

In [124]:
print(repr(doc_term_matrix))

<1241x16318 sparse matrix of type '<class 'numpy.float64'>'
	with 316477 stored elements in Compressed Sparse Row format>


In [0]:
model = textacy.TopicModel('nmf', n_topics=10)

In [0]:
model.fit(doc_term_matrix)

In [0]:
doc_topic_matrix = model.transform(doc_term_matrix)

In [128]:
doc_topic_matrix.shape

(1241, 10)

In [134]:
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
  print('topic', topic_idx, ':', '   '.join(top_terms))

topic 0 : to   we   of   and   that   a   i   in   arar   have
topic 1 : of   to   Postal   and   Court   Tax   a   in   torturar   Service
topic 2 : Justice   her   of   to   Texas   and   a   majority   dissent   in
topic 3 : gun   guns   of   to   dealers   and   that   a      in
topic 4 : of   and   to   in   visar   a   States   United   for   shall
topic 5 : and   of   to   in   a      women   for   that   health
topic 6 : blind   to   TANF   of   tribes   Indians   earnings   in   welfare   and
topic 7 : Brother   Stephen   Brother Stephen   College   and   of   his   in   was   Manhattan
topic 8 : eat   meat   product   ready   poultry   of   to   Secretary   a   risk
topic 9 : EPA   and   to   air   of   White   that   in      what


In [0]:
obama_docs = list(corpus.get(lambda doc: doc.metadata['speaker_name'] == 'Barack Obama'))

In [0]:
doc = corpus[-1]

In [137]:
doc

Doc(2999 tokens; "In the Federalist Papers, we often hear the ref...")

In [138]:
textacy.preprocess_text(doc.text, lowercase=True, no_punct=True)[:70]

'in the federalist papers we often hear the reference to the senate s r'

In [139]:
textacy.text_utils.keyword_in_context(doc.text, 'America', window_width=35)

g on this tiny piece of Senate and  America n history. Some 10 years ago, I ask
o do the hard work in New York and  America , who get up every day and do the v
say: You know, you never can count  America  out. Whenever the chips are down, 
 what we know will give our fellow  America ns a better shot at the kind of fut
aith in this body and in my fellow  America ns. I remain an optimist, that Amer
ricans. I remain an optimist, that  America 's best days are still ahead of us.


In [140]:
list(textacy.extract.ngrams(doc, 2, filter_stops=True, filter_punct=True, filter_nums=False))[:15]

[In the,
 the Federalist,
 Federalist Papers,
 we often,
 often hear,
 hear the,
 the reference,
 reference to,
 to the,
 the Senate,
 Senate's,
 's role,
 to avert,
 avert the,
 the consequences]

In [0]:
ts = textacy.TextStats(doc)

In [142]:
ts.n_unique_words

1107

In [143]:
ts.basic_counts

{'n_chars': 11498,
 'n_long_words': 512,
 'n_monosyllable_words': 1355,
 'n_polysyllable_words': 386,
 'n_sents': 101,
 'n_syllables': 4183,
 'n_unique_words': 1107,
 'n_words': 2516}

In [144]:
ts.flesch_kincaid_grade_level

13.74345102236774

In [145]:
ts.readability_stats

{'automated_readability_index': 12.549920902265107,
 'coleman_liau_index': 9.882109957869638,
 'flesch_kincaid_grade_level': 13.74345102236774,
 'flesch_reading_ease': 81.67731398259063,
 'gulpease_index': 55.34340222575517,
 'gunning_fog_index': 16.10108139589794,
 'lix': 45.26065261534103,
 'smog_index': 14.29716418426051,
 'wiener_sachtextformel': 7.137036602181681}

In [0]:
bot = doc.to_bag_of_terms(ngrams={2, 3}, as_strings=True)

In [147]:
sorted(bot.items(), key=lambda x: x[1], reverse=True)[:10]

[('', 26),
 ('I', 21),
 ('New York', 18),
 ('of the', 17),
 ('to the', 8),
 ('and the', 8),
 ('in the', 8),
 ('the Senate', 7),
 ('I have', 7),
 ('and I', 7)]