In [3]:
%matplotlib inline
import sys
from pathlib import Path

import pandas as pd

import spacy
from spacy import displacy
from textacy.extract import ngrams, entities

In [4]:
#!{sys.executable} -m spacy validate

In [5]:
nlp = spacy.load('en_core_web_lg')
type(nlp) 


spacy.lang.en.English

In [6]:
def get_attributes(f):
    print([a for a in dir(f) if not a.startswith('_')], end=' ')

In [7]:
get_attributes(nlp)


['Defaults', 'add_pipe', 'analyze_pipes', 'batch_size', 'begin_training', 'component', 'component_names', 'components', 'config', 'create_optimizer', 'create_pipe', 'create_pipe_from_source', 'default_config', 'default_error_handler', 'disable_pipe', 'disable_pipes', 'disabled', 'enable_pipe', 'evaluate', 'factories', 'factory', 'factory_names', 'from_bytes', 'from_config', 'from_disk', 'get_factory_meta', 'get_factory_name', 'get_pipe', 'get_pipe_config', 'get_pipe_meta', 'has_factory', 'has_pipe', 'initialize', 'lang', 'make_doc', 'max_length', 'meta', 'path', 'pipe', 'pipe_factories', 'pipe_labels', 'pipe_names', 'pipeline', 'rehearse', 'remove_pipe', 'rename_pipe', 'replace_listeners', 'replace_pipe', 'resume_training', 'select_pipes', 'set_error_handler', 'set_factory_meta', 'to_bytes', 'to_disk', 'tokenizer', 'update', 'use_params', 'vocab'] 

In [8]:
path = Path('data/movie_data.csv')
files = sorted(list(path.glob('**/*.csv')))
doc_list = []
for i, file in enumerate(files):
    with open(str(file), encoding='latin1') as f:
        topic = file.parts[-2]
        lines = f.readlines()
        heading = lines[0].strip()
        body = ' '.join([l.strip() for l in lines[1:]])
        doc_list.append([topic.capitalize(), heading, body])

In [14]:
topic

'ZX'

In [18]:
doc.is_sentenced


  doc.is_sentenced


True

In [19]:
doc.is_tagged

  doc.is_tagged


True

In [20]:
doc.text

'While higher interest rates, slower growth, and softer labor market conditions will bring down inflation, they will also bring some pain to households and businesses,'

In [21]:
get_attributes(doc.vocab)

['add_flag', 'cfg', 'deduplicate_vectors', 'from_bytes', 'from_disk', 'get_noun_chunks', 'get_vector', 'has_vector', 'lang', 'length', 'lex_attr_getters', 'lookups', 'morphology', 'prune_vectors', 'reset_vectors', 'set_vector', 'strings', 'to_bytes', 'to_disk', 'vectors', 'vectors_length', 'writing_system'] 

In [22]:
pd.Series([token.text for token in doc])

0          While
1         higher
2       interest
3          rates
4              ,
5         slower
6         growth
7              ,
8            and
9         softer
10         labor
11        market
12    conditions
13          will
14         bring
15          down
16     inflation
17             ,
18          they
19          will
20          also
21         bring
22          some
23          pain
24            to
25    households
26           and
27    businesses
28             ,
dtype: object

In [23]:
pd.DataFrame([[t.text, t.lemma_, t.pos_, t.tag_, t.dep_, t.shape_, t.is_alpha, t.is_stop]
              for t in doc],
             columns=['text', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop'])

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,While,while,SCONJ,IN,mark,Xxxxx,True,True
1,higher,high,ADJ,JJR,amod,xxxx,True,False
2,interest,interest,NOUN,NN,compound,xxxx,True,False
3,rates,rate,NOUN,NNS,nsubj,xxxx,True,False
4,",",",",PUNCT,",",punct,",",False,False
5,slower,slow,ADJ,JJR,amod,xxxx,True,False
6,growth,growth,NOUN,NN,conj,xxxx,True,False
7,",",",",PUNCT,",",punct,",",False,False
8,and,and,CCONJ,CC,cc,xxx,True,True
9,softer,soft,ADJ,JJR,amod,xxxx,True,False


In [24]:
options = {'compact': True, 'bg': 'white',
           'color': 'black', 'font': 'Source Sans Pro', 'notebook': True}

In [25]:
displacy.render(doc, style='dep', options=options)

In [26]:
displacy.render(doc, style='ent', jupyter=True)



In [28]:
files = Path('bbc').glob('**/*.txt')
bbc_articles = []
for i, file in enumerate(sorted(list(files))):
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
        body = ' '.join([l.strip() for l in lines[1:]]).strip()
        bbc_articles.append(body)

In [43]:
doc = nlp(bbc_articles[5])
doc

Japan's economy teetered on the brink of a technical recession in the three months to September, figures show.  Revised figures indicated growth of just 0.1% - and a similar-sized contraction in the previous quarter. On an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. A common technical definition of a recession is two successive quarters of negative growth.  The government was keen to play down the worrying implications of the data. "I maintain the view that Japan's economy remains in a minor adjustment phase in an upward climb, and we will monitor developments carefully," said economy minister Heizo Takenaka. But in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead, observers were less sanguine. "It's painting a picture of a recovery... much patchier than previously thought," said Paul Sheard, economist at Lehman Brothers in T

In [31]:
sentences = [s for s in doc.sents]
sentences[:3]

[Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  ,
 The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.,
 TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.]

In [32]:
pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[0]], 
             columns=['Token', 'POS Tag', 'Meaning']).head(15)

Unnamed: 0,Token,POS Tag,Meaning
0,Quarterly,ADJ,adjective
1,profits,NOUN,noun
2,at,ADP,adposition
3,US,PROPN,proper noun
4,media,NOUN,noun
5,giant,NOUN,noun
6,TimeWarner,PROPN,proper noun
7,jumped,VERB,verb
8,76,NUM,numeral
9,%,NOUN,noun


In [33]:
displacy.render(sentences[0].as_doc(), style='ent', jupyter=True)

In [34]:
entities = [e.text for e in entities(doc)]
pd.Series(entities).value_counts().head()

TimeWarner        7
AOL               5
fourth quarter    3
AOL Europe        2
27%               2
dtype: int64

In [35]:
pd.Series([n.text for n in ngrams(doc, n=2, min_freq=2)]).value_counts()

fourth quarter     3
Time Warner        2
quarter profits    2
company said       2
AOL Europe         2
dtype: int64

In [37]:
iter_texts = (bbc_articles[i] for i in range(len(bbc_articles)))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
    if i % 100 == 0:
        print(i, end = ' ')
    assert doc.is_parsed

0 

  assert doc.is_parsed


100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 

In [44]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [46]:
prerect = preprocessor(bbc_articles[5])

In [50]:
prerect

'japan s economy teetered on the brink of a technical recession in the three months to september figures show revised figures indicated growth of just 0 1 and a similar sized contraction in the previous quarter on an annual basis the data suggests annual growth of just 0 2 suggesting a much more hesitant recovery than had previously been thought a common technical definition of a recession is two successive quarters of negative growth the government was keen to play down the worrying implications of the data i maintain the view that japan s economy remains in a minor adjustment phase in an upward climb and we will monitor developments carefully said economy minister heizo takenaka but in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead observers were less sanguine it s painting a picture of a recovery much patchier than previously thought said paul sheard economist at lehman brothers in tokyo improvements in the jo