In [1]:
import spacy

In [2]:
# Start from a blank pipeline created for the language code "en"; the cells
# below customise its tokenizer and add components to it.
nlp = spacy.blank("en")

In [3]:
# Run the pipeline on one sentence and gather the text of each token,
# skipping any token that evaluates as falsy.
doc = nlp("The hardest choices require the strongest givn will!")
tokens = []
for token in doc:
    if token:
        tokens.append(token.text)
tokens

['The',
 'hardest',
 'choices',
 'require',
 'the',
 'strongest',
 'givn',
 'will',
 '!']

In [4]:
# Customise the tokenizer so that slang and other custom words are split the
# way we want: here "givn" becomes the two tokens "giv" and "n".
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("givn", [{ORTH: "giv"}, {ORTH: "n"}])

# Tokenize the same sentence again to see the effect of the special case.
doc = nlp("The hardest choices require the strongest givn will!")
tokens = []
for token in doc:
    if token:
        tokens.append(token.text)
tokens

['The',
 'hardest',
 'choices',
 'require',
 'the',
 'strongest',
 'giv',
 'n',
 'will',
 '!']

# Sentence Tokenization

In [5]:
# Add the sentencizer only if it is not registered yet, so that re-running this
# cell does not fail because the component is already in the pipeline.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')

# With the sentencizer in place, doc.sents yields one span per sentence.
doc = nlp("The hardest choices require the strongest givn will! In the end it cost The great sacrifice. Now I can finally rest, and watch the sun rise on the great universe.")
for sentence in doc.sents:
    print(sentence)


The hardest choices require the strongest givn will!
In the end it cost The great sacrifice.
Now I can finally rest, and watch the sun rise on the great universe.


In [6]:
# Show the names of the components currently registered on the pipeline.
nlp.pipe_names

['sentencizer']

                                                    EXERCISE

In [7]:
# Sample passage for the URL exercise. The backslash after
# "http://www.science." is a line continuation inside the string literal, so
# that URL is stored unbroken as "http://www.science.gov/".
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.\
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# TODO: Write code here
# Hint: token has an attribute that can be used to detect a url
def get_token(text):
    """Return the text of every URL-like token found in ``text``.

    The text is run through the notebook-level ``nlp`` pipeline and each token
    whose ``like_url`` flag is set is collected, in document order.

    Args:
        text: The string to scan.

    Returns:
        A list of strings, one per URL-like token; empty when none are found.
    """
    # Collect and return the matches instead of printing inside the loop and
    # returning the loop variable: that returned the last token of the document
    # (not a URL) and failed on input that produces no tokens.
    return [token.text for token in nlp(text) if token.like_url]


for url in get_token(text):
    print(url)

http://www.data.gov/
http://www.science.gov/
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.




Exercise (2): Extract all money transactions, along with their currency, from the sentence below. The output should be:

two $

500 €

In [8]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# TODO: Write code here
# Hint: Use token.i for the index of a token and token.is_currency for currency symbol detection
doc = nlp(transactions)

# For every currency symbol, pair it with the number-like token next to it.
# The amount written before the symbol is preferred ("two $"); otherwise the
# one written after it is used. The pair is joined with a space, in the order
# it appears in the text, to match the expected output.
currency_transaction_list = []
for token in doc:
    if not token.is_currency:
        continue
    i = token.i
    # Check the bounds before indexing: doc[-1] would wrap around to the last
    # token and could pair a leading symbol with an unrelated number.
    if i > 0 and doc[i - 1].like_num:
        currency_transaction_list.append(f"{doc[i - 1].text} {token.text}")
    elif i + 1 < len(doc) and doc[i + 1].like_num:
        currency_transaction_list.append(f"{token.text} {doc[i + 1].text}")

for transaction in currency_transaction_list:
    print(transaction)

two$
500€
