# Text Tokenizing example
### Using spaCy & NLTK for tokenization

In [13]:
# One-time setup, kept commented out: downloads the model loaded below.
# import spacy
# from spacy.cli.download import download
# download(model="en_core_web_sm")

import spacy

# The author also noted 'en' as a model name here; presumably the older
# shortcut link for the same pipeline -- TODO confirm.
nlp = spacy.load('en_core_web_sm')
text = "Mary, don't slap the green witch"

# Lower-case the sentence first, then collect the text of every token.
tokens = [token.text for token in nlp(text.lower())]
print(tokens)

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [7]:
from nltk.tokenize import TweetTokenizer

# Adjacent literals inside the parentheses are joined into a single string;
# note there is no space between "Degrees" and "#MakeAMovieCold".
tweet = (u"Snow White and the Seven Degrees"
         u"#MakeAMovieCold@midnight:-)")
tokenizer = TweetTokenizer()

# Tokenize the lower-cased tweet and show the resulting token list.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)


['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


In [14]:
def n_grams(text, n):
    '''
    Build every contiguous n-gram of a sequence.

    Works on anything that supports len() and slicing: a list of tokens
    yields lists of n tokens, a string yields substrings of n characters.

    Args:
        text: token list or string to slide the window over.
        n: window size; must be at least 1.

    Returns:
        A list of the len(text) - n + 1 slices of length n, in order.
        Empty when n is larger than len(text).

    Raises:
        ValueError: if n is smaller than 1. Without this guard the slicing
            below would silently return empty or ragged slices.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    # range() is empty when n > len(text), so oversized windows yield [].
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Sample token list to slide a 3-token window over.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
trigrams = n_grams(cleaned, 3)

# The result is wrapped in one more list before printing, so the printed
# output carries an extra level of nesting.
print([trigrams])

[[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]]


### Lemmatization (표제어 추출)
##### ex) fly -> flies, flew, flown, flying : fly는 뒤의 모든 단어의 표제어

In [15]:
import spacy

# Run the small English pipeline over one sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"he was running late")

# Print one "token --> lemma" line per token.
for token in doc:
    lemma = token.lemma_
    print('{} --> {}'.format(token, lemma))


he --> he
was --> be
running --> run
late --> late


### POS(part of speech) Tagging

In [16]:
import spacy

# Run the small English pipeline over one sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Mary slapped the green witch.")

# Print one "token - POS tag" line per token.
for token in doc:
    pos_tag = token.pos_
    print("{} - {}".format(token, pos_tag))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


### Chunking (shallow parsing)

In [17]:
import spacy

# Run the small English pipeline over one sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Mary slapped the green witch.")

# Print one "chunk - label" line per noun chunk found in the sentence.
for chunk in doc.noun_chunks:
    chunk_label = chunk.label_
    print("{} - {}".format(chunk, chunk_label))





Mary - NP
the green witch - NP
