In [2]:
#import the MeTA python bindings
import metapy
#If you'd like, you can tell MeTA to log to stderr so you can get progress output when running long-running function calls.
metapy.log_to_stderr()

In [3]:
doc = metapy.index.Document()
doc.content("I said that I can't believe that it only costs $19.95!")

In [4]:
tok = metapy.analyzers.ICUTokenizer()

In [5]:
tok.set_content(doc.content()) # this could be any string
tokens = [token for token in tok]
print(tokens)

['<s>', 'I', 'said', 'that', 'I', "can't", 'believe', 'that', 'it', 'only', 'costs', '$', '19.95', '!', '</s>']


In [6]:
doc.content("I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.")
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

['<s>', 'I', 'said', 'that', 'I', "can't", 'believe', 'that', 'it', 'only', 'costs', '$', '19.95', '!', '</s>', '<s>', 'I', 'could', 'only', 'find', 'it', 'for', 'more', 'than', '$', '30', 'before', '.', '</s>']


In [7]:
tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

['I', 'said', 'that', 'I', "can't", 'believe', 'that', 'it', 'only', 'costs', '$', '19.95', '!', 'I', 'could', 'only', 'find', 'it', 'for', 'more', 'than', '$', '30', 'before', '.']


In [8]:
tok = metapy.analyzers.LengthFilter(tok, min=2, max=30)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

['said', 'that', "can't", 'believe', 'that', 'it', 'only', 'costs', '19.95', 'could', 'only', 'find', 'it', 'for', 'more', 'than', '30', 'before']


In [22]:
import wget

url = "https://raw.githubusercontent.com/meta-toolkit/meta/master/data/lemur-stopwords.txt"

filename = wget.download(url)

filename

'lemur-stopwords.txt'

In [23]:
tok = metapy.analyzers.ListFilter(tok, "lemur-stopwords.txt", metapy.analyzers.ListFilter.Type.Reject)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

["can't", 'believe', 'costs', '19.95', 'find', '30']


In [24]:
tok = metapy.analyzers.Porter2Filter(tok)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

["can't", 'believ', 'cost', '19.95', 'find', '30']


In [25]:
tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
tok = metapy.analyzers.LowercaseFilter(tok)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

['i', 'said', 'that', 'i', "can't", 'believe', 'that', 'it', 'only', 'costs', '$', '19.95', '!', 'i', 'could', 'only', 'find', 'it', 'for', 'more', 'than', '$', '30', 'before', '.']


In [26]:
ana = metapy.analyzers.NGramWordAnalyzer(1, tok)
print(doc.content())
unigrams = ana.analyze(doc)
print(unigrams)

I said that I can't believe that it only costs $19.95! I could only find it for more than $30 before.
{'30': 1, '.': 1, 'more': 1, 'it': 2, 'said': 1, 'only': 2, "can't": 1, 'believe': 1, '19.95': 1, 'before': 1, 'could': 1, 'find': 1, 'that': 2, 'than': 1, '!': 1, 'for': 1, 'i': 3, '$': 2, 'costs': 1}


In [27]:
ana = metapy.analyzers.NGramWordAnalyzer(2, tok)
bigrams = ana.analyze(doc)
print(bigrams)

{('it', 'only'): 1, ('19.95', '!'): 1, ('only', 'costs'): 1, ('it', 'for'): 1, ('than', '$'): 1, ('$', '19.95'): 1, ('believe', 'that'): 1, ('$', '30'): 1, ('!', 'i'): 1, ('30', 'before'): 1, ('more', 'than'): 1, ('i', "can't"): 1, ('find', 'it'): 1, ('for', 'more'): 1, ('before', '.'): 1, ("can't", 'believe'): 1, ('that', 'i'): 1, ('i', 'said'): 1, ('that', 'it'): 1, ('only', 'find'): 1, ('could', 'only'): 1, ('said', 'that'): 1, ('costs', '$'): 1, ('i', 'could'): 1}


In [28]:
tok = metapy.analyzers.CharacterTokenizer()
ana = metapy.analyzers.NGramWordAnalyzer(4, tok)
fourchar_ngrams = ana.analyze(doc)
print(fourchar_ngrams)

{(' ', 'o', 'n', 'l'): 2, ('s', 't', 's', ' '): 1, ('t', 'h', 'a', 't'): 2, ('t', ' ', 'I', ' '): 1, ('i', 'n', 'd', ' '): 1, ('h', 'a', 'n', ' '): 1, ('m', 'o', 'r', 'e'): 1, ('c', 'a', 'n', "'"): 1, ('$', '3', '0', ' '): 1, ('n', 'd', ' ', 'i'): 1, ('t', ' ', 'b', 'e'): 1, ('I', ' ', 's', 'a'): 1, ('a', 'i', 'd', ' '): 1, ('l', 'y', ' ', 'c'): 1, ('r', 'e', ' ', 't'): 1, ('l', 'y', ' ', 'f'): 1, ('e', 'l', 'i', 'e'): 1, ('o', 'r', ' ', 'm'): 1, ('i', 'd', ' ', 't'): 1, ('$', '1', '9', '.'): 1, ('t', ' ', 'f', 'o'): 1, ('l', 'i', 'e', 'v'): 1, ('d', ' ', 't', 'h'): 1, ('.', '9', '5', '!'): 1, ('t', ' ', 'i', 't'): 1, ("'", 't', ' ', 'b'): 1, (' ', 'b', 'e', 'f'): 1, ('9', '.', '9', '5'): 1, ('9', '5', '!', ' '): 1, ('d', ' ', 'o', 'n'): 1, ('3', '0', ' ', 'b'): 1, ('b', 'e', 'f', 'o'): 1, ('f', 'o', 'r', 'e'): 1, ('r', ' ', 'm', 'o'): 1, ('i', 't', ' ', 'o'): 1, (' ', 'c', 'o', 'u'): 1, ('n', ' ', '$', '3'): 1, ('t', ' ', 'o', 'n'): 1, (' ', 'I', ' ', 'c'): 2, ('b', 'e', 'l', 'i'): 1,

In [29]:
seq = metapy.sequence.Sequence()

In [30]:
for word in ["The", "dog", "ran", "across", "the", "park", "."]:
    seq.add_symbol(word)

print(seq)

(The, ???), (dog, ???), (ran, ???), (across, ???), (the, ???), (park, ???), (., ???)


## wget @ POS Tagging Section

In [None]:
# wget -nc https://github.com/meta-toolkit/meta/releases/download/v3.0.1/greedy-perceptron-tagger.tar.gz
# tar xvf greedy-perceptron-tagger.tar.gz

In [None]:
tok = metapy.analyzers.ListFilter(tok, "lemur-stopwords.txt", metapy.analyzers.ListFilter.Type.Reject)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

In [33]:
doc = metapy.index.Document()
doc.content("I said that I can't believe that it only costs $19.95!")
tok = metapy.analyzers.ICUTokenizer() # keep sentence boundaries!
tok = metapy.analyzers.PennTreebankNormalizer(tok)
tok.set_content(doc.content())
tokens = [token for token in tok]
print(tokens)

['<s>', 'I', 'said', 'that', 'I', 'ca', "n't", 'believe', 'that', 'it', 'only', 'costs', '$', '19.95', '!', '</s>']


In [34]:
def extract_sequences(tok):
    sequences = []
    for token in tok:
        if token == '<s>':
            sequences.append(metapy.sequence.Sequence())
        elif token != '</s>':
            sequences[-1].add_symbol(token)
    return sequences

doc = metapy.index.Document()
doc.content("I said that I can't believe that it only costs $19.95!")
tok.set_content(doc.content())
for seq in extract_sequences(tok):
    tagger.tag(seq)
    print(seq)

NameError: name 'tagger' is not defined

In [35]:
ana = metapy.analyzers.load('config.toml')
doc = metapy.index.Document()
doc.content("I said that I can't believe that it only costs $19.95!")
print(ana.analyze(doc))

RuntimeError: missing feature id mapping: crf/feature.mapping.gz