## Tutorial 1

In [None]:
from flair.data import Sentence

In [None]:
sentence = Sentence("The grass is green .")

In [None]:
print(sentence)

In [None]:
sentence.get_token(3)

In [None]:
sentence[2]

In [None]:
sentence = Sentence("The grass is green.",use_tokenizer=True)

In [None]:
sentence

In [None]:
sentence[3].add_tag('ner','color')

In [None]:
print(sentence.to_tagged_string())

In [None]:
from flair.data import Label

In [None]:
sentence[3].get_tag('ner')

In [None]:
# type annotation syntax
tag: Label = sentence[3].get_tag('ner')

In [None]:
tag = sentence[3].get_tag('ner')

In [None]:
f'"{sentence[3]}" is tagged as "{tag.value}" with confidence score "{tag.score}"'

In [None]:
sentence = Sentence('France is the current world cup winner.')

In [None]:
sentence.add_labels(['sports','world cup'])

In [None]:
sentence = Sentence('France is the current world cup winner.',labels=['sports','world cup'])

In [None]:
for label in sentence.labels:
    print(label)

## Tutorial 2

In [None]:
from flair.models import SequenceTagger

In [None]:
tagger = SequenceTagger.load('ner')

In [None]:
sentence = Sentence('George Washington went to Washington')
tagger.predict(sentence)

In [None]:
sentence.to_tagged_string()

In [None]:
for entity in sentence.get_spans('ner'):
    print(entity)

In [None]:
sentence.to_dict(tag_type='ner')

## Semantic frames

In [None]:
tagger = SequenceTagger.load('frame')

In [None]:
sentence = Sentence("The machine caused severe disruption.",use_tokenizer=True)

In [None]:
tagger.predict(sentence)

In [None]:
sentence.to_tagged_string()

In [None]:
sentence.to_dict()

In [None]:
text = "This is a sentence. This is another sentence. I love Berlin."

In [None]:
from segtok.segmenter import split_single
sentences = [Sentence(sent,use_tokenizer=True) for sent in split_single(text)]

In [None]:
tagger: SequenceTagger = SequenceTagger.load('ner')

In [None]:
tagger.predict(sentences)

In [None]:
from flair.models import TextClassifier
classifier = TextClassifier.load('en-sentiment')

In [None]:
sentence = Sentence('I really did not like that movie!')
classifier.predict(sentence)
sentence.labels

## Tutorial 3 Word Embeddings

In [None]:
from flair.embeddings import WordEmbeddings

In [None]:
glove_embedding = WordEmbeddings('glove')

In [None]:
sentence = Sentence('The grass is green .')
glove_embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

In [None]:
ls ../../models

In [1]:
#!pip3 install --upgrade git+https://github.com/zalandoresearch/flair.git
from flair.embeddings import FastTextEmbeddings 

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
fasttest_kb = FastTextEmbeddings('../../models/kranten_pd_1875-6_model.fasttext')

2019-07-26 13:48:31,852 this function is deprecated, use smart_open.open instead


NotImplementedError: Supervised fastText models are not supported

In [None]:
sentence = Sentence('De Heer besliste over zijn lot')
fasttest_kb.embed(sentence)

In [None]:
sentence[0].embedding

In [4]:
from flair.embeddings import WordEmbeddings, CharacterEmbeddings

In [5]:
glove_embedding = WordEmbeddings('glove')

2019-07-26 13:54:02,842 this function is deprecated, use smart_open.open instead


In [6]:
character_embeddings = CharacterEmbeddings()

2019-07-26 13:54:22,027 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/common_characters not found in cache, downloading to /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmp60qhomtc


100%|██████████| 2887/2887 [00:00<00:00, 574101.82B/s]

2019-07-26 13:54:22,146 copying /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmp60qhomtc to cache at /Users/kbeelen/.flair/datasets/common_characters
2019-07-26 13:54:22,150 removing temp file /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmp60qhomtc





In [7]:
from flair.embeddings import StackedEmbeddings

In [8]:
stacked_embeddings = StackedEmbeddings(
    embeddings=[glove_embedding,character_embeddings]
    )

In [10]:
from flair.data import Sentence
sentence = Sentence('The grass is green .')

## Tutorial 4

In [12]:
from flair.embeddings import FlairEmbeddings
flair_embedding_forward = FlairEmbeddings('news-forward')

In [15]:
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)

[Sentence: "The grass is green ." - 5 Tokens]

In [16]:
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings

stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])

2019-07-26 14:01:03,602 this function is deprecated, use smart_open.open instead
2019-07-26 14:01:04,796 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt not found in cache, downloading to /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpshh11n83


100%|██████████| 73034575/73034575 [00:19<00:00, 3803823.69B/s]

2019-07-26 14:01:24,166 copying /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpshh11n83 to cache at /Users/kbeelen/.flair/embeddings/news-backward-0.4.1.pt





2019-07-26 14:01:24,301 removing temp file /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpshh11n83


## Tutorial 5

In [17]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence

In [18]:
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

2019-07-26 14:06:18,964 this function is deprecated, use smart_open.open instead


In [19]:
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                             flair_embedding_backward,
                                             flair_embedding_forward])

In [20]:
sentence = Sentence('The grass is green . And the sky is blue .')
document_embeddings.embed(sentence)

In [21]:
sentence.get_embedding()

tensor([-0.3197,  0.2621,  0.4037,  ..., -0.0013, -0.0026,  0.0170],
       grad_fn=<CatBackward>)

In [25]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding],rnn_type='LSTM')

2019-07-26 14:22:36,981 this function is deprecated, use smart_open.open instead


In [26]:
sentence = Sentence('The grass is green . And sky is blue .')
document_embeddings.embed(sentence)
print(sentence.get_embedding())

tensor([ 1.0580e-03,  8.6338e-02,  1.1168e-01, -2.7024e-01, -5.4331e-02,
         3.0991e-01,  1.1335e-01,  1.2424e-01, -4.4725e-02, -1.0919e-01,
         2.5089e-02,  2.1561e-01,  1.2661e-01,  3.3203e-02, -2.0583e-02,
        -4.8562e-02,  2.9461e-02,  2.0718e-01,  1.9399e-01, -1.8463e-01,
        -4.3655e-02, -7.0158e-03,  3.1458e-02,  1.7692e-01,  1.3177e-01,
         1.5268e-01,  2.2838e-01, -1.9117e-02,  1.3557e-01, -2.0827e-01,
         1.2222e-01,  1.7133e-01, -1.5888e-01, -9.2618e-02, -2.7804e-01,
        -4.2703e-02,  7.8886e-02,  4.8838e-02,  2.6226e-02, -8.8694e-02,
         3.4525e-01, -7.7469e-02, -1.3248e-01, -3.4263e-02, -1.3785e-01,
        -1.2376e-01, -1.4253e-01,  4.4431e-02, -4.6080e-02,  9.0680e-02,
         1.1729e-01,  4.1246e-02, -7.0335e-02, -8.4158e-02,  2.3115e-01,
        -3.7526e-02,  9.7894e-02, -1.6192e-01, -1.4767e-02, -6.6481e-02,
         5.4451e-02,  3.9360e-02,  1.1441e-01,  2.4753e-01, -3.7049e-02,
        -2.6242e-02,  7.8224e-02,  1.0155e-01,  5.0

## Tutorial 6

In [28]:
import flair.datasets
corpus = flair.datasets.UD_ENGLISH()

2019-07-26 14:26:00,232 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu not found in cache, downloading to /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpc_x8j7kj


1668174B [00:00, 38817358.56B/s]         

2019-07-26 14:26:00,313 copying /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpc_x8j7kj to cache at /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2019-07-26 14:26:00,317 removing temp file /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpc_x8j7kj





2019-07-26 14:26:01,154 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu not found in cache, downloading to /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpd5ipwcir


1661985B [00:00, 38151186.44B/s]         

2019-07-26 14:26:01,255 copying /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpd5ipwcir to cache at /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-test.conllu
2019-07-26 14:26:01,260 removing temp file /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpd5ipwcir





2019-07-26 14:26:01,871 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu not found in cache, downloading to /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpnwkpgcbi


13303045B [00:00, 35650766.63B/s]                             

2019-07-26 14:26:02,288 copying /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpnwkpgcbi to cache at /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-07-26 14:26:02,312 removing temp file /var/folders/2_/fcdvqwzs6j75cfr97nggzfn499sjqf/T/tmpnwkpgcbi
2019-07-26 14:26:02,313 Reading data from /Users/kbeelen/.flair/datasets/ud_english
2019-07-26 14:26:02,314 Train: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-07-26 14:26:02,314 Test: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-test.conllu
2019-07-26 14:26:02,315 Dev: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-dev.conllu





In [30]:
print(len(corpus.train))

12543


In [31]:
corpus.test[0]

Sentence: "What if Google Morphed Into GoogleOS ?" - 7 Tokens

In [33]:
corpus.test[0].to_tagged_string('pos')

'What <WP> if <IN> Google <NNP> Morphed <VBD> Into <IN> GoogleOS <NNP> ? <.>'

In [34]:
downsampled_corpus = flair.datasets.UD_ENGLISH().downsample(0.1)


2019-07-26 14:28:18,880 Reading data from /Users/kbeelen/.flair/datasets/ud_english
2019-07-26 14:28:18,881 Train: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2019-07-26 14:28:18,881 Test: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-test.conllu
2019-07-26 14:28:18,882 Dev: /Users/kbeelen/.flair/datasets/ud_english/en_ewt-ud-dev.conllu


<flair.datasets.UD_ENGLISH at 0x192410128>

In [35]:
print(downsampled_corpus)

Corpus: 1254 train + 200 dev + 208 test sentences


In [37]:
corpus = flair.datasets.CONLL_03_DUTCH()


2019-07-26 14:30:44,040 Reading data from /Users/kbeelen/.flair/datasets/conll_03_dutch
2019-07-26 14:30:44,041 Train: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.train
2019-07-26 14:30:44,041 Dev: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.testa
2019-07-26 14:30:44,042 Test: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.testb
2019-07-26 14:30:44,043 UTF-8 can't read: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.train ... using "latin-1" instead.
2019-07-26 14:30:49,442 UTF-8 can't read: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.testb ... using "latin-1" instead.
2019-07-26 14:30:52,201 UTF-8 can't read: /Users/kbeelen/.flair/datasets/conll_03_dutch/ned.testa ... using "latin-1" instead.


In [41]:
tag_dict = corpus.make_tag_dictionary('ner')
print(tag_dict)

<flair.data.Dictionary object at 0x16fa487f0>


In [48]:
tag_dict.get_item_for_index(2)

'S-ORG'

In [49]:
tag_dict.get_idx_for_item('O')

1

In [50]:
stats = corpus.obtain_statistics()
print(stats)

{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 15806,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 202931,
            "min": 1,
            "max": 859,
            "avg": 12.838858661267873
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 5195,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 68994,
            "min": 1,
            "max": 409,
            "avg": 13.280846968238691
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 2895,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 37761,
            "min": 1,
            "max": 84,
            "avg": 13.043523316062176
        }
    }
}


## Tutorial 7 

### later 

## Tutorial 8 

### later

## Tutorial 9

See FineTuneModel.ipynb