[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IINemo/ostrov2019-seminar/blob/master/src/symptom_checker.ipynb)

# Preparations

In [None]:
!wget http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtraining.tar.gz
!wget http://www.nactem.ac.uk/tsujii/GENIA/ERtask/Genia4ERtest.tar.gz

In [2]:
!tar -xf Genia4ERtraining.tar.gz
!tar -xf Genia4ERtest.tar.gz

In [None]:
!pip install flair
!pip install allennlp

# Train a tagging model

In [None]:
from flair.datasets import ColumnCorpus
from flair.data import Corpus


# 1. Get corpus
# Column 0 of the data files holds the token text and column 1 its NER tag.
# The loaded corpus is then downsampled with a factor of 0.5.
column_layout = {0: 'text', 1: 'ner'}
corpus: Corpus = ColumnCorpus(
    data_folder='./',
    column_format=column_layout,
    train_file='Genia4ERtask1.iob2',
    test_file='Genia4EReval1.iob2',
).downsample(0.5)
print(corpus)

# Print every item of the 'ner' tag dictionary built from the corpus.
ner_tags = corpus.make_tag_dictionary('ner').get_items()
for ner_tag in ner_tags:
    print(ner_tag)

In [None]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, FlairEmbeddings, ELMoEmbeddings
from typing import List
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer


# Build a sequence tagger for the 'ner' tag and train it on the corpus.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)


# Hyper-parameters to experiment with for the homework.
HIDDEN_SIZE = 128       # <========= TODO: Try to modify this: 200, 250
MINI_BATCH_SIZE = 80    # <=========== TODO: Try to decrease this: 60, 30
MAX_EPOCHS = 10         # <============ TODO: Try to increase this: 15, 20


# Embeddings that are stacked to represent each token.
# Every alternative below ends with a comma, so any combination of lines can
# be uncommented without producing a SyntaxError.
embedding_types: List[TokenEmbeddings] = [

    # TODO: <==================== Try uncommenting different lines
    WordEmbeddings('en-glove'),
    # WordEmbeddings('en'),
    # WordEmbeddings('en-crawl'),

    # TODO: <==================== Try uncommenting different lines
    # CharacterEmbeddings(),
    # FlairEmbeddings('pubmed-forward'),
    # FlairEmbeddings('pubmed-backward'),
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
    # ELMoEmbeddings('pubmed'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)


tagger: SequenceTagger = SequenceTagger(hidden_size=HIDDEN_SIZE,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# Training output goes to taggers/bio-ner; later cells read loss.tsv and
# best-model.pt from that directory.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('taggers/bio-ner',
              learning_rate=0.1,
              mini_batch_size=MINI_BATCH_SIZE,
              max_epochs=MAX_EPOCHS,
              eval_mini_batch_size=100)

# Plot training curves

In [None]:
# 8. plot training curves (optional)

loss_stats_path = 'taggers/bio-ner/loss.tsv'
with open(loss_stats_path, 'r') as f:
    lines = f.readlines()

with open(loss_stats_path, 'w') as f:
    f.write('\n'.join(e.strip() for e in lines if e.strip()))

from flair.visual.training_curves import Plotter
plotter = Plotter()
plotter.plot_training_curves(loss_stats_path)
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
%matplotlib inline
img = mpimg.imread('taggers/bio-ner/training.png')
plt.figure(dpi=300)
plt.imshow(img, )

# Evaluate model

Homework: you have to reach a test F1 score of at least 0.62!

In [12]:
# Load the best-model.pt checkpoint from the training directory and score it
# on the test split of the corpus.
best_model_path = 'taggers/bio-ner/best-model.pt'
trained_model = SequenceTagger.load(best_model_path)

test_split = corpus.test
eval_res, eval_loss = trained_model.evaluate(test_split, eval_mini_batch_size=100)

print('Test F1 score:', eval_res.main_score)

2019-07-13 22:01:16,730 loading file taggers/bio-ner/best-model.pt
Test F1 score: 0.5512


# Use model to analyze texts

In [13]:
# Load the same checkpoint that the evaluation cell uses, so that
# `trained_model` is available for tagging new text in the cells below.
trained_model = SequenceTagger.load('taggers/bio-ner/best-model.pt')

2019-07-13 22:01:47,994 loading file taggers/bio-ner/best-model.pt


In [None]:
from flair.data import Sentence

# Example inputs with whitespace-separated tokens; uncomment a different
# `text` line to try another sentence.
#text = 'Freshly isolated cells contain high levels of G0S2 mRNA which rapidly decline .'
#text = 'In THP-1 cells , TPA also induced a new , faster-migrating NF kappa B species not induced in monocytes .'
text = 'Fibroblast growth factors ( FGFs ) are heparin-binding proteins crucial to embryogenesis , angiogenesis , and wound healing .'

# predict() attaches the predicted tags to the Sentence object it is given,
# so keep a reference to that object and read the result from it instead of
# indexing into predict()'s return value, which is not the same in every
# flair version.
sentence = Sentence(text)
trained_model.predict(sentence)

# Last expression of the cell: the tagged sentence is shown as the output.
sentence.to_tagged_string()