In [None]:
from utils.classificaton_utils import binarize
from sklearn.model_selection import train_test_split
from flair.models.text_classification_model import TARSClassifier
from flair.datasets import SentenceDataset
from flair.data import Sentence, Corpus
import pandas as pd

In [None]:
# Show the installed flair version as the cell output (for reproducibility).
import flair
flair.__version__

In [None]:
# Print the version string of the Python interpreter running this kernel.
import sys
print(sys.version)

In [None]:
# Target lemma and its part-of-speech tag.
lemma = 'machine'
pos = 'NN'

# Sense identifiers handed to `binarize` together with the lemma.
senses = {
    'machine_nn01-38475835',
    'machine_nn01-38475923',
}

# Relation types handed to `binarize`; 'descendant' and 'sibling' are
# deliberately left out of this run.
relations = ['seed', 'synonym']

# Produce the three data splits.
# NOTE(review): start/end (1700, 2000) look like a year range -- confirm
# against the `binarize` helper in utils.
train, val, test = binarize(
    lemma,
    pos,
    senses,
    relations,
    strict_filter=True,
    start=1700,
    end=2000,
)

In [None]:
# Print the shapes of the train and test splits (the `val` split is not shown).
print(train.shape,test.shape)

In [None]:
# Display the first rows of the training split as the cell output.
train.head()

In [None]:
def enclose_keyword(row:pd.Series,
                    enclose_token:str='$') -> str:
    """Mark the target keyword inside a quotation with a delimiter token.

    The keyword occurrence located at ``row.keyword_offset`` in
    ``row.full_text`` is surrounded by ``enclose_token`` (followed/preceded
    by a single space) so that the learner is pointed towards the word it
    has to focus on. This is part of the weak supervision when learning
    from context/quotations.

    Arguments:
        row (pd.Series): row of the quotations dataframe; must expose
            ``full_text`` (str), ``keyword_offset`` (number, character
            index where the keyword starts) and ``keyword`` (str)
        enclose_token (str): token used to mark the target expression;
            it serves as both begin and end marker

    Returns:
        the quotation with the target keyword marked by ``enclose_token``,
        e.g. ``"a $ machine $ works"``. If the offset lies outside the
        text, the text is returned unchanged.
    """
    text = row.full_text
    # Hoisted out of the loop: both positions are loop-invariant.
    start = int(row.keyword_offset)
    end = int(row.keyword_offset + len(row.keyword))

    pieces = []
    for i, char in enumerate(text):
        if i == start:
            pieces.append(enclose_token + ' ')
        elif i == end:
            pieces.append(' ' + enclose_token)
        pieces.append(char)

    # When the keyword is the last thing in the text, `end` equals
    # len(text) and is never visited by the loop above; emit the closing
    # token here so the keyword is always enclosed on both sides.
    if 0 <= start < end == len(text):
        pieces.append(' ' + enclose_token)

    return ''.join(pieces)

In [None]:
# Quotation examples: (quotation with marked keyword, keyword read from the
# row's `text` mapping, label) -- one triple per row of the train split.
train_sentences = [
    (enclose_keyword(quote), quote.text.get('keyword', ''), quote.label)
    for _, quote in train.iterrows()
]

# Add definitions to the train set: one "<keyword>: <definition>" example per
# unique (definition, label) pair.
train_sentences.extend([
    (entry.keyword + ": " + entry.definition, entry.keyword, entry.label)
    for _, entry in (
        train[["definition", "label", "keyword"]]
        .drop_duplicates(subset=["definition", "label"])
        .iterrows()
    )
])

# Turn every triple with a non-empty text and keyword into a labelled flair
# Sentence; the keyword is appended to the text, separated by a space.
train_sentences = [
    Sentence(text + ' ' + keyword).add_label('machine_or_not', label)
    for text, keyword, label in train_sentences
    if text and keyword
]


In [None]:
# Quotation examples for evaluation: (quotation with marked keyword, keyword
# read from the row's `text` mapping, label) -- one triple per test row.
test_sentences = [
    (enclose_keyword(quote), quote.text.get('keyword', ''), quote.label)
    for _, quote in test.iterrows()
]

# Turn every triple with a non-empty text and keyword into a labelled flair
# Sentence; the keyword is appended to the text, separated by a space.
test_sentences = [
    Sentence(text + ' ' + keyword).add_label('machine_or_not', label)
    for text, keyword, label in test_sentences
    if text and keyword
]


In [None]:
# Wrap the labelled sentence lists in flair datasets and combine them into a
# corpus. Only train and test splits are passed; no dev split is supplied here.
trainset = SentenceDataset(train_sentences)
testset = SentenceDataset(test_sentences)
corpus = Corpus(train=trainset, test=testset)

In [None]:
# Fine-tune the pretrained 'tars-base' model on the 'machine_or_not' task
# using the corpus built above.
from flair.trainers import ModelTrainer

# 1. load base TARS
tars = TARSClassifier.load('tars-base')

# 2. make the model aware of the desired set of labels from the new corpus
tars.add_and_switch_to_new_task("machine_or_not", label_dictionary=corpus.make_label_dictionary())

# 3. initialize the text classifier trainer with your corpus
trainer = ModelTrainer(tars, corpus)

# 4. train model
trainer.train(base_path='resources/taggers/machine_or_not', # path to store the model artifacts
              learning_rate=1e-4, # use very small learning rate
              mini_batch_size=1, # one sentence per mini-batch
              max_epochs=2, # stop after 2 epochs
              train_with_dev=True, # NOTE(review): per flair docs this also trains on the dev split -- confirm this is intended
              )