In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from utils.dataset_download import *
import pickle
import json
from pathlib import Path, PosixPath
import pandas as pd

In [5]:
# import API credentials
with open('../oed_experiments/oed_credentials.json') as f:
    credentials = json.load(f)

In [6]:
# define lemma
lemma_id = "machine_nn01"

In [41]:
dp = "../data"

In [7]:
save_path = Path(dp)
save_path.mkdir(exist_ok=True)

In [None]:
#query the API and get the json response
sense_json = query_oed(credentials,'word',lemma_id,flags='include_senses=true&include_quotations=true')

# convert the json in a dataframe
senses_df = convert_json_to_dataframe(sense_json)

In [None]:
# save the dataframe
# as pickle
senses_df.to_pickle(save_path / f"senses_{lemma_id}.pickle")
# as csv
senses_df.to_csv(save_path / f"senses_{lemma_id}.tsv",sep='\t')

In [None]:
# open pickle file to avoid calling the API again
with open(save_path / f"senses_{lemma_id}.pickle",'rb') as in_pickle:
    machine_senses_df = pickle.load(in_pickle)

In [None]:
# get all senses that are siblings and descendants
# of the semantic class of senses listed in previously obtained query 
responses = traverse_thesaurus(credentials,machine_senses_df)

In [None]:
# traverse tree or load responses 
# responses = traverse_thesaurus(credentials,machine_senses_df)
with open(f'{dp}/tree_traversal.pickle','rb') as in_pickle:
    responses = pickle.load(in_pickle)

In [None]:
# get all quoations for the senses in the responses variable
quotations = get_quotations_from_thesaurus(credentials,responses)

In [None]:
# merge and save all information stored in the seperate pickle files
df = merge_pickled(Path("./data/senses_machine_nn01.pickle"),
                   Path("./data/tree_traversal.pickle"),
                   Path("./data/tree_traversal_quotations.pickle"))

In [None]:
df.to_pickle(f"{dp}/{lemma_id}_all.pickle")

In [None]:
df.head()

## Fin.

In [8]:
!git branch

* [32m19-machine-tagger[m
  dev[m


In [9]:
start,end = 1750,1950
lemma_id = 'machine_nn01'

In [10]:
get_last_id = lambda nested_list :[l[-1] for l in nested_list]

In [11]:
def extend_from_lemma_query(auth,lemma_id,start=1750,end=1950):
    """Extends senses from a dataframe generate from accessing
    the API via the word endpoint. The script first retrieves all
    senses, then synonyms for these senses, then other senses that 
    match the semantic classes of the retrieved senses.
    
    This script also aims to record the "provenance" of words, 
    their relation to the initial query, which can help to 
    select of filter words later on.
    
    Arguments:
        lemma_id (str)
        start (int)
        end (int)
    Returns
        a pandas.DataFrame
    """
    # load seed query dataframe
    query_df = pd.read_pickle(f"./data/senses_{lemma_id}.pickle")
    
    # use the sense endpoint to ensure all information 
    # can be properly concatenated in one dataframe
    
    # retrieve all sense ids
    query_sense_ids = query_df.id.unique()
    
    # get all senses by sense id
    print(f"Get all sense for the lemma {lemma_id}")
    seeds = [(s,query_oed(auth,'sense',s,
                    flags=f"current_in='{start}-{end}'&limit=1000"))
                        for s in tqdm(query_sense_ids)]
    
    # convert to dataframe
    seeds_df = pd.DataFrame([seed['data'] for s_id,seed in seeds])
    
    # define provenance, these words are "seed"
    seeds_df['provenance'] = seeds_df.id
    seeds_df['provenance_type'] = 'seed'
    
    # get all synonyms for the seed senses
    print(f"Get all synonyms of the senses listed in {lemma_id}")
    synonyms = [(s,query_oed(auth,'sense',s,
                    level='synonyms',
                    flags=f"current_in='{start}-{end}'&limit=1000"))
                            for s in tqdm(query_sense_ids)]

    # transform list of synonyms to a dataframe
    synonyms_df = pd.DataFrame([s for s_id,syn in synonyms for s in syn['data']])
    
    # these items have provenancy type "synonym"
    synonyms_df['provenance'] = [s_id for s_id,syn in synonyms for s in syn['data']]
    synonyms_df['provenance_type'] = 'synonym'
    
    # seed + synonyms constitute the nucleas of our query
    # branch from there
    core_df = pd.concat([seeds_df,synonyms_df])
    core_df['semantic_class_last_id'] = core_df['semantic_class_ids'].apply(get_last_id)
    
    # retrieve all semantic class ids for the senses so far
    semantic_class_ids = set([s for l in core_df.semantic_class_last_id.to_list() for s in l])

    # get all the branches for the retrieve semantic class ids
    print("Get all branches for seed senses and synonyms")
    branches = [(idx,query_oed(auth,'semanticclass', idx, 
                        level='branchsenses',
                        flags=f"current_in='{start}-{end}'&limit=1000"))
                            for idx in tqdm(semantic_class_ids)]
    
    # convert API response to dataframe
    branches_df = pd.DataFrame([s for idx,branch in branches for s in branch['data']])
    
    # provenance_type is branch with semantic class id 
    # that was use for retrieving the sense is the provenance
    branches_df['provenance'] = [idx for idx,branch in branches for s in branch['data']]
    branches_df['provenance_type'] = 'branch'
    
    branches_df['semantic_class_last_id'] = branches_df.semantic_class_ids.apply(get_last_id)
    
    # remove senses that already appear in the core_df
    branches_df_red = branches_df.loc[~branches_df.id.isin(core_df.id)]
    
    # concatenate core and branch senses
    extended_df = pd.concat([core_df,branches_df_red])
    
    # refine the provenance type
    # if the last semantic class id is not equal to provenance
    # this row is a child or descendant
    check_membership = lambda row : row.provenance in row.semantic_class_last_id
    extended_df.loc[(~extended_df.apply(check_membership,axis=1)) & (extended_df.provenance_type=='branch'),
                ["provenance_type"]]  = "branch_descendant"
    
    # save information
    extended_df.to_pickle(f'{dp}/senses_{lemma_id}_extended.pickle')
    return extended_df

In [None]:
extended_df = extend_from_lemma_query(credentials,lemma_id,start,end)

In [None]:
extended_df.head(3)

In [35]:
set(merged[(merged.provenance_type=='synonym') & (merged.lemma!='machine')].lemma)

{"Aaron's rod",
 'Durex',
 'French letter',
 'Frenchy',
 "God's image",
 'Jacuzzi',
 'John',
 'John Henry',
 'John Thomas',
 'Johnny',
 'Johnson',
 'Lizzie',
 'M.F.V.',
 'Mudian',
 'Percy',
 'Peter',
 'Roger',
 'Trojan',
 'Turing machine',
 'address',
 'aerobat',
 'aerocar',
 'aerostat',
 'affair',
 'air machine',
 'air vessel',
 'aircraft',
 'anatomy',
 'appliance',
 'ark',
 'armour',
 'arrangement',
 'arrow',
 'art',
 'artifice',
 'automaton',
 'avion',
 'baby-maker',
 'barge',
 'bark',
 'bastiment',
 'bathing-machine',
 'beam',
 'being',
 'belly',
 'bike',
 'biocomputer',
 'blood bulk',
 'board',
 'bodiȝlich',
 'body',
 'bone house',
 'bones',
 'bottom',
 'bouk',
 'bubble',
 'buggy',
 'bulk',
 'bus',
 'buzz-box',
 'buzz-wagon',
 'cabinet',
 'cade',
 'cadre',
 'car',
 'carcass',
 'carriage',
 'carrion',
 'casa',
 'case',
 'cast',
 'cautel',
 'chariot',
 'chassis',
 'chevisance',
 'chode',
 'class',
 'clay',
 'clipper',
 'clod',
 'cock',
 'cod',
 'compass',
 'compassing',
 'compilemen

In [None]:
def get_quotations(lemma_id,sense_df=''):
    """Obtain and store all quotations from a dataframe constructed
    from information retrieved via de sense endpoint.
    
    Arguments:
        sense_df (pandas.DataFrame,str)
        lemma_id (str)
        
    Returns:
        pandas.DataFrame
    """
    if isinstance(sense_df,str):
        quotations_df = pd.read_pickle(f'./data/quotations_{lemma_id}.pickle')
    else:
        quotations = [query_oed(credentials,'sense',sense_idx,level='quotations')
                        for sense_idx in tqdm(set(sense_df.id))]
            
        quotations_df = pd.concat([pd.DataFrame(q['data']) for q in quotations])
        quotations_df.to_pickle(f'{dp}/quotations_{lemma_id}.pickle')
    return quotations_df
    

In [None]:
quotations_df = get_quotations(lemma_id)
quotations_df.shape

In [None]:
merged_df = extended_df.merge(quotations_df[['id','sense_id',"text","year","source"]],left_on='id',right_on='sense_id',suffixes=['',"_quotation"])


In [None]:
merged_df.columns

## Experiments with seed and synonyms

In [None]:
surface_forms = set(extended_df[extended_df.provenance_type.isin(['seed','synonym'])].lemma)

In [None]:
def get_lemma_content_from_surface_form(surface_forms):
    """Get all quotations and senses from a list of surface forms
    """
    lemmatized = [query_oed(credentials,'lemmatize','',flags=f"form={sf}")
                             for sf in tqdm(surface_forms)]
    
    lemmas_content = [query_oed(credentials, 'word',
                       l['word']['id'], 
                       flags="include_senses=true&include_quotations=true") 
                             for lemma in tqdm(lemmatized)
                                 for l in lemma['data']]
    
    lemmas_content_df = pd.DataFrame(sense for lemma in lemmas_content for sense in lemma['data']['senses'])
    
    quotations = [quotation
                        for i,row in lemmas_content_df.iterrows()
                             for quotation in row.quotations]
    
    related_quotations_df = pd.DataFrame(quotations)
    
    related_merged_df = lemmas_content_df.merge(related_quotations_df[['id','sense_id',"text","year","source"]],left_on='id',right_on='sense_id',suffixes=['',"_quotation"])
    
    related_merged_df['provenance'] = None
    related_merged_df['provenance_type'] = 'related'
    #related_merged_df.loc[related_merged_df.id.isin(merged_df.id),['provenance_type']] = 'in_core'

    return related_merged_df

## Start here if data is downloaded and put into a dataframe

In [12]:
related_merged_df = pd.read_pickle(f'{dp}/related_merged_{lemma_id}.pickle')
merged = pd.read_pickle(f'{dp}/merged_{lemma_id}.pickle')

In [13]:
#related_merged_df.to_pickle(f'./data/related_merged_{lemma_id}.pickle')

In [14]:
core_df = merged[merged.provenance_type.isin(['seed','synonym'])]
related_df = related_merged_df.loc[~related_merged_df.id.isin(core_df.id)]

print(core_df.shape,related_df.shape)

(2975, 24) (86977, 23)


In [15]:
include_senses = {'machine_nn01-38474233','machine_nn01-38474548','machine_nn01-38475164','machine_nn01-38475286',
          "machine_nn01-38474607","machine_nn01-38475923","machine_nn01-38474877","machine_nn01-38475046",
          "machine_nn01-38475099"
         }

def select_senses(df,list_senses):
    sc_ids = set([s for l in df.loc[df.id.isin(list_senses)].semantic_class_last_id.to_list() for s in l])
    
    overlap = lambda x,l: bool(set(x).intersection(l))
    
    df = df.loc[(df.id.isin(list_senses))  | (df.provenance.isin(list_senses)) | (df.provenance.isin(sc_ids))]
    
    return df

core_selected_df = select_senses(core_df,include_senses)

In [16]:
core_not_selected_df =  core_df.loc[~core_df.id.isin(core_selected_df.id)]
print(core_df.shape,core_selected_df.shape,core_not_selected_df.shape)

(2975, 24) (1113, 24) (1833, 24)


In [17]:
related_sample_df = related_df.sample(frac=.1)

# subsampling here!! remove later
related_sample_df.shape

(8698, 23)

In [18]:
related_sample_df['label'] = 'not_machine'
core_not_selected_df['label'] = 'not_machine'
core_selected_df['label'] = 'machine'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
import unidecode
import re
    
def preprocess_sent(sent: str) -> str: # check if setting sent_id to None effects anything?
    """preprocessing function for formatting raw text before training word2vec
    # Credits: Kasra Hosseini and Kaspar Beelen
    Arguments:
        sent (string): input sentence
        sent_id (string): idx of the inpute sentence
        tokenized (boolean): if True then return the string as a list of tokens
    Returns:
    """
    # --- replace .- and . in the middle of the word
    sent = re.sub(r'(?<=\w)(\.-)(?=\w)', '-', sent)
    sent = re.sub(r'(?<=\w)(\.)(?=\w)', '', sent)
    # --- remove accent
    sent = unidecode.unidecode(sent)
    # --- remove 2 or more .
    sent = re.sub(r'[.]{2,}', '.', sent)
    # --- add a space before and after a list of punctuations
    sent = re.sub(r"([.,!?:;\"\'])", r" \1 ", sent)
    # --- remove everything except:
    sent = re.sub(r"([^a-zA-Z\-.:;,!?\d+]+)", r" ", sent)
    # --- replace numbers with <NUM>
    sent = re.sub(r'\b\d+\b', '<NUM>', sent)
    sent = re.sub(r'--', '', sent)
    # --- normalize white spaces
    sent = re.sub(r'\s+', ' ', sent)
    # --- lowercase
    sent = sent.lower()

    
    return sent    

def process_for_classification(text_col):
    sentence = preprocess_sent(text_col["full_text"])
    
    return sentence

## Sentence classification

In [20]:
import numpy as np
combined_all = pd.concat([related_sample_df[['text','label']],
                          core_not_selected_df[['text','label']],
                          core_selected_df[['text','label']],
                         ])
combined_all['processed_text'] = combined_all.text.apply(process_for_classification)

In [25]:
combined_all.drop(combined_all[combined_all.processed_text==''].index,inplace=True)


In [26]:
combined_all['partition'] = [np.random.choice(['train','dev','test'], p=[0.6, 0.2, 0.2]) for _ in range(combined_all.shape[0])]

In [27]:
for split in ['train','dev','test']:
    combined_all.loc[combined_all.partition==split][['processed_text','label']].to_csv(f'../data/{split}.csv',sep="\t")


In [28]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# this is the folder in which train, test and dev files reside
path = Path(dp)

# column format indicating which columns hold the text and label(s)
columns = {1: 'text', 2: 'label'}

# load corpus containing training, test and dev data and if CSV has a header, you can skip it
corpus: Corpus = CSVClassificationCorpus(path,
                                         columns,
                                         skip_header=True,
                                         delimiter='\t',    # tab-separated files
                                            ) 
    

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2020-10-06 15:14:32,672 Reading data from ../data
2020-10-06 15:14:32,673 Train: ../data/train.csv
2020-10-06 15:14:32,673 Dev: ../data/dev.csv
2020-10-06 15:14:32,674 Test: ../data/test.csv
2020-10-06 15:14:32,702 Computing label dictionary. Progress:


100%|██████████| 9315/9315 [00:02<00:00, 4176.50it/s]

2020-10-06 15:14:35,373 [b'not_machine', b'machine']





In [29]:
from torch.optim.adam import Adam
from flair.data import Corpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 3. initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)

# 4. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 5. initialize the text classifier trainer with Adam optimizer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

# 6. start the training
trainer.train(f'{dp}/resources/classifier/machine',
              learning_rate=1e-1, # use very small learning rate
              mini_batch_size=16,
              mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5, # terminate after 5 epochs
              )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…


2020-10-06 15:14:46,840 ----------------------------------------------------------------------------------------------------
2020-10-06 15:14:46,841 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(i

2020-10-06 15:17:41,268 epoch 1 - iter 344/437 - loss 0.27381029 - samples/sec: 32.24 - lr: 0.000030
2020-10-06 15:18:02,719 epoch 1 - iter 387/437 - loss 0.26759013 - samples/sec: 32.29 - lr: 0.000030
2020-10-06 15:18:24,260 epoch 1 - iter 430/437 - loss 0.26022790 - samples/sec: 32.13 - lr: 0.000030
2020-10-06 15:18:27,814 ----------------------------------------------------------------------------------------------------
2020-10-06 15:18:27,815 EPOCH 1 done: loss 0.2574 - lr 0.0000300
2020-10-06 15:18:50,179 DEV : loss 0.19068484008312225 - score 0.9325
2020-10-06 15:18:51,315 BAD EPOCHS (no improvement): 0
saving best model
2020-10-06 15:18:51,674 ----------------------------------------------------------------------------------------------------
2020-10-06 15:19:14,627 epoch 2 - iter 43/437 - loss 0.08855720 - samples/sec: 30.54 - lr: 0.000030
2020-10-06 15:19:37,148 epoch 2 - iter 86/437 - loss 0.11388903 - samples/sec: 30.78 - lr: 0.000030
2020-10-06 15:19:59,808 epoch 2 - iter 

{'test_score': 0.935,
 'dev_score_history': [0.9325, 0.9351, 0.9364, 0.9373, 0.9325],
 'train_loss_history': [0.2574387380610342,
  0.12017576302693692,
  0.05461023818249697,
  0.013235492687203518,
  0.008282100172282902],
 'dev_loss_history': [0.19068484008312225,
  0.2303922325372696,
  0.3466345965862274,
  0.5440820455551147,
  0.5825539231300354]}

## Sequence tagging

In [30]:
def process_for_sequence_tagging(row,mask_target=False):
    
    # try spaCy
    
    punct = [',','.',' ','?','!',";",":","(",")"]
    sentence = row.text["full_text"]
    
    sentence = np.array([i for i in sentence])
    offset = row.text["keyword_offset"]
    target = row.text["keyword"]
    
    if not target:
        return None
    
    labels = np.array([0]*len(sentence))
    end = offset + len(target)
    labels[offset:end] = 1
    
    for ch in punct:
        labels[np.where(sentence==ch)] = 2
    
    rows = []
    word,labs = [],[]
    
    for i in range(len(sentence)):
        if labels[i] < 2:
            word.append(sentence[i])
            labs.append(labels[i])
        
        if labels[i] == 2 and word:
            rows.append((''.join(word),{0:"not_machine",1:row.label}[list(set(labs))[0]]))
            word,labs = [],[]
    return rows

In [32]:
combined_all["tagged"] = combined_all.apply(process_for_sequence_tagging,axis=1)

In [37]:
combined_all.iloc[1001].tagged

[('The', 'not_machine'),
 ('mesuring', 'not_machine'),
 ('of', 'not_machine'),
 ('salte', 'not_machine'),
 ('and', 'not_machine'),
 ('corne', 'not_machine'),
 ('that', 'not_machine'),
 ('sholde', 'not_machine'),
 ('long', 'not_machine'),
 ('to', 'not_machine'),
 ('the', 'not_machine'),
 ('shifte', 'not_machine'),
 ('of', 'not_machine'),
 ('the', 'not_machine'),
 ('communes', 'not_machine')]

In [38]:
combined_all['partition'] = [np.random.choice(['train','dev','test'], p=[0.6, 0.2, 0.2]) for _ in range(combined_all.shape[0])]

In [39]:
def df2string(df):
    return "\n\n".join(['\n'.join(['\t'.join(e) for e in l]) for l in df.tagged.to_list() if l])


In [42]:
for split in ["train","test","dev"]:
    string = df2string(combined_all[combined_all['partition']==split])
    with open(f'{dp}/{split}.csv','w') as out_doc:
        out_doc.write(string)

In [43]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns
columns = {0: 'text', 1: 'label'}


# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(path, columns,
                              train_file='train.csv',
                              test_file='test.csv',
                              dev_file='dev.csv')
    
# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type='label')
print(tag_dictionary)


2020-10-06 15:44:47,412 Reading data from ../data
2020-10-06 15:44:47,413 Train: ../data/train.csv
2020-10-06 15:44:47,413 Dev: ../data/dev.csv
2020-10-06 15:44:47,413 Test: ../data/test.csv
Dictionary with 6 tags: <unk>, O, not_machine, machine, <START>, <STOP>


In [45]:
from flair.data import Corpus
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings,TransformerWordEmbeddings



# 4. initialize embeddings
embedding_types = [

    WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

#embeddings = TransformerWordEmbeddings('bert-base-cased',fine_tune=True, allow_long_sentences=True)
    
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type='label',
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train(f'{dp}/taggers/machine',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=20)

2020-10-06 15:46:05,537 ----------------------------------------------------------------------------------------------------
2020-10-06 15:46:05,538 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, b

2020-10-06 15:52:57,220 BAD EPOCHS (no improvement): 0
saving best model
2020-10-06 15:53:04,250 ----------------------------------------------------------------------------------------------------
2020-10-06 15:53:09,093 epoch 5 - iter 20/209 - loss 0.36932054 - samples/sec: 132.23 - lr: 0.100000
2020-10-06 15:53:13,664 epoch 5 - iter 40/209 - loss 0.38986864 - samples/sec: 140.08 - lr: 0.100000
2020-10-06 15:53:17,936 epoch 5 - iter 60/209 - loss 0.40185719 - samples/sec: 150.01 - lr: 0.100000
2020-10-06 15:53:22,189 epoch 5 - iter 80/209 - loss 0.39843519 - samples/sec: 150.55 - lr: 0.100000
2020-10-06 15:53:26,594 epoch 5 - iter 100/209 - loss 0.39489666 - samples/sec: 145.37 - lr: 0.100000
2020-10-06 15:53:31,003 epoch 5 - iter 120/209 - loss 0.39567720 - samples/sec: 145.22 - lr: 0.100000
2020-10-06 15:53:35,442 epoch 5 - iter 140/209 - loss 0.37708616 - samples/sec: 144.24 - lr: 0.100000
2020-10-06 15:53:39,988 epoch 5 - iter 160/209 - loss 0.38599806 - samples/sec: 140.82 - lr:

2020-10-06 15:58:22,371 epoch 10 - iter 160/209 - loss 0.36188491 - samples/sec: 155.90 - lr: 0.100000
2020-10-06 15:58:26,693 epoch 10 - iter 180/209 - loss 0.35614456 - samples/sec: 148.16 - lr: 0.100000
2020-10-06 15:58:31,218 epoch 10 - iter 200/209 - loss 0.35553813 - samples/sec: 141.51 - lr: 0.100000
2020-10-06 15:58:33,158 ----------------------------------------------------------------------------------------------------
2020-10-06 15:58:33,158 EPOCH 10 done: loss 0.3541 - lr 0.1000000
2020-10-06 15:58:41,649 DEV : loss 0.34489959478378296 - score 0.9934
2020-10-06 15:58:41,830 BAD EPOCHS (no improvement): 3
2020-10-06 15:58:41,831 ----------------------------------------------------------------------------------------------------
2020-10-06 15:58:46,595 epoch 11 - iter 20/209 - loss 0.27771169 - samples/sec: 134.41 - lr: 0.100000
2020-10-06 15:58:51,021 epoch 11 - iter 40/209 - loss 0.27482983 - samples/sec: 144.65 - lr: 0.100000
2020-10-06 15:58:55,582 epoch 11 - iter 60/209

2020-10-06 16:03:34,273 ----------------------------------------------------------------------------------------------------
2020-10-06 16:03:34,274 Exiting from training early.
2020-10-06 16:03:34,274 Saving model ...
2020-10-06 16:03:37,136 Done.
2020-10-06 16:03:37,137 ----------------------------------------------------------------------------------------------------
2020-10-06 16:03:37,138 Testing using best model ...
2020-10-06 16:03:37,139 loading file ../data/taggers/machine/best-model.pt
2020-10-06 16:04:18,429 	0.994
2020-10-06 16:04:18,431 
Results:
- F-score (micro) 0.994
- F-score (macro) 0.6462
- Accuracy 0.994

By class:
              precision    recall  f1-score   support

 not_machine     0.9944    0.9996    0.9970     30579
     machine     0.7500    0.1840    0.2955       212

    accuracy                         0.9940     30791
   macro avg     0.8722    0.5918    0.6462     30791
weighted avg     0.9927    0.9940    0.9921     30791

2020-10-06 16:04:18,431 -----

{'test_score': 0.994,
 'dev_score_history': [0.9929,
  0.9916,
  0.9929,
  0.993,
  0.9934,
  0.9926,
  0.9934,
  0.9931,
  0.9931,
  0.9934,
  0.9937,
  0.9936,
  0.9939,
  0.993,
  0.993],
 'train_loss_history': [0.8465260845479783,
  0.44291141336899625,
  0.4226515189027102,
  0.4084052837779077,
  0.39420261636875464,
  0.3861422339124543,
  0.3694431799236667,
  0.3669909275889967,
  0.35908130367406815,
  0.35407543681455,
  0.3343518824811187,
  0.334928142183135,
  0.3195464206797084,
  0.3156196296785437,
  0.3117124591575285],
 'dev_loss_history': [0.4454134702682495,
  0.5290618538856506,
  0.3890500068664551,
  0.3738522529602051,
  0.3528309464454651,
  0.3910372853279114,
  0.34435370564460754,
  0.36366531252861023,
  0.37558695673942566,
  0.34489959478378296,
  0.3167349398136139,
  0.3140784204006195,
  0.31143057346343994,
  0.33148807287216187,
  0.3337770998477936]}