In [4]:
%load_ext autoreload

In [5]:
%autoreload 2

In [6]:
from utils.dataset_download import *
import pickle
import json
from pathlib import Path, PosixPath
import pandas as pd

In [7]:
# import API credentials
with open('oed_experiments/oed_credentials.json') as f:
    credentials = json.load(f)

In [8]:
# define lemma
lemma_id = "machine_nn01"

In [None]:
save_path = Path("./data")
save_path.mkdir(exist_ok=True)

In [None]:
#query the API and get the json response
sense_json = query_oed(credentials,'word',lemma_id,flags='include_senses=true&include_quotations=true')

# convert the json in a dataframe
senses_df = convert_json_to_dataframe(sense_json)

In [None]:
# save the dataframe
# as pickle
senses_df.to_pickle(save_path / f"senses_{lemma_id}.pickle")
# as csv
senses_df.to_csv(save_path / f"senses_{lemma_id}.tsv",sep='\t')

In [None]:
# open pickle file to avoid calling the API again
with open(save_path / f"senses_{lemma_id}.pickle",'rb') as in_pickle:
    machine_senses_df = pickle.load(in_pickle)

In [None]:
# get all senses that are siblings and descendants
# of the semantic class of senses listed in previously obtained query 
responses = traverse_thesaurus(credentials,machine_senses_df)

In [None]:
# traverse tree or load responses 
# responses = traverse_thesaurus(credentials,machine_senses_df)
with open('./data/tree_traversal.pickle','rb') as in_pickle:
    responses = pickle.load(in_pickle)

In [None]:
# get all quoations for the senses in the responses variable
quotations = get_quotations_from_thesaurus(credentials,responses)

In [None]:
# merge and save all information stored in the seperate pickle files
df = merge_pickled(Path("./data/senses_machine_nn01.pickle"),
                   Path("./data/tree_traversal.pickle"),
                   Path("./data/tree_traversal_quotations.pickle"))

In [None]:
df.to_pickle(f"./data/{lemma_id}_all.pickle")

In [None]:
df.head()

## Fin.

In [38]:
!git branch

  1-dataframe[m
* [32m19-machine-tagger[m
  3-group-senses[m
  4-semantic-provenance[m
  dev[m
  master[m
  oed-experiments[m


In [None]:
start,end = 1750,1950
lemma_id = 'machine_nn01'

In [None]:
get_last_id = lambda nested_list :[l[-1] for l in nested_list]

In [None]:
def extend_from_lemma_query(auth,lemma_id,start=1750,end=1950):
    """Extends senses from a dataframe generate from accessing
    the API via the word endpoint. The script first retrieves all
    senses, then synonyms for these senses, then other senses that 
    match the semantic classes of the retrieved senses.
    
    This script also aims to record the "provenance" of words, 
    their relation to the initial query, which can help to 
    select of filter words later on.
    
    Arguments:
        lemma_id (str)
        start (int)
        end (int)
    Returns
        a pandas.DataFrame
    """
    # load seed query dataframe
    query_df = pd.read_pickle(f"./data/senses_{lemma_id}.pickle")
    
    # use the sense endpoint to ensure all information 
    # can be properly concatenated in one dataframe
    
    # retrieve all sense ids
    query_sense_ids = query_df.id.unique()
    
    # get all senses by sense id
    print(f"Get all sense for the lemma {lemma_id}")
    seeds = [(s,query_oed(auth,'sense',s,
                    flags=f"current_in='{start}-{end}'&limit=1000"))
                        for s in tqdm(query_sense_ids)]
    
    # convert to dataframe
    seeds_df = pd.DataFrame([seed['data'] for s_id,seed in seeds])
    
    # define provenance, these words are "seed"
    seeds_df['provenance'] = seeds_df.id
    seeds_df['provenance_type'] = 'seed'
    
    # get all synonyms for the seed senses
    print(f"Get all synonyms of the senses listed in {lemma_id}")
    synonyms = [(s,query_oed(auth,'sense',s,
                    level='synonyms',
                    flags=f"current_in='{start}-{end}'&limit=1000"))
                            for s in tqdm(query_sense_ids)]

    # transform list of synonyms to a dataframe
    synonyms_df = pd.DataFrame([s for s_id,syn in synonyms for s in syn['data']])
    
    # these items have provenancy type "synonym"
    synonyms_df['provenance'] = [s_id for s_id,syn in synonyms for s in syn['data']]
    synonyms_df['provenance_type'] = 'synonym'
    
    # seed + synonyms constitute the nucleas of our query
    # branch from there
    core_df = pd.concat([seeds_df,synonyms_df])
    core_df['semantic_class_last_id'] = core_df['semantic_class_ids'].apply(get_last_id)
    
    # retrieve all semantic class ids for the senses so far
    semantic_class_ids = set([s for l in core_df.semantic_class_last_id.to_list() for s in l])

    # get all the branches for the retrieve semantic class ids
    print("Get all branches for seed senses and synonyms")
    branches = [(idx,query_oed(auth,'semanticclass', idx, 
                        level='branchsenses',
                        flags=f"current_in='{start}-{end}'&limit=1000"))
                            for idx in tqdm(semantic_class_ids)]
    
    # convert API response to dataframe
    branches_df = pd.DataFrame([s for idx,branch in branches for s in branch['data']])
    
    # provenance_type is branch with semantic class id 
    # that was use for retrieving the sense is the provenance
    branches_df['provenance'] = [idx for idx,branch in branches for s in branch['data']]
    branches_df['provenance_type'] = 'branch'
    
    branches_df['semantic_class_last_id'] = branches_df.semantic_class_ids.apply(get_last_id)
    
    # remove senses that already appear in the core_df
    branches_df_red = branches_df.loc[~branches_df.id.isin(core_df.id)]
    
    # concatenate core and branch senses
    extended_df = pd.concat([core_df,branches_df_red])
    
    # refine the provenance type
    # if the last semantic class id is not equal to provenance
    # this row is a child or descendant
    check_membership = lambda row : row.provenance in row.semantic_class_last_id
    extended_df.loc[(~extended_df.apply(check_membership,axis=1)) & (extended_df.provenance_type=='branch'),
                ["provenance_type"]]  = "branch_descendant"
    
    # save information
    extended_df.to_pickle(f'./data/senses_{lemma_id}_extended.pickle')
    return extended_df

In [None]:
extended_df = extend_from_lemma_query(credentials,lemma_id,start,end)

In [None]:
extended_df.head(3)

In [35]:
set(merged[(merged.provenance_type=='synonym') & (merged.lemma!='machine')].lemma)

{"Aaron's rod",
 'Durex',
 'French letter',
 'Frenchy',
 "God's image",
 'Jacuzzi',
 'John',
 'John Henry',
 'John Thomas',
 'Johnny',
 'Johnson',
 'Lizzie',
 'M.F.V.',
 'Mudian',
 'Percy',
 'Peter',
 'Roger',
 'Trojan',
 'Turing machine',
 'address',
 'aerobat',
 'aerocar',
 'aerostat',
 'affair',
 'air machine',
 'air vessel',
 'aircraft',
 'anatomy',
 'appliance',
 'ark',
 'armour',
 'arrangement',
 'arrow',
 'art',
 'artifice',
 'automaton',
 'avion',
 'baby-maker',
 'barge',
 'bark',
 'bastiment',
 'bathing-machine',
 'beam',
 'being',
 'belly',
 'bike',
 'biocomputer',
 'blood bulk',
 'board',
 'bodiȝlich',
 'body',
 'bone house',
 'bones',
 'bottom',
 'bouk',
 'bubble',
 'buggy',
 'bulk',
 'bus',
 'buzz-box',
 'buzz-wagon',
 'cabinet',
 'cade',
 'cadre',
 'car',
 'carcass',
 'carriage',
 'carrion',
 'casa',
 'case',
 'cast',
 'cautel',
 'chariot',
 'chassis',
 'chevisance',
 'chode',
 'class',
 'clay',
 'clipper',
 'clod',
 'cock',
 'cod',
 'compass',
 'compassing',
 'compilemen

In [None]:
def get_quotations(lemma_id,sense_df=''):
    """Obtain and store all quotations from a dataframe constructed
    from information retrieved via de sense endpoint.
    
    Arguments:
        sense_df (pandas.DataFrame,str)
        lemma_id (str)
        
    Returns:
        pandas.DataFrame
    """
    if isinstance(sense_df,str):
        quotations_df = pd.read_pickle(f'./data/quotations_{lemma_id}.pickle')
    else:
        quotations = [query_oed(credentials,'sense',sense_idx,level='quotations')
                        for sense_idx in tqdm(set(sense_df.id))]
            
        quotations_df = pd.concat([pd.DataFrame(q['data']) for q in quotations])
        quotations_df.to_pickle(f'./data/quotations_{lemma_id}.pickle')
    return quotations_df
    

In [None]:
quotations_df = get_quotations(lemma_id)
quotations_df.shape

In [None]:
merged_df = extended_df.merge(quotations_df[['id','sense_id',"text","year","source"]],left_on='id',right_on='sense_id',suffixes=['',"_quotation"])


In [None]:
merged_df.columns

## Tagging experiment with seed and synonyms

In [None]:
surface_forms = set(extended_df[extended_df.provenance_type.isin(['seed','synonym'])].lemma)

In [None]:
def get_lemma_content_from_surface_form(surface_forms):
    """Get all quotations and senses from a list of surface forms
    """
    lemmatized = [query_oed(credentials,'lemmatize','',flags=f"form={sf}")
                             for sf in tqdm(surface_forms)]
    
    lemmas_content = [query_oed(credentials, 'word',
                       l['word']['id'], 
                       flags="include_senses=true&include_quotations=true") 
                             for lemma in tqdm(lemmatized)
                                 for l in lemma['data']]
    
    lemmas_content_df = pd.DataFrame(sense for lemma in lemmas_content for sense in lemma['data']['senses'])
    
    quotations = [quotation
                        for i,row in lemmas_content_df.iterrows()
                             for quotation in row.quotations]
    
    related_quotations_df = pd.DataFrame(quotations)
    
    related_merged_df = lemmas_content_df.merge(related_quotations_df[['id','sense_id',"text","year","source"]],left_on='id',right_on='sense_id',suffixes=['',"_quotation"])
    
    related_merged_df['provenance'] = None
    related_merged_df['provenance_type'] = 'related'
    #related_merged_df.loc[related_merged_df.id.isin(merged_df.id),['provenance_type']] = 'in_core'

    return related_merged_df

## Start here if data is downloaded and put into a dataframe

In [13]:
related_merged_df = pd.read_pickle(f'./data/related_merged_{lemma_id}.pickle')
merged = pd.read_pickle(f'./data/merged_{lemma_id}.pickle')

In [14]:
#related_merged_df.to_pickle(f'./data/related_merged_{lemma_id}.pickle')

In [15]:
core_df = merged[merged.provenance_type.isin(['seed','synonym'])]
related_df = related_merged_df.loc[~related_merged_df.id.isin(core_df.id)]

print(core_df.shape,related_df.shape)

(2975, 24) (86977, 23)


In [36]:
include_senses = {'machine_nn01-38474233','machine_nn01-38474548','machine_nn01-38475164','machine_nn01-38475286',
          "machine_nn01-38474607","machine_nn01-38475923","machine_nn01-38474877","machine_nn01-38475046",
          "machine_nn01-38475099"
         }

def select_senses(df,list_senses):
    sc_ids = set([s for l in df.loc[df.id.isin(list_senses)].semantic_class_last_id.to_list() for s in l])
    
    overlap = lambda x,l: bool(set(x).intersection(l))
    
    df = df.loc[(df.id.isin(list_senses))  | (df.provenance.isin(list_senses)) | (df.provenance.isin(sc_ids))]
    
    return df

core_selected_df = select_senses(core_df,include_senses)

In [37]:
set(core_selected_df.lemma)

{'Lizzie',
 'M.F.V.',
 'Mudian',
 'address',
 'aerobat',
 'aerocar',
 'aerostat',
 'affair',
 'air machine',
 'air vessel',
 'aircraft',
 'appliance',
 'ark',
 'art',
 'avion',
 'barge',
 'bark',
 'bastiment',
 'beam',
 'board',
 'bottom',
 'bubble',
 'buggy',
 'bus',
 'buzz-box',
 'buzz-wagon',
 'car',
 'carriage',
 'cautel',
 'chariot',
 'class',
 'clipper',
 'compassing',
 'contraption',
 'contrivance',
 'convenience',
 'conveniency',
 'conveyance',
 'craft',
 'cycle',
 'device',
 'dial',
 'diligence',
 'dilly',
 'dog',
 'drag',
 'embarkation',
 'energizer',
 'engine',
 'fabric',
 'fancy',
 'fanglement',
 'fashion',
 'flood-bickerer',
 'fly',
 'flyer',
 'flying post',
 'frame',
 'generator',
 'gimcrack',
 'gin',
 'graith',
 'gun',
 'high-flyer',
 'hooker',
 'horse',
 'implements',
 'instrument',
 'invention',
 'jam-jar',
 'jet',
 'keel',
 'loom',
 'low rider',
 'machina',
 'machinament',
 'machination',
 'machine',
 'machine power',
 'mail coach',
 'mail packet',
 'mail stage',
 'ma

In [17]:
core_not_selected_df =  core_df.loc[~core_df.id.isin(core_selected_df.id)]
print(core_df.shape,core_selected_df.shape,core_not_selected_df.shape)

(2975, 24) (1113, 24) (1833, 24)


In [18]:
related_sample_df = related_df.sample(frac=.1)

# subsampling here!! remove later
related_sample_df.shape

(8698, 23)

In [19]:
related_sample_df['label'] = 'not_machine'
core_not_selected_df['label'] = 'not_machine'
core_selected_df['label'] = 'machine'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
import unidecode
import re
    
def preprocess_sent(sent: str) -> str: # check if setting sent_id to None effects anything?
    """preprocessing function for formatting raw text before training word2vec
    # Credits: Kasra Hosseini and Kaspar Beelen
    Arguments:
        sent (string): input sentence
        sent_id (string): idx of the inpute sentence
        tokenized (boolean): if True then return the string as a list of tokens
    Returns:
    """
    # --- replace .- and . in the middle of the word
    sent = re.sub(r'(?<=\w)(\.-)(?=\w)', '-', sent)
    sent = re.sub(r'(?<=\w)(\.)(?=\w)', '', sent)
    # --- remove accent
    sent = unidecode.unidecode(sent)
    # --- remove 2 or more .
    sent = re.sub(r'[.]{2,}', '.', sent)
    # --- add a space before and after a list of punctuations
    sent = re.sub(r"([.,!?:;\"\'])", r" \1 ", sent)
    # --- remove everything except:
    sent = re.sub(r"([^a-zA-Z\-.:;,!?\d+]+)", r" ", sent)
    # --- replace numbers with <NUM>
    sent = re.sub(r'\b\d+\b', '<NUM>', sent)
    sent = re.sub(r'--', '', sent)
    # --- normalize white spaces
    sent = re.sub(r'\s+', ' ', sent)
    # --- lowercase
    sent = sent.lower()

    
    return sent    

def process_for_classification(text_col):
    sentence = preprocess_sent(text_col["full_text"])
    
    return sentence

## Sentence classification

In [21]:
import numpy as np
combined_all = pd.concat([related_sample_df[['text','label']],
                          core_not_selected_df[['text','label']],
                          core_selected_df[['text','label']],
                         ])
combined_all['processed_text'] = combined_all.text.apply(process_for_classification)

In [22]:
combined_all.drop(combined_all[combined_all.processed_text==''].index,inplace=True)


In [23]:
combined_all.shape

(11598, 3)

In [27]:
combined_all[combined_all.label=='machine'].iloc[0].text

{'keyword': 'machins',
 'full_text': 'For all that, their lucke was at that time, to loose man, moyle, and machins belonging to warre.',
 'keyword_offset': 69}

In [None]:
combined_all['partition'] = [np.random.choice(['train','dev','test'], p=[0.6, 0.2, 0.2]) for _ in range(combined_all.shape[0])]

In [None]:
for split in ['train','dev','test']:
    combined_all.loc[combined_all.partition==split][['processed_text','label']].to_csv(f'./data/{split}.csv',sep="\t")


In [None]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus

# this is the folder in which train, test and dev files reside
path = Path("./data")

# column format indicating which columns hold the text and label(s)
columns = {1: 'text', 2: 'label'}

# load corpus containing training, test and dev data and if CSV has a header, you can skip it
corpus: Corpus = CSVClassificationCorpus(path,
                                         columns,
                                         skip_header=True,
                                         delimiter='\t',    # tab-separated files
                                            ) 
    

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()


In [None]:
from torch.optim.adam import Adam
from flair.data import Corpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# 3. initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True)

# 4. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 5. initialize the text classifier trainer with Adam optimizer
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

# 6. start the training
trainer.train('resources/classifier/machine',
              learning_rate=3e-5, # use very small learning rate
              mini_batch_size=16,
              mini_batch_chunk_size=4, # optionally set this if transformer is too much for your machine
              max_epochs=5, # terminate after 5 epochs
              )

## Sequence tagging

In [None]:
def process_for_sequence_tagging(text_col):
    
    # try spaCy
    
    punct = [',','.',' ','?','!']
    sentence = text_col["full_text"]
    
    sentence = np.array([i for i in sentence])
    offset = text_col["keyword_offset"]
    target = text_col["keyword"]
    
    if not target:
        return None
    
    labels = np.array([0]*len(sentence))
    end = offset + len(target)
    labels[offset:end] = 1
    
    for ch in punct:
        labels[np.where(sentence==ch)] = 2
    
    rows = []
    word,labs = [],[]
    
    for i in range(len(sentence)):
        if labels[i] < 2:
            word.append(sentence[i])
            labs.append(labels[i])
        
        if labels[i] == 2 and word:
            rows.append((''.join(word),{0:"notmachine",1:"machine"}[list(set(labs))[0]]))
            word,labs = [],[]
    return rows

In [None]:
def df2string(df):
    return "\n\n".join(['\n'.join(['\t'.join(e) for e in l]) for l in df.tagged.to_list() if l])

train,test,dev = df2string(df_train),df2string(df_test),df2string(df_dev)

with open('./data/train.csv','w') as out_doc:
    out_doc.write(train)
with open('./data/test.csv','w') as out_doc:
    out_doc.write(test)
with open('./data/dev.csv','w') as out_doc:
    out_doc.write(dev)

In [None]:
from flair.data import Corpus
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings,TransformerWordEmbeddings

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type='label')
print(tag_dictionary)


# 4. initialize embeddings
embedding_types = [

    WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

#embeddings = TransformerWordEmbeddings('bert-base-cased',fine_tune=True, allow_long_sentences=True)
    
# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type='label',
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train('resources/taggers/example-pos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)