In [1]:
!git branch

  1-dataframe[m
  19-machine-tagger[m
  3-group-senses[m
  4-semantic-provenance[m
* [32m44-kNN-BERT-baseline[m
  dev[m
  master[m
  oed-experiments[m


In [7]:
import pandas as pd
import numpy as np
import flair
from flair.data import Sentence
from utils.classificaton_utils import *
from flair.embeddings import TransformerWordEmbeddings

In [8]:
# Contextual word embeddings built from the 'bert-base-uncased' checkpoint.
# layers='-1,-2,-3,-4' selects layers by negative index (presumably the last four
# transformer layers — confirm against the flair documentation).
# NOTE(review): `pooling_operation` looks version-specific; newer flair releases appear
# to name this option `subtoken_pooling` — confirm against the installed flair version.
embedding_type = TransformerWordEmbeddings('bert-base-uncased',
                                           layers='-1,-2,-3,-4',
                                           pooling_operation='mean')

In [9]:
# Load the pickled quotations dataframe and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so only
# load pickles that come from a trusted source.
quotations_df = pd.read_pickle('./data/quotations_all_machine_nn01.pickle')
quotations_df.head()

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense
0,pigmeat_nn01-13163366,"{'keyword': 'pig-meat', 'full_text': 'I was at...",1754,pigmeat,"{'title': 'Connoisseur', 'author': 'G. Colman'...",https://www.oed.com/view/Entry/237320#eid13163366,pigmeat_nn01,pigmeat_nn01-13163363,1754,True,"pigmeat, n., sense 1",True
1,pigmeat_nn01-13163379,"{'keyword': 'pig-meat', 'full_text': 'In short...",1784,pigmeat,"{'title': 'Year's Journey through Paix Bâs', '...",https://www.oed.com/view/Entry/237320#eid13163379,pigmeat_nn01,pigmeat_nn01-13163363,1784,False,"pigmeat, n., sense 1",False
2,pigmeat_nn01-13163399,"{'keyword': 'pig meat', 'full_text': 'It preve...",1817,pigmeat,"{'title': 'Parl. Deb.', 'author': None, 'gende...",https://www.oed.com/view/Entry/237320#eid13163399,pigmeat_nn01,pigmeat_nn01-13163363,1817,False,"pigmeat, n., sense 1",False
3,pigmeat_nn01-13163416,"{'keyword': 'pig meat', 'full_text': 'In most ...",1897,pigmeat,"{'title': 'Syst. Med.', 'author': 'T. C. Allbu...",https://www.oed.com/view/Entry/237320#eid13163416,pigmeat_nn01,pigmeat_nn01-13163363,1897,False,"pigmeat, n., sense 1",False
4,pigmeat_nn01-13163425,"{'keyword': 'pig meat', 'full_text': 'Beef tak...",1918,pigmeat,"{'title': 'Times', 'author': None, 'gender': N...",https://www.oed.com/view/Entry/237320#eid13163425,pigmeat_nn01,pigmeat_nn01-13163363,1918,False,"pigmeat, n., sense 1",False


## Run Code

In [10]:
# Restrict the quotations to the years 1760-1920 and show the resulting (rows, columns).
# The implementation listed under "Inspect code" treats both year bounds as inclusive;
# presumably the imported helper called here is that same function — verify.
selected_quotations = filter_quotations_by_year(quotations_df,1760,1920)
selected_quotations.shape

(69843, 9)

In [12]:
# selected_quotations.apply(get_target_token_vector,
#                                     embedding_type=embedding_type,
#                                     axis=1)

## Inspect code

In [5]:
def filter_quotations_by_year(
        df_quotations: pd.DataFrame,
        start: int,
        end: int
) -> pd.DataFrame:
    """Create a dataframe with quotations and their metadata for
    a specific year range.

    The dictionaries stored in the ``text`` and ``source`` columns are
    expanded into one column per key; ``year``, ``sense_id`` and ``word_id``
    are carried over; rows whose year lies outside ``[start, end]`` are
    dropped; exact duplicate rows are removed (first occurrence kept).

    Arguments:
        df_quotations: dataframe with quotations, created using harvest_quotations_by_sense_id
        start (int): start year (inclusive)
        end (int): end year (inclusive)

    Returns:
        pd.DataFrame with quotations; its index labels are the row positions
        of the kept rows in ``df_quotations``
    """
    # Expanding the dict columns yields frames with a fresh 0..n-1 index.
    text_fields = pd.DataFrame.from_records(df_quotations['text'].tolist())
    source_fields = pd.DataFrame.from_records(df_quotations['source'].tolist())
    df = pd.concat([text_fields, source_fields], axis=1)

    # Copy the metadata by position. Assigning the Series themselves would
    # align on index labels and silently produce NaN whenever df_quotations
    # does not carry a default 0..n-1 index (e.g. after earlier filtering).
    for column in ('year', 'sense_id', 'word_id'):
        df[column] = df_quotations[column].to_numpy()

    #df = df[df.sense_id.isin(senses)]
    in_range = (start <= df['year']) & (df['year'] <= end)

    # Non-inplace drop_duplicates returns a new frame, so nothing is mutated
    # through a slice of `df`.
    return df.loc[in_range].drop_duplicates()

In [None]:

def _blank_out_punctuation(text: str, punctuation: str) -> str:
    """Replace every character of `text` that occurs in `punctuation` with one space."""
    return ''.join(' ' if char in punctuation else char for char in text)


def get_target_token_vector(row: pd.Series,
                            embedding_type: "TransformerWordEmbeddings",
                            punctuation: str = '!"#—$%&\\()*+,./:;\'\\—-<=>?@[\\]^_`{|}~‘’'):
    """
    Get a vector representation for a target expression in context.
    If the target expression consists of multiple words we average the
    multiple vector representations.

    Arguments:
        row (pd.Series): a row from a quotations dataframe created by
                        the function filter_quotations_by_year; the fields
                        `full_text`, `keyword` and `keyword_offset` are read
        embedding_type: embedding object whose `embed` method is called on the
                        quotation (annotation is quoted so it is not evaluated
                        when the function is defined)
        punctuation (str): characters that are replaced by a white space in
                        both the quotation and the target expression
    Returns:
        a np.array that captures the last layer(s) of the transformer
        (element-wise mean over the matched target tokens), or None when the
        quotation, the target expression or its offset is missing, or when no
        token starts inside the target span
    """
    quotation = row.full_text
    target = row.keyword  # the target expression as recorded by the OED

    # if there is no quotation or no target word return None
    if not isinstance(quotation, str) or not isinstance(target, str):
        return None

    # replace all punctuation with white spaces; each character is swapped for
    # exactly one space, so this step does not change the text length
    text = _blank_out_punctuation(quotation.lower(), punctuation)
    if not text.strip():
        return None

    # remove punctuation from target expression
    target = _blank_out_punctuation(target.lower(), punctuation)

    # get offsets of the target expression in the quotations
    start_position = row.keyword_offset
    if start_position is None or pd.isna(start_position):
        return None
    end_position = start_position + len(target)

    # embed the text (done only after the cheap checks above have passed)
    sentence = Sentence(text, use_tokenizer=False)
    embedding_type.embed(sentence)

    # we collect the target tokens found in the quotation and match those
    # with the target expression as a check (see below)
    vectors = []
    quotation_target_tokens = []
    for token in sentence:
        # here we rely on the FLAIR offset annotation in combination with tokenisation
        # double check if this works properly
        if start_position <= token.start_pos < end_position:
            # detach/cpu so the conversion also works for tensors that track
            # gradients or live on a GPU
            vectors.append(token.embedding.detach().cpu().numpy())
            quotation_target_tokens.append(token.text)

    if not vectors:
        return None

    matched = ' '.join(quotation_target_tokens)
    expected = ' '.join(target.split())
    if matched != expected:
        print('Warning: could not properly match', expected, ' with ', matched)

    return np.mean(vectors, axis=0)

## Fin.

In [None]:



# X = [v.reshape(-1) for v in quot_sel.vector if v is not None]
# senses = [color_codes[s] for s,v in zip(quot_sel.sense_id,quot_sel.vector) if v is not None]
# #

# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
# fig = plt.figure(figsize = (10,10))

# tsne = TSNE(n_components=2, random_state=0,metric='cosine')
# X_2d = tsne.fit_transform(X)

# for i,x in enumerate(X):

#     plt.scatter(X_2d[i, 0], X_2d[i, 1],c=senses[i]) 
    
# plt.show()