In [2]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline
import pickle
import pandas as pd
from tasks import wsd
from pathlib import Path
from tasks import wsd
from utils import nlp_tools
from tqdm.auto import tqdm
import numpy as np
import json
from sklearn.metrics import classification_report
from flair.embeddings import TransformerWordEmbeddings
from utils.dataset_download import harvest_data_from_extended_senses
from utils.classificaton_utils import binarize, vectorize_target_expressions,cosine_similiarity,eval_lemma

In [4]:
# Register tqdm with pandas; the `progress_apply` calls further down rely on it.
tqdm.pandas()

In [61]:
# --- Experiment configuration -------------------------------------------
# Target lemma and its part-of-speech tag.
lemma, pos = 'machine', 'NN'

# Sense id(s) treated as the target. Other ids the author listed as
# alternatives: machine_nn01-38475923, machine_nn01-38475835,
# machine_nn01-38474140.
senses = {'machine_nn01-38475772'}

# Relation types passed to `binarize`; 'descendant' and 'sibling' were
# listed by the author as further options but are not enabled.
relations = ['seed', 'synonym']

# Evaluation mode: "lemma" or "lemma_etal".
eval_mode = "lemma_etal"
    

In [62]:
# Build the train / validation / test frames for the configured lemma.
# NOTE(review): `start`/`end` presumably bound the quotation years (the log
# printed by this cell reports filtering by date) -- confirm in
# utils.classificaton_utils.binarize.
df_train, df_val, df_test = binarize(
    lemma,
    pos,
    senses,
    relations,
    strict_filter=True,
    start=1760,
    end=1950,
    eval_mode=eval_mode,
)

# senses before filtering by date = 517
# senses after filtering by date = 386


# of seed senses 25 
# of synonyms 338 
# of branch senses 0


# of seeds selected 1 
# of synonyms selected 4 
# of branches selected 0
[LOG] #rows before removing None vector (146, 25)
[LOG] #rows after removing None vector (144, 25)


In [8]:
def weighted(df,year,vector_col,level='label'):
    """Sum time-weighted vectors per group.

    Every row receives the weight ``1 / (|year - row year| + 1)``. The weights
    are normalised over the whole frame so they add up to one, multiplied into
    the row's vector, and the weighted vectors are summed within each group of
    ``level``.

    The input frame is left untouched: no helper columns are written to it, so
    the function can safely be called on a filtered slice of another frame.

    Arguments:
        df: frame providing a ``year`` column plus the ``vector_col`` and
            ``level`` columns.
        year: reference year the temporal distance is measured from.
        vector_col: name of the column holding the vectors (assumed to be
            numpy arrays of equal length -- each one is scaled by a float and
            then added to the others).
        level: column to group by, e.g. ``'label'`` or ``'sense_id'``.

    Returns:
        A Series named ``tw_vector``, indexed by the values of ``level``, with
        one summed vector per group.
    """
    # 1 over the distance in years; the +1 keeps same-year rows finite
    weights = 1 / ((year - df['year']).abs() + 1)
    # normalize, so weights add up to one
    weights = weights / weights.sum()
    # time weighted vector is the product of the vector and its weight
    tw_vectors = (df[vector_col] * weights).rename('tw_vector')
    # sum the weighted vectors inside every group (weights already sum to one
    # overall, so this is a weighted sum rather than a per-group mean)
    return tw_vectors.groupby(df[level]).apply(np.sum, axis=0)



In [15]:
# Time-weighted vectors per binary label, relative to the year 1850.
weighted(df_train, year=1850, vector_col='vector_bert_base_-1,-2,-3,-4_mean', level='label')

label
0    [0.19873405, 0.33055484, -0.18412821, 0.391625...
1    [0.00081474637, 0.0023800377, -0.002455221, 0....
Name: tw_vector, dtype: object

In [16]:
# Time-weighted vectors per sense id, relative to the year 1850.
weighted(df_train, year=1850, vector_col='vector_bert_base_-1,-2,-3,-4_mean', level='sense_id')

sense_id
machine_nn01-38473945          [0.0012086951, 0.0006755147, -0.0025742562, 0....
machine_nn01-38474140          [0.00028779765, 0.0024441564, -0.0020951321, 0...
machine_nn01-38474233          [0.004874398, 0.00847157, -0.009767573, 0.0254...
machine_nn01-38474301          [-0.017662784, -0.010246061, 0.0058125993, 0.0...
machine_nn01-38474405          [0.0025907808, 0.003990819, 4.2261916e-05, 0.0...
machine_nn01-38474548          [-0.001017001, 0.0008860223, -0.00012323465, 0...
machine_nn01-38474607          [0.07086671, 0.14310484, -0.04483474, 0.098333...
machine_nn01-38474820          [0.004210249, 0.0030376466, -0.021875927, 0.00...
machine_nn01-38474877          [0.0071569914, -0.0017221039, -0.0069627096, 0...
machine_nn01-38474974          [0.008037301, 0.009152525, -0.022800643, 0.006...
machine_nn01-38475013          [0.002835615, 0.0025166515, -0.0027480246, 0.0...
machine_nn01-38475046          [0.0019872468, 0.0064697377, -0.0010599229, 0....
machine_nn01-384750

In [36]:
def nearest(df,year,vector_col,level='label'):
    """Pick, for every group, the vector of the row closest in time to ``year``.

    The row with the smallest ``|row year - year|`` is selected within each
    group of ``level``; on ties the first such row wins.

    The lookup is done by row position, so the result has exactly one entry
    per group even when ``df`` carries repeated index labels, and the input
    frame is not modified.

    Arguments:
        df: frame providing a ``year`` column plus the ``vector_col`` and
            ``level`` columns.
        year: reference year the temporal distance is measured from.
        vector_col: name of the column holding the vectors.
        level: column to group by, e.g. ``'label'`` or ``'sense_id'``.

    Returns:
        A Series named ``vector_col``, indexed by the values of ``level``,
        holding the selected vector of each group.
    """
    # absolute distance in years, re-indexed 0..n-1 so idxmin yields positions
    temp_dist = (df['year'] - year).abs().reset_index(drop=True)
    group_keys = df[level].reset_index(drop=True)
    # position of the temporally nearest row inside every group
    nearest_pos = temp_dist.groupby(group_keys).idxmin().to_numpy(dtype='int64')
    # fetch those rows and key their vectors by the grouping column
    return df.iloc[nearest_pos].set_index(level)[vector_col]



In [51]:
# wsd functions

def bert_ts_binary_centroid_vector(row:pd.Series,
                            df_train:pd.DataFrame,
                            ts_method:str='weighted',
                            return_ranking:bool=False,
                            vector_col:str='vector_bert_base_-1,-2,-3,-4_mean') -> "str | dict":
    """Predict the binary label of one quotation from time-sensitive centroids.

    One centroid vector per label is built from ``df_train`` relative to the
    year of ``row`` (via ``weighted`` or ``nearest``), each centroid is
    compared with the row's vector using ``cosine_similiarity``, and the label
    of the best-scoring centroid is returned.

    Arguments:
        row: quotation to classify; must provide ``vector_col`` and ``year``.
        df_train: training quotations handed to ``weighted`` / ``nearest``.
        ts_method: ``'weighted'`` or ``'nearest'``.
        return_ranking: when True, return the similarities instead of a label.
        vector_col: name of the column holding the vectors.

    Returns:
        The winning label as a string, or a ``{label: similarity}`` dict when
        ``return_ranking`` is True.

    Raises:
        AssertionError: if ``ts_method`` is not one of the supported options.
    """
    vector, year = row[vector_col],row.year

    ts_methods = ['weighted','nearest']
    assert ts_method in ts_methods, f'ts_method should be one of the following options {ts_methods}'

    if ts_method=='weighted':
        centroid_vectors = weighted(df_train,year,vector_col)
    else:
        centroid_vectors = nearest(df_train,year,vector_col)

    # similarity of the row's vector to every label centroid
    sims = centroid_vectors.apply(cosine_similiarity, target = vector)

    if return_ranking:
        return sims.to_dict()
    # idxmax yields the *label* of the best centroid. np.argmax on a Series
    # yields its position (in current pandas), which only coincides with the
    # label while the index happens to be exactly ['0', '1'] in that order.
    return str(sims.idxmax())

In [63]:
# Predict a binary label for every test quotation using the centroid that is
# nearest in time; tqdm shows progress row by row.
df_test.progress_apply(
    bert_ts_binary_centroid_vector,
    axis=1,
    df_train=df_train,
    ts_method='nearest',
    return_ranking=False,
)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




52     0
15     0
75     0
112    0
53     0
140    1
61     0
87     1
2      0
116    0
38     1
93     0
88     0
114    0
122    1
72     0
83     0
121    1
30     0
104    1
103    0
100    0
120    1
143    0
21     0
22     0
3      0
102    0
24     0
dtype: object

In [49]:
def bert_ts_sense_centroid_vector(row:pd.Series,
                                df_train:pd.DataFrame,
                                senseid2label:dict,
                                ts_method:str='nearest',
                                return_ranking:bool=False,
                                vector_col:str='vector_bert_base_-1,-2,-3,-4_mean') -> "str | dict":
    """Predict the binary label of one quotation via its best-matching sense.

    Training quotations sharing the row's lemma are reduced to one centroid
    vector per ``sense_id`` relative to the row's year (via ``weighted`` or
    ``nearest``). The centroids are compared with the row's vector using
    ``cosine_similiarity`` and the best-scoring sense id is translated to a
    label through ``senseid2label``.

    Arguments:
        row: quotation to classify; must provide ``lemma``, ``year`` and
            ``vector_col``.
        df_train: training quotations; must provide a ``lemma`` column.
        senseid2label: mapping from sense id to label. It should cover every
            sense id occurring in ``df_train``; ids missing from it fall back
            to ``'0'``.
        ts_method: ``'nearest'`` or ``'weighted'``.
        return_ranking: when True, return the similarities instead of a label.
        vector_col: name of the column holding the vectors.

    Returns:
        The label as a string (``'0'`` when there is nothing to compare
        against or the winning sense id is unmapped), or a
        ``{sense_id: similarity}`` dict when ``return_ranking`` is True.

    Raises:
        AssertionError: if ``ts_method`` is not one of the supported options.
    """
    # Work on an independent copy: helpers that add scratch columns must not
    # touch (or emit SettingWithCopyWarning about) a slice of df_train.
    df_train_lemma = df_train[df_train.lemma==row.lemma].copy()

    ts_methods = ['nearest','weighted']
    assert ts_method in ts_methods, f'ts_method should be one of the following options {ts_methods}'

    # No training quotation for this lemma: nothing to rank, so use the same
    # '0' fallback as for unmapped sense ids instead of failing on an empty
    # ranking.
    if df_train_lemma.empty:
        return {} if return_ranking else '0'

    if ts_method=='weighted':
        centroid_vectors = weighted(df_train_lemma,row.year,vector_col,level='sense_id')
    else:
        centroid_vectors = nearest(df_train_lemma,row.year,vector_col,level='sense_id')

    # similarity of the row's vector to every sense centroid
    sims = centroid_vectors.apply(
                    cosine_similiarity, target = row[vector_col]
                        ).to_dict()
    if return_ranking:
        return sims
    if not sims:
        return '0'

    # best-scoring sense id (first one wins on ties)
    best_sense_id = max(sims, key=sims.get)
    # return label as '0' or '1'; sense ids absent from the mapping default to '0'
    return senseid2label.get(best_sense_id, '0')

In [64]:
# Map sense ids to their binary label. The sense id that wins the ranking
# always comes from df_train (the centroids are built from the training
# quotations), so the lookup is built from the training frame. Building it
# from df_test left training senses that do not occur in the test split
# unmapped, and those were silently labelled '0'.
senseid2label = dict(df_train[['sense_id','label']].values)

df_test.progress_apply(bert_ts_sense_centroid_vector, 
                                df_train=df_train, 
                                senseid2label=senseid2label,
                                return_ranking=False, 
                                ts_method='weighted',
                                axis=1)

HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




52     0
15     0
75     0
112    0
53     0
140    0
61     0
87     0
2      0
116    0
38     0
93     0
88     0
114    0
122    1
72     0
83     0
121    1
30     0
104    0
103    0
100    0
120    1
143    1
21     0
22     0
3      0
102    0
24     0
dtype: object

In [60]:
# Inspect the label column of the training split.
df_train['label']

0     0
1     0
2     0
3     0
4     0
     ..
71    1
72    0
73    0
74    0
75    0
Name: label, Length: 76, dtype: object