In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
import json
from tasks import wsd
from pathlib import Path
from utils import nlp_tools
from parhugin import multiFunc
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from utils.dataset_download import harvest_data_from_extended_senses
from utils.classificaton_utils import binarize,generate_definition_df, vectorize_target_expressions
from tqdm.notebook import tqdm
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
tqdm.pandas()

In [3]:
# import API credentials
with open('oed_experiments/oed_credentials.json') as f:
    auth = json.load(f)

 
lemma_pos = [['anger',"NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],['woman','NN']]
#["technology","NN"] # got an error with technology

embedding_methods = {'bert_base': {"path":'bert-base-uncased',
                                   'layers':'-1,-2,-3,-4',
                                   'pooling_operation':'mean'},
                    'blert_base': {"path":'/deezy_datadrive/kaspar-playground/bert_model/FT_bert_base_uncased_all_books_v002',
                                   'layers':'-1,-2,-3,-4',
                                   'pooling_operation':'mean'},
                    'bert_1850':{"path":"/datadrive/khosseini/LM_with_bert_MOVED_to_another_VM_REMOVE_FROM_NOVEMBER/models/bert/FT_bert_base_uncased_before_1850_v001", 
                                   'layers':'-1,-2,-3,-4',
                                   'pooling_operation':'mean'}
                                   }

In [4]:

#for lemma, pos in lemma_pos:
#    # [WARNING] if you run code for the first time uncomment line below, comment again after running this cell
#    #quotations = harvest_data_from_extended_senses(auth,f"{lemma}_{pos}")
#    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
#    vectorize_target_expressions(quotations_path,embedding_methods)

In [5]:
vector_cols = ['vector_bert_base_-1,-2,-3,-4_mean',
                "vector_blert_base_-1,-2,-3,-4_mean",
                'vector_bert_1850_-1,-2,-3,-4_mean']

In [17]:
def eval_sense(lemma,
                pos,
                senses,
                start,
                end,
                train_on_dev=train_on_dev,
                eval_mode=eval_mode,
                relations=relations,
                vector_cols=vector_cols):

    df_train, df_val, df_test = binarize(lemma,
                pos,
                senses, 
                start=start,
                end=end,
                relations=relations,
                eval_mode=eval_mode,
                strict_filter=True,)

    # no quotations for sense and timeframe
    if df_train is None:
        return None
    
    if train_on_dev:
        df_train = pd.concat([df_train, df_val], axis=0)
        df_train.reset_index(inplace=True,drop=True)

    df_train["nlp_full_text"] = df_train.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_val["nlp_full_text"] = df_val.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_test["nlp_full_text"] = df_test.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)

    # random
    print(f'[LOG] computing baselines for {senses}')
    df_test["random"] = df_test.apply (lambda row: wsd.random_predict(), axis=1)

    # retrieve and process definitions            
    df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode)
    df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1)

    # token overlap
    df_test["def_tok_overlap_ranking"] = df_test.apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1)

    # spacy sentence embeddings
    df_test["sent_embedding"] = df_test.apply (lambda row: wsd.sent_embedding(row["nlp_full_text"], df_selected_senses), axis=1)

    #w2v lesk
    df_test["w2v_lesk_ranking"] = df_test.apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1)

    #Bert lesk      
    #df_test["bert_lesk_ranking"] = df_test.apply (lambda row: wsd.bert_lesk_ranking(row["text"]["full_text"], df_selected_senses, bert_sentsim_model), axis=1)


    # supervised baselined (w-emb SVM) - careful this is a 19thC BL model
    df_test["svm_wemb_baseline"] = wsd.svm_wemb_baseline(df_train,df_test,wemb_model)

    for vector_col in vector_cols:
        print(f'[LOG] computing centoids for {senses} [BERT model = {vector_col}]' )
        df_test[f"bert_binary_centroid_{vector_col}"] = df_test.apply(wsd.bert_binary_centroid_vector, 
                                        df_train = df_train, 
                                        vector_col=vector_col,
                                        return_ranking=False, axis=1)

        senseid2label = dict(df_test[['sense_id','label']].values)
        df_test[f"bert_centroid_sense_{vector_col}"] = df_test.apply(wsd.bert_sense_centroid_vector,  
                                                    senseid2label= senseid2label,
                                                    vector_col=vector_col,
                                                    df_train = df_train, axis=1)

        centroid_vectors = df_train.groupby('label')[vector_col].apply(np.mean,axis=0)
        sem_axis = centroid_vectors[1] - centroid_vectors[0] 
        df_test[f"bert_contrast_{vector_col}"] = df_test[vector_col].apply(wsd.bert_semaxis_vector,
                                                    sem_axis=sem_axis,
                                                    threshold=.0)

        df_test[f"bert_ts_binary_centroid_{vector_col}"] = df_test.apply(wsd.bert_ts_binary_centroid_vector, 
                                                        df_train=df_train, 
                                                        ts_method='nearest',
                                                        vector_col=vector_col,
                                                        axis=1)

        senseid2label = dict(df_test[['sense_id','label']].values)
        df_test[f"bert_ts_centroid_sense_{vector_col}"] = df_test.apply(wsd.bert_ts_sense_centroid_vector,  
                        senseid2label= senseid2label,
                        ts_method='nearest',
                        vector_col=vector_col,
                        df_train = df_train, axis=1)

        print(f'[LOG] traing classifier for {senses} [BERT model = {vector_col}]' )
        X,y = list(df_train[vector_col].values), list(df_train.label.values)
        #print('bert_clf_svm')
        #svm_model = LinearSVC(random_state=0, C=.1, tol=1e-5,class_weight='balanced')
        #svm_model.fit(X,y)
        df_test[f"bert_svm_{vector_col}"] = "0"#wsd.clf_svm(vector_col,df_test, svm_model)

        #print('bert_clf_perc')
        perc_model = Perceptron(validation_fraction=.2, early_stopping=True,class_weight='balanced')
        perc_model.fit(X,y)
        df_test[f"bert_perceptron_{vector_col}"] = wsd.clf_perceptron(vector_col,df_test, perc_model)

        #print('bert_clf_mlperc')
        mlperc_model = MLPClassifier(validation_fraction=.2, early_stopping=True, solver='lbfgs',activation='relu')
        mlperc_model.fit(X,y)
        df_test[f"bert_ml_perceptron_{vector_col}"]  = wsd.clf_perceptron(vector_col,df_test, mlperc_model)


    return df_test

In [27]:
def run(lemma, 
        pos, 
        senses, 
        start, 
        end,
        train_on_dev=True,
        vector_cols=vector_cols,
        eval_mode='lemma_etal',
        relations=['seed','synonym']):

    df_test = eval_sense(lemma,
                pos,
                senses,
                start,
                end,
                train_on_dev=train_on_dev,
                eval_mode=eval_mode,
                relations=relations,
                vector_cols=vector_cols)

    results_path = os.path.join('results', f"{lemma}_{pos}", eval_mode)
    results_filename = '_'.join(senses) + "~" + "+".join(sorted(relations)) + ".csv"
    Path(results_path).mkdir(parents=True, exist_ok=True)

    # IF df_test is None, create an empty DataFrame
    if not isinstance(df_test, type(None)):
        
        baselines = ['id_x','label','random','def_tok_overlap_ranking', 'sent_embedding', 'w2v_lesk_ranking',                        'svm_wemb_baseline']
        bert_methods = [[f"bert_binary_centroid_{vector_col}",f"bert_centroid_sense_{vector_col}",f"bert_contrast_{vector_col}",
                        f"bert_ts_binary_centroid_{vector_col}",f"bert_ts_centroid_sense_{vector_col}"] 
                                    for vector_col in  vector_cols]
        bert_methods = [i for tm in bert_methods for i in tm]

        out_df = df_test.filter(baselines + bert_methods, axis=1)
    else:
        out_df = pd.DataFrame()

    out_df.to_csv(os.path.join(results_path, results_filename), index=False)  

In [28]:

wemb_model = Word2Vec.load("models/w2v_004/w2v_words.model")

In [29]:
words = [['anger',"NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],['woman','NN']]

words = [['machine','NN']]
start,end = 1760, 1920

In [26]:
for lemma, pos in words:
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
    
    # this is the index of the lemma id <-- we could remove this later
    idx = "01"
    senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id)
    
    for sense in tqdm(list(senses)):
        run(lemma, pos, {sense},  1760, 1920)

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))

[LOG] # senses before filtering by date = 517
[LOG ]# senses after filtering by date = 352


[LOG] # of seed senses 22 
[LOG] # of synonyms 310 
[LOG] # of branch senses 0


[LOG] # of seeds selected 1 
[LOG] # of synonyms selected 3 
[LOG] # of branches selected 0
[LOG] 274 quotations selected
[LOG] train = 207 val = 24 test = 43 quotations
[LOG] computing baselines for {'machine_nn01-38474301'}
[LOG] Using lemma_etal as evaluation mode.
[LOG] computing centoids for {'machine_nn01-38474301'} [BERT model = vector_bert_base_-1,-2,-3,-4_mean]
[LOG] traing classifier for {'machine_nn01-38474301'} [BERT model = vector_bert_base_-1,-2,-3,-4_mean]
[LOG] computing centoids for {'machine_nn01-38474301'} [BERT model = vector_blert_base_-1,-2,-3,-4_mean]
[LOG] traing classifier for {'machine_nn01-38474301'} [BERT model = vector_blert_base_-1,-2,-3,-4_mean]
[LOG] computing centoids for {'machine_nn01-38474301'} [BERT model = vector_bert_1850_-1,-2,-3,-4_mean]
[LOG] traing classifier for {'machine

NameError: name 'relations' is not defined