In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from scipy.spatial import distance
from pymystem3 import Mystem
from argparse import ArgumentParser

from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec, FastText

from compounds_utils import acquiring, average_normalized, average_standard, apply_distance


def evaluate(compounds_path, dsm_path, model_type, label, f_average, dsm_words_path=None):
    """Correlate model-derived distances with annotated compound labels.

    The annotation CSV and the model(s) are loaded, handed to ``acquiring``
    (from ``compounds_utils``), and then, for each of four SciPy distance
    functions, the values returned by ``apply_distance`` are compared with the
    reference values using Spearman's rank correlation.

    Parameters
    ----------
    compounds_path : path of the CSV file read with ``pandas.read_csv``.
    dsm_path : path passed to ``model_type.load`` for the main model.
    model_type : object exposing ``load(path)``; the cells in this notebook
        pass ``Word2Vec`` or ``FastText``.
    label : forwarded unchanged to ``acquiring``; the cells in this notebook
        pass the name of an annotation column.
    f_average : forwarded unchanged to ``apply_distance``
        (``average_standard`` or ``average_normalized`` in this notebook).
    dsm_words_path : optional path of a second model, loaded with the same
        ``model_type``; ``None`` (default) means no second model.

    Returns
    -------
    pandas.DataFrame
        Indexed by distance-function name (``cosine``, ``chebyshev``,
        ``cityblock``, ``euclidean``) with a single column ``'spearman'``
        holding the first element of the ``spearmanr`` result.
    """
    compounds = pd.read_csv(compounds_path)
    dsm = model_type.load(dsm_path)
    dsm_words = model_type.load(dsm_words_path) if dsm_words_path is not None else None

    # NOTE(review): the main model is passed as `.wv` while the optional second
    # model is passed as the whole model object — confirm `acquiring` expects this.
    # Presumably the first three results describe the two parts and the compound,
    # and the fourth the annotated reference values — TODO confirm in compounds_utils.
    first, second, whole, gold = acquiring(compounds, dsm.wv, label, dsm_words)

    def spearman_for(metric):
        """Spearman correlation between `metric`-based values and the reference."""
        predicted = apply_distance(first, second, whole, metric, f_average)
        return spearmanr(predicted, gold)[0]

    metrics = (distance.cosine,
               distance.chebyshev,
               distance.cityblock,
               distance.euclidean)

    scores = {}
    for metric in metrics:
        scores[metric.__name__] = spearman_for(metric)

    return pd.DataFrame.from_dict(scores, orient='index', columns=['spearman'])

In [37]:
# Load a previously saved Word2Vec model into the kernel-global `model`.
# NOTE(review): none of the other visible cells read this global — `evaluate`
# loads its own models from the paths it is given. Confirm it is still needed.
model = Word2Vec.load('./workdir/models/model_word2vec_test1')

In [39]:
# Load the reduced annotation file into `dataset`.
# NOTE(review): annotation_small.csv is written by a cell that appears further
# down in this notebook (executed as In [19]); running the cells top to bottom on
# a fresh checkout requires that file to exist already.
dataset = pd.read_csv('./workdir/annotation_small.csv')

In [None]:
# List the 'Часть 1' values of the rows whose 'Катя (short list)' value is 0 or 1.
dataset.loc[dataset['Катя (short list)'].isin({0., 1.}), 'Часть 1'].tolist()

In [19]:
# Build the reduced annotation file: read the full annotation set and write the
# rows from position 248 onward to annotation_small.csv.
# NOTE(review): 248 is an unexplained cut-off — consider naming it as a constant.
# NOTE(review): `to_csv` is called without `index=False`, so the row index is
# written as an extra leading column; confirm downstream readers expect that.
# NOTE(review): this rebinds `dataset`, which an earlier cell binds to the reduced file.
dataset = pd.read_csv('./workdir/annotation_katya_ref_v2.csv')
dataset.iloc[248:].to_csv('./workdir/annotation_small.csv')

In [4]:
# Run: Word2Vec model `model_word2vec_compounds_7`, full annotation set,
# averaging function `average_standard`.
evaluate(
    compounds_path='./workdir/annotation_katya_ref_v2.csv',
    dsm_path='./workdir/models/model_word2vec_compounds_7',
    model_type=Word2Vec,
    label='Катя (short list)',
    f_average=average_standard,
)

Number of examples:  443


Unnamed: 0,spearman
cosine,0.081411
chebyshev,-0.012571
cityblock,-0.021378
euclidean,-0.025882


In [7]:
# Run: Word2Vec model `model_word2vec_compounds6`, full annotation set,
# averaging function `average_normalized`.
evaluate(
    compounds_path='./workdir/annotation_katya_ref_v2.csv',
    dsm_path='./workdir/models/model_word2vec_compounds6',
    model_type=Word2Vec,
    label='Катя (short list)',
    f_average=average_normalized,
)

Number of examples:  432


Unnamed: 0,spearman
cosine,0.092848
chebyshev,0.078531
cityblock,0.079382
euclidean,0.079878


In [24]:
# Run: FastText model `model_fasttext_compounts_1`, reduced annotation set,
# averaging function `average_standard`.
# Alternative input left from an earlier run:
#   compounds_path='./workdir/annotation_katya_ref_v2.csv'
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_fasttext_compounts_1',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_standard,
)

Number of examples:  201
33


Unnamed: 0,spearman
cosine,0.327757
chebyshev,0.27151
cityblock,0.30623
euclidean,0.304379


In [20]:
# Run: FastText model `model_fast2vec_test`, reduced annotation set,
# averaging function `average_normalized`.
# Alternative input left from an earlier run:
#   compounds_path='./workdir/annotation_katya_ref_v2.csv'
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_fast2vec_test',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_normalized,
)

Number of examples:  201
33


Unnamed: 0,spearman
cosine,0.374513
chebyshev,0.140963
cityblock,0.142815
euclidean,0.141658


In [7]:
# Run: FastText model `model_fast2vec_test`, full annotation set,
# averaging function `average_normalized`.
evaluate(
    compounds_path='./workdir/annotation_katya_ref_v2.csv',
    dsm_path='./workdir/models/model_fast2vec_test',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_normalized,
)

Number of examples:  443


Unnamed: 0,spearman
cosine,0.255594
chebyshev,0.080067
cityblock,0.081344
euclidean,0.081478


In [12]:
# Run: FastText model `model_fasttext_300_mc5`, reduced annotation set,
# averaging function `average_standard`.
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_fasttext_300_mc5',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_standard)

Number of examples:  201


Unnamed: 0,spearman
cosine,0.394188
chebyshev,0.325905
cityblock,0.333543
euclidean,0.330534


In [4]:
# Run: Word2Vec model `model_word2vec_300_mc5`, reduced annotation set,
# averaging function `average_normalized`.
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_word2vec_300_mc5',
    model_type=Word2Vec,
    label='Катя (short list)',
    f_average=average_normalized)

Number of examples:  198


Unnamed: 0,spearman
cosine,0.380239
chebyshev,0.158433
cityblock,0.163234
euclidean,0.164674


In [7]:
# Run: model file `model_word2vec_300_mc2`, reduced annotation set,
# averaging function `average_standard`.
# NOTE(review): the path is named "word2vec" but the file is loaded through
# `FastText` — confirm which of the two (path or model_type) is intended.
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_word2vec_300_mc2',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_standard)

Number of examples:  201


Unnamed: 0,spearman
cosine,0.366412
chebyshev,0.103234
cityblock,0.142352
euclidean,0.147907


In [8]:
# Run: model file `model_fast2vec_test`, reduced annotation set,
# averaging function `average_standard`.
# NOTE(review): other cells in this notebook load this same path with
# `FastText`; here it is loaded with `Word2Vec` — confirm this is intentional.
# Alternative inputs left from earlier runs:
#   compounds_path='./workdir/annotation_katya_v3.csv'
#   compounds_path='./workdir/annotation_katya_ref_v2.csv'
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_fast2vec_test',
    model_type=Word2Vec,
    label='Катя (short list)',
    f_average=average_standard,
)

Number of examples:  201


Unnamed: 0,spearman
cosine,0.371735
chebyshev,0.090735
cityblock,0.13888
euclidean,0.140269


In [None]:
Number of examples:  201
33
spearman
cosine	0.374513
chebyshev	0.140963
cityblock	0.142815
euclidean	0.141658

In [3]:
# Run: FastText model `model_fasttext_300_mc2` together with a second model
# given as `dsm_words_path`, reduced annotation set, `average_normalized`.
# NOTE(review): the correlations recorded for this run are near zero or
# negative, unlike the single-model runs — confirm how the second model is
# consumed inside `acquiring`.
# Alternatives left from earlier runs:
#   compounds_path='./workdir/annotation_katya_v3.csv'
#   compounds_path='./workdir/annotation_katya_ref_v2.csv'
#   dsm_path='./workdir/models/model_fasttext_300_6'
#   dsm_comp_path='./workdir/models/model_fasttext_no_compounds'
evaluate(
    compounds_path='./workdir/annotation_small.csv',
    dsm_path='./workdir/models/model_fasttext_300_mc2',
    model_type=FastText,
    label='Катя (short list)',
    f_average=average_normalized,
    dsm_words_path='./workdir/models/model_fasttext_nocompounds_300_mc2',
)

Number of examples:  201


Unnamed: 0,spearman
cosine,-0.014582
chebyshev,-0.199524
cityblock,-0.115502
euclidean,-0.130779


In [87]:
# Count how many ('Часть 1', 'Часть 2') pairs the two annotation files share.
data1 = pd.read_csv('./workdir/annotation_katya_ref.csv')
data2 = pd.read_csv('./workdir/compounds_select_1000_v2_ans.csv')


def create_set(data):
    """Return the set of ('Часть 1', 'Часть 2') value pairs found in `data`."""
    first_parts = data['Часть 1'].tolist()
    second_parts = data['Часть 2'].tolist()
    return set(zip(first_parts, second_parts))


set1, set2 = create_set(data1), create_set(data2)

len(set1 & set2)

997

In [None]:
FastText:
0.24916833583916628
-0.009943028200770114
-0.031080194111016517
-0.028182887880328536


FastText
Number of examples:  449
spearman
cosine	0.253712
chebyshev	-0.040233
cityblock	-0.038060
euclidean	-0.031278


word2vec
	spearman
cosine	0.258014
chebyshev	0.073209
cityblock	0.105411
euclidean	0.106419