# Benchmarking word embeddings

- CUE: Normed Word Cue
- TARGET: Response to Normed Word
- #G: The number of participants serving in the group norming the word.
- #P: The number of participants producing a particular response.
- FSG: Forward Cue-to-Target Strength  
    - Calculated by dividing #P by #G, which gives the proportion of subjects in the group who produce a particular target in the presence of the cue word.
- BSG: Backward Target-to-Cue Strength
- OSG: Overlapping strength. 
    - Two words comprising a particular pair may also have associates in common, which have sometimes been called overlapping, convergent, or shared associates. The cue word and the target word may produce some of the same words as associates. For example, both ABILITY and CAPABILITY produce the same 6 words as associates, including able, strength, talent, potential, capacity, and knowledge. The overlap strength for this pair is calculated as shown in Table 2 of the original norms documentation. From this example, it should be clear that OSG is calculated like MSG (mediated strength) in that the strengths of the individual connections are cross-multiplied and then summed.

### University of South Florida Free Association Norms
- see more at http://w3.usf.edu/FreeAssociation/

### SimLex-999
- see more at https://fh295.github.io/simlex.html
- paper: Faruqui, M., Tsvetkov, Y., Rastogi, P., & Dyer, C. (2016). Problems With Evaluation of Word Embeddings Using Word Similarity Tasks. 30–35. https://doi.org/10.18653/v1/w16-2506

In [None]:
%load_ext lab_black
import pandas as pd

### List all existing overlapped words in BERT and LSA-TASA

In [None]:
import numpy as np
from random import sample
# Load each model's word table together with its embedding matrix
bert = pd.read_csv('input/df_train_bert.csv')
bert_vec = np.load('input/y_train_bert.npz')['data']

tasa = pd.read_csv('input/df_train_tasa.csv')
tasa_vec = np.load('input/y_train_tasa.npz')['data']

# TASA words that also occur in BERT's word column; lower-casing is applied
# after the membership test, so the match itself is case-sensitive
shared_mask = tasa.word.isin(bert.word)
selwords = list(tasa.word[shared_mask].str.lower())

preview = sample(selwords, 10)
print('Some samples: {}'.format(preview))

### Select and clean Free Association Norm

In [None]:
# Load the free association norms and keep cue, target and overlap strength.
# (The TARGET and OSG headers are selected with a leading space.)
fan = pd.read_csv('benchmark/free_asso/fan_manual.csv', low_memory=False)
fan = fan.loc[:, ['CUE', ' TARGET', ' OSG']].dropna()
fan.columns = ['cue', 'target', 'osg']

# Normalise both word columns the same way before matching on selwords.
for word_col in ('cue', 'target'):
    fan[word_col] = fan[word_col].str.strip().str.lower()

# Flag the pairs whose cue AND target are both in the shared vocabulary.
fan['cue_chk'] = fan['cue'].isin(selwords)
fan['tar_chk'] = fan['target'].isin(selwords)
fan['double_chk'] = fan['cue_chk'] & fan['tar_chk']

# reset_index() without drop keeps the old row labels in an 'index' column.
usable_rows = fan['double_chk']
testcase_fan = fan.loc[usable_rows, ['cue', 'target', 'osg']].reset_index()

### Select and clean SimLex-999

In [None]:
# Load SimLex-999 (tab-separated) and keep the word pair plus its rating.
sl = pd.read_csv('benchmark/simlex_999/SimLex-999.txt', delimiter='\t')
sl = sl.loc[:, ['word1', 'word2', 'SimLex999']].dropna()
sl.columns = ['cue', 'target', 'sl']

# Normalise both word columns the same way before matching on selwords.
for word_col in ('cue', 'target'):
    sl[word_col] = sl[word_col].str.strip().str.lower()

# Flag the pairs whose cue AND target are both in the shared vocabulary.
sl['cue_chk'] = sl['cue'].isin(selwords)
sl['tar_chk'] = sl['target'].isin(selwords)
sl['double_chk'] = sl['cue_chk'] & sl['tar_chk']

# reset_index() without drop keeps the old row labels in an 'index' column.
usable_rows = sl['double_chk']
testcase_sl = sl.loc[usable_rows, ['cue', 'target', 'sl']].reset_index()

### Create test case by overlapping items with two embeddings

In [None]:
from tensorflow.keras.losses import cosine_similarity as cosdis

def w2vec(word, words_df, words_vec):
    """Look up the embedding row(s) stored for ``word``.

    Parameters
    ----------
    word : str
        Word to look up; compared for exact equality with the entries of
        ``words_df`` (no case folding is done here).
    words_df : pandas.Series or sequence of str
        Words, expected to be row-aligned with ``words_vec``.
    words_vec : numpy.ndarray
        Embedding matrix with one row per entry of ``words_df``.

    Returns
    -------
    numpy.ndarray or None
        The rows of ``words_vec`` at every position where ``words_df``
        equals ``word`` (one leading-axis entry per match), or ``None``
        after printing a notice when the word is absent.
    """
    if len(words_df) != len(words_vec):
        print('CAUTION: WORDS DF AND WORDS VECTOR DONT MATCH IN LENGTH!!')

    # Match by position rather than by index label: the vector matrix is a
    # plain array, so labels from a filtered or re-indexed Series would
    # select the wrong rows (or fall outside the array).
    positions = np.flatnonzero(np.asarray(words_df) == word)

    if positions.size == 0:
        print('not on list')
        return None

    return words_vec[positions]

### Run through the test cases

In [None]:
def cal_similarity(w1, w2, emb):
    """Return the cosine similarity of two words under one embedding.

    Parameters
    ----------
    w1, w2 : str
        Words to compare; each is looked up with ``w2vec``.
    emb : str
        ``'bert'`` or ``'tasa'``; selects the module-level word column and
        vector matrix (``bert``/``bert_vec`` or ``tasa``/``tasa_vec``).

    Returns
    -------
    numpy.ndarray
        The negated output of Keras ``cosine_similarity`` for the looked-up
        vectors, converted with ``.numpy()``.

    Raises
    ------
    ValueError
        If ``emb`` is neither ``'bert'`` nor ``'tasa'``.
    KeyError
        If ``w2vec`` finds no vector for ``w1`` or ``w2``.
    """
    if emb == 'bert':
        words, vecs = bert.word, bert_vec
    elif emb == 'tasa':
        words, vecs = tasa.word, tasa_vec
    else:
        # Fail here instead of falling through to the return statement with
        # `words` and `vecs` unbound.
        raise ValueError('Use option emb = "tasa" or "bert"')

    vec1 = w2vec(w1, words, vecs)
    vec2 = w2vec(w2, words, vecs)

    # w2vec signals a missing word with None; stop before handing that to
    # TensorFlow.
    for word, vec in ((w1, vec1), (w2, vec2)):
        if vec is None:
            raise KeyError(
                '{!r} has no vector in the {!r} embedding'.format(word, emb)
            )

    # Keras' cosine_similarity is a loss and returns the NEGATIVE cosine
    # similarity, so the sign is flipped back.
    return -cosdis(vec1, vec2).numpy()


# Spot check: one word pair under the BERT embedding (value shown as cell output)
cal_similarity(w1='lamp', w2='dog', emb='bert')

In [None]:
def cal_all_sims(testcase, savefile):
    """Add BERT and TASA similarity columns to a test case table and save it.

    Parameters
    ----------
    testcase : pandas.DataFrame
        Table with ``cue`` and ``target`` word columns. It is modified in
        place: ``tasa_similarity`` and ``bert_similarity`` columns are added.
    savefile : str
        Path passed to ``DataFrame.to_csv`` for the augmented table.

    Returns
    -------
    pandas.DataFrame
        The same ``testcase`` object, now carrying the two new columns.
    """
    from tqdm import tqdm

    # Lower-case each column once, up front, instead of re-lowering the whole
    # column for every row. Working on plain lists also makes the iteration
    # positional, so it does not depend on the frame having a 0..n-1 index.
    cues = testcase.cue.str.lower().tolist()
    targets = testcase.target.str.lower().tolist()

    bert_similarity = []
    tasa_similarity = []

    for cue, target in tqdm(zip(cues, targets), total=len(cues)):
        # cal_similarity returns an array; [0] takes its first entry.
        bert_similarity.append(cal_similarity(cue, target, 'bert')[0])
        tasa_similarity.append(cal_similarity(cue, target, 'tasa')[0])

    testcase['tasa_similarity'] = tasa_similarity
    testcase['bert_similarity'] = bert_similarity

    testcase.to_csv(savefile)

    return testcase


# Score both benchmarks; each call also writes its table to the given CSV
testcase_fan = cal_all_sims(testcase=testcase_fan, savefile="fan_sim.csv")
testcase_sl = cal_all_sims(testcase=testcase_sl, savefile="sl_sim.csv")

# Calculate correlations / cosine similarity

In [None]:
# Reload the similarity tables that cal_all_sims wrote to disk.
# NOTE(review): those files were saved with to_csv's default index=True, so
# read_csv presumably adds an 'Unnamed: 0' column here — confirm it is harmless.
testcase_fan = pd.read_csv('fan_sim.csv')
testcase_sl = pd.read_csv('sl_sim.csv')

In [None]:
def eval_sims(emb, bench, emb_name, bench_name):
    """Compare embedding similarities with benchmark scores.

    Parameters
    ----------
    emb : array-like of float
        Similarity scores produced by an embedding, one per word pair.
    bench : array-like of float
        Benchmark scores for the same word pairs, in the same order.
    emb_name : str
        Label written to the ``emb`` column of every result row.
    bench_name : str
        Label written to the ``bench`` column of every result row.

    Returns
    -------
    pandas.DataFrame
        Four rows with columns ``emb``, ``bench``, ``stat`` and ``value``;
        ``stat`` is, in order, ``pearson``, ``spearmanr``, ``cosine_sim``
        and ``euclidean``. ``value`` is a float column and the index is
        0..3.
    """
    from scipy.stats import pearsonr, spearmanr
    from scipy.spatial.distance import cosine, euclidean

    # NOTE(review): cosine_sim and euclidean are taken directly between the
    # two raw score vectors, so euclidean depends on each benchmark's scale.
    stats = [
        ('pearson', pearsonr(emb, bench)[0]),
        ('spearmanr', spearmanr(emb, bench)[0]),
        ('cosine_sim', 1 - cosine(emb, bench)),
        ('euclidean', euclidean(emb, bench)),
    ]

    # Build the frame in one go rather than growing an empty frame through
    # repeated pd.concat calls; this keeps `value` numeric and gives every
    # row its own index label.
    return pd.DataFrame(
        {
            'emb': emb_name,
            'bench': bench_name,
            'stat': [name for name, _ in stats],
            'value': [float(value) for _, value in stats],
        },
        columns=['emb', 'bench', 'stat', 'value'],
    )


# (benchmark label, test case table, benchmark score column)
benchmarks = (
    ('fan', testcase_fan, 'osg'),
    ('sl', testcase_sl, 'sl'),
)

# Evaluation order: fan/bert, fan/tasa, sl/bert, sl/tasa
per_pair = [
    eval_sims(
        frame[emb_name + '_similarity'], frame[score_col], emb_name, bench_name
    )
    for bench_name, frame, score_col in benchmarks
    for emb_name in ('bert', 'tasa')
]

# Stack the per-pair tables and renumber the rows from zero
results = pd.concat(per_pair, sort=False).reset_index(drop=True)
results

In [None]:
import altair as alt

# Radio buttons bound to the 'stat' field choose which statistic is plotted
stat_names = ['pearson', 'spearmanr', 'euclidean', 'cosine_sim']
stat_radio = alt.binding_radio(options=stat_names, name="Statistics: ")
sel_stat = alt.selection(
    type="single", on="click", fields=['stat'], bind=stat_radio
)

# One bar per embedding, one column of bars per benchmark
bars = alt.Chart(results).mark_bar().encode(
    x=alt.X('emb:N', title=''),
    y=alt.Y('value:Q'),
    color="emb:N",
    column="bench:N"
)

# Attach the selection and keep only the rows it selects
bars.add_selection(sel_stat).transform_filter(sel_stat)

### Make plotting data file

In [None]:
# NOTE(review): `testcase_sem` and its 'rosg' column are not created in any
# visible cell — confirm where this frame comes from before running.
# Reshape to long form: one row per word pair and embedding.
key_columns = ['cue', 'target', 'rosg']
similarity_columns = ['tasa_similarity', 'bert_similarity']

pdf = testcase_sem.melt(
    id_vars=key_columns,
    value_vars=similarity_columns,
    var_name='embedding',
    value_name='similarity',
)

pdf

## Plotting

In [None]:
# Display the column labels of testcase_sem.
# NOTE(review): testcase_sem is not defined in any visible cell — confirm its source.
testcase_sem.columns

In [None]:
import altair as alt
from random import sample
# Use the default data transformer and turn off Altair's row limit
alt.data_transformers.enable("default")
alt.data_transformers.disable_max_rows()

# Fixed-seed sample of 300 rows so the plot is reproducible
# NOTE(review): testcase_sem is not defined in any visible cell — confirm its source.
samp_df = testcase_sem.sample(n=300, axis=0, random_state=9999)

# One interval brush, attached to the base chart, drives every colour condition
brush = alt.selection(type='interval')
base = alt.Chart(samp_df).add_selection(brush)

# Axis without labels, domain line or tick marks, used by the rug plots
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

hover_fields = [
    'cue:N', 'target:N', 'rosg:Q', 'tasa_similarity:Q', 'bert_similarity:Q'
]


def _scatter(similarity_field, selected_colour):
    """Scatter of rosg (x) against one similarity column (y)."""
    return base.mark_point().encode(
        x=alt.X('rosg:Q', title='', scale=alt.Scale(domain=(0, 1))),
        y=alt.Y(similarity_field, title='', scale=alt.Scale(domain=(-1, 1))),
        color=alt.condition(
            brush, alt.value(selected_colour), alt.value('grey')
        ),
        tooltip=list(hover_fields),
    )


def _y_rug(similarity_field):
    """Tick marks along the y axis for one similarity column."""
    return base.mark_tick().encode(
        y=alt.Y(
            similarity_field, axis=tick_axis, scale=alt.Scale(domain=(-1, 1))
        ),
        color=alt.condition(
            brush, alt.value('black'), alt.value('lightgrey')
        ),
    )


# Configure the points
points_tasa = _scatter('tasa_similarity:Q', 'blue')
points_bert = _scatter('bert_similarity:Q', 'orange')

# Configure the ticks
x_ticks = base.mark_tick().encode(
    x=alt.X('rosg', axis=tick_axis, scale=alt.Scale(domain=(0, 1))),
    color=alt.condition(brush, alt.value('black'), alt.value('lightgrey'))
)
y_ticks_tasa = _y_rug('tasa_similarity:Q')
y_ticks_bert = _y_rug('bert_similarity:Q')

# Each panel: y rug on the left, scatter stacked above the x rug
left = y_ticks_tasa | (points_tasa & x_ticks)
right = y_ticks_bert | (points_bert & x_ticks)

left | right

In [None]:
# Rebuild the side-by-side chart and save it to 'semantic_testcase.html'.
chart = left | right
chart.save('semantic_testcase.html')