# University of South Florida Free Association Norms
- see more at http://w3.usf.edu/FreeAssociation/

In [None]:
# !mkdir free_asso

# import os

# url_prefix = 'http://w3.usf.edu/FreeAssociation/AppendixA/Cue_Target_Pairs.'
# files = ['A-B', 'C', 'D-F', 'G-K', 'L-O', 'P-R', 'S', 'T-Z']

# for abc in files:
#     command = 'wget -O free_asso/Cue_Target_Pairs.{}.txt {}{}'.format(abc, url_prefix, abc)
#     os.system(command)
#         f.write(line)
    
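# NOTE: the files fetched this way hit an encoding error, so the data was
# prepared manually instead (presumably as free_asso/fan_manual.csv, which is
# what the next cell reads).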

In [None]:
# Load the lab_black notebook extension (presumably formats cells with Black).
%load_ext lab_black
import pandas as pd
# Read the free-association norms table from the local CSV into `df`.
df = pd.read_csv('free_asso/fan_manual.csv')

In [2]:
# Preview the loaded norms table (pandas truncates the display).
df

Unnamed: 0,CUE,TARGET,NORMED?,#G,#P,FSG,BSG,MSG,OSG,#M,...,QUC,TSS,TFR,TCON,TH,TPS,TMC,TPR,TRSG,TUC
0,A,B,NO,152,69,0.454,,,,,...,0.0,,,,,,,,,0.0
1,A,ALPHABET,YES,152,10,0.066,0.046,0.0020,0.0000,2.0,...,0.0,11.0,2.0,,,N,0.50,0.25,0.062,1.0
2,A,THE,NO,152,10,0.066,,,,,...,0.0,,,,,,,,,0.0
3,A,GRADE,YES,152,9,0.059,0.277,0.0000,0.0013,0.0,...,0.0,14.0,35.0,3.70,N,N,1.00,0.54,0.025,1.0
4,A,LETTER,YES,152,6,0.039,0.000,0.0030,0.0022,2.0,...,0.0,15.0,145.0,5.16,N,N,1.27,0.60,0.142,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70365,WED,MARRY,YES,148,58,0.392,0.083,0.0012,0.0145,1.0,...,1.0,21.0,18.0,3.41,,V,1.86,0.33,0.054,1.0
70366,WED,MARRIAGE,YES,148,24,0.162,0.000,0.0263,0.0164,4.0,...,1.0,19.0,95.0,3.94,,N,2.00,0.50,0.087,1.0
70367,WED,DIVORCE,YES,148,9,0.061,0.000,0.1071,0.0536,3.0,...,1.0,11.0,29.0,4.23,,N,0.56,0.44,0.083,1.0
70368,WED,THURSDAY,NO,148,5,0.034,,,,,...,1.0,,,,,,,,,0.0


In [87]:
# List the raw column names. As the output shows, every name except 'CUE'
# carries a leading space (e.g. ' TARGET', ' OSG'), so later selections must
# include that space.
df.columns

Index(['CUE', ' TARGET', ' NORMED?', ' #G', ' #P', ' FSG', ' BSG', ' MSG',
       ' OSG', ' #M', ' MMIAS', ' #O', ' OMIAS', ' QSS', ' QFR', ' QCON',
       ' QH', ' QPS', ' QMC', ' QPR', ' QRSG', ' QUC', ' TSS', ' TFR', ' TCON',
       ' TH', ' TPS', ' TMC', ' TPR', ' TRSG', ' TUC'],
      dtype='object')

- CUE: Normed Word Cue
- TARGET: Response to Normed Word
- #G: The number of participants serving in the group norming the word.
- #P: The number of participants producing a particular response.
- FSG: Forward Cue-to-Target Strength  
    - dividing #P by #G which gives the proportion of subjects in the group who produce a particular target in the presence of the cue word.
- BSG: Backward Target-to-Cue Strength
- OSG: Overlapping strength. 
    - Two words comprising a particular pair may also have associates in common, what have sometimes been called overlapping, convergent or shared associates. The cue word and the target word may produce some of the same words as associates. For example, both ABILITY and CAPABILITY produce the same 6 words as associates, including able, strength, talent, potential, capacity, and knowledge. The overlap strength for this pair is calculated as shown in Table 2. From this example, it should be clear that OSG is calculated like MSG in that the strengths of the individual connections are cross multiplied and then summed.

### Select and clean Free Association Norm

In [12]:
# Build the cleaned cue/target/overlap-strength table in one chain so that
# `sel_df` is a fresh frame: no in-place `dropna` on a slice of `df`, and the
# cell gives the same result every time it is re-run.
sel_df = (
    # The raw headers other than 'CUE' carry a leading space.
    df.loc[:, ['CUE', ' TARGET', ' OSG']]
    # Drop pairs with any missing value among the three selected columns.
    .dropna()
    .rename(columns={'CUE': 'cue', ' TARGET': 'target', ' OSG': 'osg'})
    # Remove surrounding whitespace from the word columns.
    .assign(
        cue=lambda frame: frame['cue'].str.strip(),
        target=lambda frame: frame['target'].str.strip(),
    )
)

In [119]:
import numpy as np
from random import sample
# Word lists for the two embeddings (one row per word).
bert = pd.read_csv('input/df_train_bert.csv')
tasa = pd.read_csv('input/df_train_tasa.csv')

# Embedding matrices stored under the 'data' key of each .npz archive.
tasa_vec = np.load('input/y_train_tasa.npz')['data']
bert_vec = np.load('input/y_train_bert.npz')['data']

# Words that occur in both the TASA and the BERT word lists, upper-cased
# (the norms table shown above uses upper-case words).
selwords = tasa.loc[tasa.word.isin(bert.word), 'word'].str.upper().tolist()
print('Some samples: {}'.format(sample(selwords, 10)))

Some samples: ['STORMS', 'HURLED', 'GULP', 'DEEMED', 'CRACK', 'ADDS', 'FATES', 'CUT', 'GUARD', 'DROPS']


### Create the test case from word pairs present in both embeddings

In [121]:
# Flag, per row, whether the cue and the target are in the shared vocabulary.
sel_df['cue_chk'] = sel_df['cue'].isin(selwords)
sel_df['tar_chk'] = sel_df['target'].isin(selwords)
# A pair is usable only when both of its words are covered.
sel_df['double_chk'] = sel_df['cue_chk'] & sel_df['tar_chk']

# Keep the usable pairs, drop the helper flags, and renumber rows from 0.
testcase_sem = (
    sel_df[sel_df['double_chk']][['cue', 'target', 'osg']]
    .reset_index(drop=True)
)

In [97]:
from tensorflow.keras.losses import cosine_similarity as cosdis


def w2vec(word, words_df, words_vec):
    """Look up the embedding row(s) for ``word``.

    Args:
        word: the word to look up.
        words_df: pandas Series of words; its row order is taken to match the
            row order of ``words_vec``.
        words_vec: array of vectors, indexed by row position.

    Returns:
        ``words_vec`` indexed by the position(s) where ``words_df == word``
        (one row per match), or ``None`` after printing ``'not on list'``
        when the word is absent.
    """
    if len(words_df) != len(words_vec):
        print('CAUTION: WORDS DF AND WORDS VECTOR DONT MATCH IN LENGTH!!')

    # One pass over the words. Use row *positions*, not index labels, so the
    # lookup is still correct when `words_df` has a non-default index.
    positions = (words_df == word).to_numpy().nonzero()[0]
    if positions.size == 0:
        print('not on list')
        return None
    return words_vec[positions]

In [106]:
# Sanity check: fetch the BERT vector for a single word.
w2vec('lamp', bert.word, bert_vec)

array([[ 0.03261244, -0.3644209 , -0.5248671 , ...,  0.17392747,
         0.7278073 ,  0.48814464]], dtype=float32)

### Run through the test cases

In [124]:
def cal_similarity(w1, w2, emb):
    """Return the cosine similarity of two words within one embedding.

    Args:
        w1: first word, looked up with ``w2vec``.
        w2: second word, looked up with ``w2vec``.
        emb: ``'bert'`` or ``'tasa'``; selects which module-level word list
            and vector matrix are used.

    Returns:
        The negated ``cosdis`` result converted with ``.numpy()``.

    Raises:
        ValueError: if ``emb`` is neither ``'bert'`` nor ``'tasa'``.
    """
    if emb == 'bert':
        words, vecs = bert.word, bert_vec
    elif emb == 'tasa':
        words, vecs = tasa.word, tasa_vec
    else:
        # The old code only printed this message and then failed at the
        # return statement because `words` / `vecs` were never bound.
        raise ValueError('Use option emb = "tasa" or "bert"')

    # Keras' cosine_similarity is a loss (negative cosine similarity), so the
    # sign is flipped to get the similarity itself.
    return -cosdis(w2vec(w1, words, vecs), w2vec(w2, words, vecs)).numpy()

In [None]:
from tqdm import tqdm

# Lower-case each word column once. The original re-lowercased the entire
# column on every loop iteration (four times per pair).
# NOTE(review): assumes the embedding word lists are lower-case — confirm.
cue_words = testcase_sem.cue.str.lower()
target_words = testcase_sem.target.str.lower()

bert_similarity = []
tasa_similarity = []

for cue_word, target_word in tqdm(
    zip(cue_words, target_words), total=len(testcase_sem)
):
    # cal_similarity returns an array; [0] takes its first element.
    bert_similarity.append(cal_similarity(cue_word, target_word, 'bert')[0])
    tasa_similarity.append(cal_similarity(cue_word, target_word, 'tasa')[0])

testcase_sem['tasa_similarity'] = tasa_similarity
testcase_sem['bert_similarity'] = bert_similarity

# Persist the scored test case next to the notebook.
testcase_sem.to_csv('testcase_sim.csv')

# Calculate correlations / cosine similarity

In [246]:
from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import cosine

# Column pairs to compare, in the order they are reported.
column_pairs = [
    ('tasa_similarity', 'bert_similarity'),
    ('osg', 'bert_similarity'),
    ('osg', 'tasa_similarity'),
]

# Pearson coefficients for all pairs first, then Spearman; [0] keeps the
# coefficient and drops the p-value.
for correlation in (pearsonr, spearmanr):
    for first_col, second_col in column_pairs:
        print(correlation(testcase_sem[first_col], testcase_sem[second_col])[0])

# scipy's `cosine` is a distance, so 1 - distance is the cosine similarity of
# the two columns treated as vectors.
for first_col, second_col in column_pairs:
    print(1 - cosine(testcase_sem[first_col], testcase_sem[second_col]))

0.02248188164808238
0.008490250197436912
0.21122959887120418
0.051495739760699454
0.025544238824187663
0.3157331741327457
0.5748138744242304
0.3189423197211554
0.3869941652670307


### Make plotting data file

In [220]:
# `rosg` is used below and in the plots, but no cell in this notebook creates
# it, so a fresh-kernel run would fail here.
# NOTE(review): the displayed values (e.g. 0.063246 ~= sqrt(0.004)) indicate
# that `rosg` is the square root of `osg` — confirm against the original cell.
if 'rosg' not in testcase_sem.columns:
    testcase_sem['rosg'] = np.sqrt(testcase_sem['osg'])

# Reshape to long format: one row per (pair, embedding) with its similarity.
pdf = testcase_sem.melt(
    id_vars=['cue', 'target', 'rosg'],
    value_vars=['tasa_similarity', 'bert_similarity'],
    var_name='embedding',
    value_name='similarity'
)

pdf

Unnamed: 0,cue,target,rosg,embedding,similarity
0,ACE,CARDS,0.063246,tasa_similarity,0.059783
1,ACE,HOLE,0.000000,tasa_similarity,-0.034480
2,ACE,KING,0.000000,tasa_similarity,0.065472
3,ACE,TOP,0.000000,tasa_similarity,-0.056699
4,ACE,SMART,0.000000,tasa_similarity,0.031158
...,...,...,...,...,...
25701,WEAVE,SPIN,0.000000,bert_similarity,0.924054
25702,WEAVE,STITCH,0.385487,bert_similarity,0.745930
25703,WEAVE,TIE,0.000000,bert_similarity,0.861274
25704,WEAVE,TIGHT,0.000000,bert_similarity,0.754696


## Plotting

In [240]:
# Check which columns are available on the test-case frame before plotting.
testcase_sem.columns

Index(['cue', 'target', 'osg', 'tasa_similarity', 'bert_similarity', 'rosg'], dtype='object')

In [244]:
import altair as alt
from random import sample
# Use Altair's default data transformer and lift its row limit.
alt.data_transformers.enable("default")
alt.data_transformers.disable_max_rows()

# Plot a fixed, reproducible sample of 300 word pairs.
samp_df = testcase_sem.sample(n=300, axis=0, random_state=9999)

# One interval selection shared by every panel, so brushing in one panel
# highlights the same pairs everywhere.
brush = alt.selection(type='interval')
base = alt.Chart(samp_df).add_selection(brush)

# Fields shown when hovering over a point.
tooltip_fields = (
    'cue:N', 'target:N', 'rosg:Q', 'tasa_similarity:Q', 'bert_similarity:Q'
)


def _similarity_points(y_field, selected_color):
    """Scatter of `rosg` against `y_field`; brushed points get `selected_color`."""
    return base.mark_point().encode(
        x=alt.X('rosg:Q', title='', scale=alt.Scale(domain=(0, 1))),
        y=alt.Y(y_field, title='', scale=alt.Scale(domain=(-1, 1))),
        color=alt.condition(
            brush, alt.value(selected_color), alt.value('grey')
        ),
        tooltip=list(tooltip_fields)
    )


def _similarity_ticks(y_field):
    """Tick strip along the y axis for `y_field`."""
    return base.mark_tick().encode(
        y=alt.Y(y_field, axis=tick_axis, scale=alt.Scale(domain=(-1, 1))),
        color=alt.condition(brush, alt.value('black'), alt.value('lightgrey'))
    )


# Configure the points
points_tasa = _similarity_points('tasa_similarity:Q', 'blue')
points_bert = _similarity_points('bert_similarity:Q', 'orange')

# Configure the ticks: axes without labels, domain line or tick marks.
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

x_ticks = base.mark_tick().encode(
    x=alt.X('rosg', axis=tick_axis, scale=alt.Scale(domain=(0, 1))),
    color=alt.condition(brush, alt.value('black'), alt.value('lightgrey'))
)

y_ticks_tasa = _similarity_ticks('tasa_similarity:Q')
y_ticks_bert = _similarity_ticks('bert_similarity:Q')

# Each panel: y tick strip | (scatter stacked above the x tick strip).
left = y_ticks_tasa | (points_tasa & x_ticks)
right = y_ticks_bert | (points_bert & x_ticks)

# TASA panel on the left, BERT panel on the right.
left | right

In [245]:
# Combine the two panels side by side and export the chart to an HTML file.
chart = left | right
chart.save('semantic_testcase.html')