### Set up

In [57]:
import pandas as pd
import numpy as np
from pathlib import Path

# The data directory is "<parent of the current working directory>/evaluation"
root = Path.cwd().parent
data_path = root / 'evaluation'

# Word list used further down to filter cues and answers
words_freq = pd.read_csv(data_path / 'words_freq.csv')

# Tab-separated association data: discard column N, switch to the column
# names used in the rest of the notebook and keep the first occurrence of
# every (cue, answer) pair
swow = (
    pd.read_csv(data_path / 'SWOWRP_words_associations.csv', delimiter='\t')
    .drop(columns=['N'])
    .rename(columns={'response': 'answer', 'R1': 'n', 'R1.Strength': 'freq'})
    .drop_duplicates(subset=['cue', 'answer'])
)

### Preprocessing

In [58]:
# One or more characters other than space, newline, ASCII letters, the
# range á-ú, ñ or '/'.
# NOTE(review): á-ú spans U+00E1-U+00FA, so 'ü' (U+00FC) and upper-case
# accented letters are treated as invalid - confirm this is intended.
non_latin_pattern = '[^ \nA-Za-zá-úñ/]+'
# Letters, later a character that is neither whitespace nor a letter, then
# letters again (e.g. "a/b"). Raw string: '\s' is not a valid escape sequence
# in a regular string literal.
invalid_inbetween_pattern = r'[A-Za-zá-úñ]+.*?[^\sA-Za-zá-úñ][A-Za-zá-úñ]+'

# Missing cues/answers make the .str tests below return NaN, which cannot be
# used as a boolean mask; remove those rows up front.
swow = swow.dropna(subset=['cue', 'answer'])

for keyword in ['cue', 'answer']:
    values = swow[keyword]
    lengths = values.str.len()
    invalid = (
        # More than one word
        values.str.contains(' ', regex=False)
        # Not composed of latin letters, or invalid characters in between
        | values.str.contains(non_latin_pattern)
        | values.str.contains(invalid_inbetween_pattern)
        # Shorter than 3 or longer than 20 characters
        | (lengths < 3)
        | (lengths > 20)
    )
    swow = swow[~invalid].copy()
    # NOTE(review): duplicates were removed before lower-casing, so pairs that
    # differ only in case become duplicates here - confirm whether they
    # should be merged.
    swow[keyword] = swow[keyword].str.lower()

# Filter out cues and answers that are not in EsPal (words_freq)
swow = swow[swow['cue'].isin(words_freq['word'])]
swow = swow[swow['answer'].isin(words_freq['word'])]

#### Get reciprocal word pairs (cue → answer pairs that also appear as answer → cue)

In [60]:
# Snapshot of the cues that exist after preprocessing
original_cues = swow['cue'].unique().copy()

# Mirror image of swow: a row (cue=a, answer=b) becomes (cue=b, answer=a)
swow_renamed = swow.rename(columns={'answer': 'cue', 'cue': 'answer'})

# Joining on BOTH keys keeps exactly the pairs observed in both directions:
# n_x is the count for cue -> answer, n_y the count for answer -> cue.
# (Joining on 'cue' alone and filtering afterwards yields the same pairs but
# first builds a much larger many-to-many intermediate table.)
swow_merged = pd.merge(swow, swow_renamed, on=['cue', 'answer']).drop(columns=['freq_x', 'freq_y'])
same_word_pairs = swow_merged.copy()

# Average the number of responses for the same word pairs (floor division
# keeps the count an integer)
same_word_pairs['n'] = (same_word_pairs['n_x'] + same_word_pairs['n_y']) // 2
same_word_pairs.drop(columns=['n_x', 'n_y'], inplace=True)

#### Recompute answer frequencies using the new n

In [61]:
# Left join: every row of swow is kept; rows that have a reciprocal pair
# receive the averaged count as n_y, the others get a missing n_y.
swow_merged = (
    swow.merge(same_word_pairs, how='left', on=['cue', 'answer'])
    # Prefer the averaged count and fall back to the original one (n_x)
    .assign(n=lambda df: df['n_y'].fillna(df['n_x']).astype(int))
    # The old freq no longer matches n, so it is discarded and rebuilt below
    .drop(columns=['n_x', 'n_y', 'freq'])
)
# Share of each answer in the total count of its cue
swow_merged['freq'] = swow_merged['n'] / swow_merged.groupby('cue')['n'].transform('sum')

#### Get rid of repeated word pairs and retain only original cues

In [62]:
# Order the two words of every row alphabetically so that (a, b) and (b, a)
# become the same row, then keep only the first occurrence of each pair.
# NOTE(review): freq was normalised per cue before this re-ordering, so for
# swapped rows it still refers to the word now stored in 'answer' - confirm.
pair_columns = ['cue', 'answer']
ordered_pairs = np.sort(swow_merged[pair_columns].to_numpy(), axis=1)
swow_merged[pair_columns] = pd.DataFrame(ordered_pairs, index=swow_merged.index)
swow_merged = swow_merged.drop_duplicates(subset=pair_columns)

# Keep only rows whose (re-ordered) cue was a cue before the re-ordering.
# NOTE(review): pairs whose alphabetically first word was never a cue are
# dropped entirely here - confirm this is intended.
swow_merged = swow_merged.loc[swow_merged['cue'].isin(original_cues)]

### Save to file

In [64]:
# Write the processed associations as comma-separated values, without the
# index column.
# NOTE(review): this is the same path that the set-up cell reads as a
# tab-separated file, so the input is overwritten - confirm this is intended.
output_file = data_path / 'SWOWRP_words_associations.csv'
swow_merged.to_csv(output_file, index=False)