### Set up

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# The data lives in the 'evaluation' folder next to the current working directory's parent.
root = Path.cwd().parent
data_path = root / 'evaluation'

# Read the tab-separated association table, drop the `N` column, switch to the
# short column names used in the rest of the notebook and keep a single row
# per (cue, answer) pair.
swow = (
    pd.read_csv(data_path / 'SWOWRP_words_associations.csv', sep='\t')
    .drop(columns=['N'])
    .rename(columns={'response': 'answer', 'R1': 'n', 'R1.Strength': 'freq'})
    .drop_duplicates(subset=['cue', 'answer'])
)

### Preprocessing

In [4]:
# Apply the same cleaning rules to the cue and the answer column.
# The text is lower-cased BEFORE filtering so that capitalised accented words
# (e.g. 'África') are judged exactly like their lower-case form instead of
# being rejected by the character filter.
# NOTE(review): lower-casing can make two previously distinct (cue, answer)
# rows identical; confirm whether they should be aggregated afterwards.
for keyword in ['cue', 'answer']:
    words = swow[keyword].str.lower()
    # Entries made of more than one word
    multi_word = words.str.contains(' ', regex=False, na=False)
    # Entries containing characters other than latin letters or '/'.
    # 'ü' is listed explicitly because the á-ú range does not include it.
    bad_chars = words.str.contains('[^ \nA-Za-zá-úüñ/]+', na=False)
    # Only entries of 3 to 20 characters are kept. Missing values fail this
    # test, so they are dropped rather than breaking the boolean mask.
    good_length = words.str.len().between(3, 20)
    keep = ~multi_word & ~bad_chars & good_length
    swow = swow.assign(**{keyword: words}).loc[keep]

#### Get reciprocal cue-answer word pairs (A → B and B → A)

In [None]:
# A pair is reciprocal when both (cue=a, answer=b) and (cue=b, answer=a) exist.
# Swapping the two columns and joining on BOTH keys selects those rows directly.
# Joining on 'cue' alone and filtering afterwards gives the same rows, but first
# materialises every combination of rows that merely share one word.
swow_renamed = swow.rename(columns={'answer': 'cue', 'cue': 'answer'})
swow_merged = pd.merge(swow, swow_renamed, on=['cue', 'answer']).drop(columns=['freq_x', 'freq_y'])
# Average the number of responses of the two directions (integer division)
same_word_pairs = swow_merged.assign(n=(swow_merged['n_x'] + swow_merged['n_y']) // 2)
same_word_pairs = same_word_pairs.drop(columns=['n_x', 'n_y'])

#### Compute frequency of answer with new n

In [None]:
# Attach the averaged count, where one exists, to every original pair
counts = swow.merge(same_word_pairs, how='left', on=['cue', 'answer'])
# Prefer the averaged count (`n_y`) and fall back to the original one (`n_x`)
counts['n'] = counts['n_y'].fillna(counts['n_x'])
swow_merged = counts.drop(columns=['n_x', 'n_y', 'freq'])
swow_merged['n'] = swow_merged['n'].astype(int)
# Share of each answer's count within the total count of its cue
cue_totals = swow_merged.groupby('cue')['n'].transform('sum')
swow_merged['freq'] = swow_merged['n'] / cue_totals

#### Remove duplicate word pairs regardless of direction

In [None]:
# Put the two words of every row in alphabetical order so that (a, b) and
# (b, a) become the same pair, then keep the first occurrence of each pair.
# NOTE(review): `freq` was normalised per cue before this reordering, so for
# rows whose words get swapped it still refers to the original cue - confirm
# this is intended.
pair_columns = ['cue', 'answer']
ordered_pairs = np.sort(swow_merged[pair_columns].to_numpy(), axis=1)
swow_merged[pair_columns] = pd.DataFrame(ordered_pairs, index=swow_merged.index)
swow_merged = swow_merged.drop_duplicates(subset=pair_columns)

### Save to file

In [5]:
# Write the processed table as comma-separated text, without the index.
# NOTE(review): this is the same path the set-up cell reads as a tab-separated
# file, so the raw input is overwritten and a second run of the notebook would
# load the processed file instead - consider writing to a separate file.
output_file = data_path / 'SWOWRP_words_associations.csv'
swow_merged.to_csv(output_file, index=False)