### Setup

In [23]:
import pandas as pd
from corpora import Corpus
from pathlib import Path

def get_words(corpus):
    """Collect the distinct tokens found across all texts of a corpus.

    Args:
        corpus: object exposing ``get_texts()``, which yields items whose
            ``'text'`` entry is an iterable of tokens.

    Returns:
        set: every element seen in any ``'text'`` entry, without duplicates.
    """
    vocabulary = set()
    for entry in corpus.get_texts():
        # Adds each element of the iterable individually, skipping repeats.
        vocabulary.update(entry['text'])
    return vocabulary

# Maps each accented vowel (and the cedilla) to its plain ASCII letter,
# for both upper and lower case. Variants are grouped by target letter;
# group order and the order inside each group fix the insertion order.
DEACCENT_MAP = {
    accented: plain
    for plain, variants in (
        ('A', 'ÀÁÂÃÄ'), ('a', 'àáâãä'),
        ('E', 'ÈÉÊË'), ('e', 'èéêë'),
        ('I', 'ÍÌÎÏ'), ('i', 'íìîï'),
        ('O', 'ÒÓÔÕÖ'), ('o', 'òóôõö'),
        ('U', 'ÙÚÛÜ'), ('u', 'ùúûü'),
        ('C', 'Ç'), ('c', 'ç'),
    )
    for accented in variants
}

# `root` is the parent of the current working directory; data folders hang off it.
root = Path('.').absolute().parent
data_path = root / 'evaluation'
texts_path = root / 'texts'
swow_path = data_path / 'SWOWRP_words_associations.csv'

# Every distinct token present in the stimuli corpus.
words_in_texts = get_words(Corpus('../stimuli', 'local', 1.0, min_token_len=2, max_token_len=20, min_sentence_len=None))

# The file is first read as tab-delimited. The "Save to file" cell later
# writes the cleaned table back to this same path, comma-separated, so on a
# re-run the tab read yields a single column and we fall back to commas.
swow = pd.read_csv(swow_path, delimiter='\t')
if swow.shape[1] == 1:
    swow = pd.read_csv(swow_path)

# `errors='ignore'` and the no-op rename keep this cell re-runnable on the
# already-cleaned file, where 'N' is gone and the headers are already renamed.
swow = (
    swow
    .drop(columns=['N'], errors='ignore')
    .rename(columns={'response': 'answer', 'R1': 'n', 'R1.Strength': 'freq'})
)

### Preprocessing

In [24]:
# Accepted word length (inclusive) for both cues and answers.
MIN_WORD_LEN, MAX_WORD_LEN = 3, 20

# Apply the same filters to cues and answers. Each filter is row-wise, so
# combining them into one keep-mask per column gives the same rows as
# dropping them one filter at a time.
for column in ('cue', 'answer'):
    words = swow[column]
    keep = (
        # Missing values would otherwise produce an NA mask and make the
        # boolean selection raise; treat them as invalid entries.
        words.notna()
        # Single words only.
        & ~words.str.contains(' ', na=False)
        # Reject anything outside unaccented ASCII letters (the pattern also
        # tolerates '/', space and newline).
        & ~words.str.contains('[^ \nA-Za-z/]+', na=False)
        # Length between 3 and 20 characters, both ends included.
        & words.str.len().between(MIN_WORD_LEN, MAX_WORD_LEN)
    )
    swow = swow[keep]

# To lower; `assign` builds a new frame, so no chained-assignment warning.
swow = swow.assign(cue=swow['cue'].str.lower(),
                   answer=swow['answer'].str.lower())

### Save to file

In [25]:
# Persist the cleaned associations without the index column (to_csv writes
# comma-separated values by default).
# NOTE(review): this is the same path the set-up cell reads as a tab-delimited
# file, so the original input is overwritten by a comma-separated table whose
# columns are already renamed — confirm this is intended before re-running.
swow.to_csv(data_path / 'SWOWRP_words_associations.csv', index=False)

### Divide into sets

In [26]:
# Flag the rows whose cue occurs in the stimuli texts, then split on that flag.
cue_in_texts = swow['cue'].isin(words_in_texts)
swow_in_texts = swow.drop(index=swow.index[~cue_in_texts])
swow_outside_texts = swow.drop(index=swow.index[cue_in_texts])

# Report how many distinct cues ended up on each side of the split.
n_cues_in = swow_in_texts['cue'].nunique(dropna=False)
n_cues_out = swow_outside_texts['cue'].nunique(dropna=False)
print(f'Cues in texts: {n_cues_in}')
print(f'Cues outside of texts: {n_cues_out}')

Cues in texts: 1857
Cues outside of texts: 9221
