### Setup

In [29]:
import pandas as pd
import corpora
from pathlib import Path

def get_words(corpus):
    """Return the set of distinct items found across all texts of a corpus.

    Args:
        corpus: Object exposing ``get_texts()``, which yields mappings that
            have a ``'text'`` entry. That entry is iterated item by item
            (presumably a list of word tokens -- confirm against ``corpora``).

    Returns:
        set: Every item seen in any ``'text'`` entry, without duplicates.
    """
    vocabulary = set()
    for entry in corpus.get_texts():
        # set.update consumes the iterable and adds each item individually.
        vocabulary.update(entry['text'])
    return vocabulary

# Maps each accented Latin letter to its unaccented base letter. Built from
# (base letter, accented variants) pairs so that every variant of a given
# letter sits on a single line.
DEACCENT_MAP = {
    accented: plain
    for plain, accented_forms in (
        ('A', 'ÀÁÂÃÄ'), ('a', 'àáâãä'),
        ('E', 'ÈÉÊË'), ('e', 'èéêë'),
        ('I', 'ÍÌÎÏ'), ('i', 'íìîï'),
        ('O', 'ÒÓÔÕÖ'), ('o', 'òóôõö'),
        ('U', 'ÙÚÛÜ'), ('u', 'ùúûü'),
        ('C', 'Ç'), ('c', 'ç'),
    )
    for accented in accented_forms
}

# Translation table in the form expected by str.translate / Series.str.translate.
deaccent_map = str.maketrans(DEACCENT_MAP)
# Project directories, resolved against the current working directory.
root = Path('.').absolute()
data_path = root / 'evaluation'
texts_path = root / 'texts'

# Build the corpus and collect every word it contains. The arguments are
# passed through unchanged; their meaning is defined by corpora.Corpus.
texts = corpora.Corpus(
    'texts', 'local', 1.0,
    min_token_len=2, max_token_len=20, min_sentence_len=None,
)
words_in_texts = get_words(texts)

# Load the SWOW associations. Only rows whose extra 'Unnamed: 3' column is
# empty are kept, after which that column is discarded.
swow = pd.read_csv(data_path / 'SWOWRP_words_associations.csv')
swow = swow.loc[swow['Unnamed: 3'].isna()].drop(columns=['Unnamed: 3'])

### Preprocessing

In [30]:
# Inclusive bounds on cue length, in characters.
MIN_CUE_LEN = 3
MAX_CUE_LEN = 20

cues = swow['cue']
# Build one boolean mask instead of dropping by index label four times.
# Missing (NaN) cues are rejected explicitly: without na=False, str.contains
# returns NaN for them and the resulting mask cannot be used for indexing.
keep_cue = (
    cues.notna()
    # Keep single-word cues only
    & ~cues.str.contains(' ', regex=False, na=False)
    # Keep cues made only of Latin letters (the pattern also tolerates
    # space, newline and '/')
    & ~cues.str.contains('[^ \nA-Za-zÀ-ÖØ-öø-ÿ/]+', regex=True, na=False)
    # Keep cues whose length is within [MIN_CUE_LEN, MAX_CUE_LEN];
    # a NaN length compares False and is therefore dropped
    & cues.str.len().between(MIN_CUE_LEN, MAX_CUE_LEN)
)
# .copy() yields an independent frame so the assignment below cannot trigger
# a SettingWithCopyWarning; original index labels are preserved.
swow = swow.loc[keep_cue].copy()
# Deaccent cues
swow['cue'] = swow['cue'].str.translate(deaccent_map)

### Divide into sets

In [33]:
# Split the rows by whether their cue occurs among the corpus words.
# The membership mask is computed once and reused for both halves.
cue_in_corpus = swow['cue'].isin(words_in_texts)
swow_in_texts = swow[cue_in_corpus]
swow_outside_texts = swow[~cue_in_corpus]

# Report the number of distinct cues on each side of the split.
print(f"Cues in texts: {len(swow_in_texts['cue'].unique())}")
print(f"Cues outside of texts: {len(swow_outside_texts['cue'].unique())}")

Cues in texts: 2134
Cues outside of texts: 10954


In [34]:
# Write both subsets to the evaluation directory, without the index column.
for subset, filename in (
    (swow_in_texts, 'swow_in_texts.csv'),
    (swow_outside_texts, 'swow_outside_texts.csv'),
):
    subset.to_csv(data_path / filename, index=False)