In [6]:
import re
from collections import Counter

In [18]:
# File name of the LIWC dictionary to process. The cells below interpolate it
# into both the raw input path and the cleaned output path.
filename = 'Serbian_LIWC2007_Dictionary.dic'

In [46]:
# Numeric tokens that are blanked out of every line of the raw dictionary.
# Each whole-token match is replaced by a tab, so the surrounding columns stay
# separated (read_dic later splits on any run of whitespace).
# NOTE(review): `l250` starts with a lowercase letter L -- possibly a typo for
# `250`, or a literal token present in the raw file. Confirm before changing.
stale_ids = re.compile(r'\b(230|156|l250|365|249|145)\b')

# NOTE(review): the file is opened with the platform default encoding, while
# read_dic reads the cleaned copy as utf-8-sig -- confirm these agree.
with open('../dicts/sentiment_dicts/liwc/raw/{}'.format(filename), 'r') as raw_dic:
    # Scrubbed lines, in file order; consumed by the cell that writes the output.
    new_file = []
    for raw_line in raw_dic:
        scrubbed = stale_ids.sub('\t', raw_line)
        # Spot check: show before/after for any line mentioning 230.
        if '230' in raw_line:
            print(raw_line)
            print(scrubbed)

        new_file.append(scrubbed)

momentalno	230	253	16	1				

momentalno			253	16	1				



In [47]:
# Persist the scrubbed lines one directory above `raw/`, under the same name.
# Each entry of `new_file` still carries its own line ending, so the lines are
# written back verbatim with no separator added.
with open('../dicts/sentiment_dicts/liwc/{}'.format(filename), 'w') as clean_dic:
    clean_dic.writelines(new_file)

## Test

In [48]:
def read_dic(filepath):
    '''
    Reads a LIWC lexicon from a file in the .dic format, returning a tuple of
    (lexicon, category_names, category_mapping), where:
    * `lexicon` is a dict mapping string patterns to lists of category names
    * `category_names` is a list of category names (as strings), in the order
      they are declared in the file
    * `category_mapping` is a dict mapping each category id (as a string) to
      its category name

    Raises KeyError if a lexicon entry references a category id that was not
    declared in the category section of the file.
    '''
    # category_mapping is a mapping from integer string to category name
    category_mapping = {}
    # category_names is equivalent to category_mapping.values() but retains original ordering
    category_names = []
    lexicon = {}
    # the mode is incremented by each '%' line in the file:
    # 0 = before the first '%', 1 = category definitions, 2 = lexicon entries
    mode = 0
    # utf-8-sig drops a leading byte-order mark if present; the `with` block
    # guarantees the handle is closed even if a malformed line raises.
    with open(filepath, encoding='utf-8-sig') as dic_file:
        for line in dic_file:
            # split() with no argument ignores leading/trailing whitespace, so
            # an empty result means the line was blank.
            parts = line.split()
            if not parts:
                continue
            if parts[0] == '%':
                mode += 1
            elif mode == 1:
                # defining categories: "<id> <name>"
                category_names.append(parts[1])
                category_mapping[parts[0]] = parts[1]
            elif mode == 2:
                # lexicon entry: "<pattern> <id> <id> ..."
                lexicon[parts[0]] = [category_mapping[category_id] for category_id in parts[1:]]
    return lexicon, category_names, category_mapping

def _build_trie(lexicon):
    '''
    Build a character-trie from the plain pattern_string -> categories_list
    mapping provided by `lexicon`.
    Some LIWC patterns end with a `*` to indicate a wildcard match.
    '''
    trie = {}
    for pattern, category_names in lexicon.items():
        cursor = trie
        for char in pattern:
            if char == '*':
                cursor['*'] = category_names
                break
            if char not in cursor:
                cursor[char] = {}
            cursor = cursor[char]
        cursor['$'] = category_names
    return trie


def _search_trie(trie, token, token_i=0):
    '''
    Search the given character-trie for paths that match the `token` string.
    '''
    if '*' in trie:
        return trie['*']
    elif '$' in trie and token_i == len(token):
        return trie['$']
    elif token_i < len(token):
        char = token[token_i]
        if char in trie:
            return _search_trie(trie[char], token, token_i + 1)
    return []

def load_token_parser(filepath):
    '''
    Reads a LIWC lexicon from a file in the .dic format and indexes it for
    token lookup, returning a tuple of (trie, category_mapping), where:
    * `trie` is the character-trie that `_build_trie` builds from the lexicon
      returned by `read_dic`
    * `category_mapping` is the category mapping returned by `read_dic`
    '''
    # The ordered category-name list from read_dic is not needed here.
    lexicon, _category_names, category_mapping = read_dic(filepath)
    return _build_trie(lexicon), category_mapping

def parse_token(token, trie):
    '''
    Generator over the category names that `_search_trie` finds for `token`
    in `trie`; yields nothing when there is no match.
    '''
    yield from _search_trie(trie, token)

In [49]:
# Load the cleaned dictionary written by the previous cell; `xwalk` is the
# category mapping returned alongside the trie.
trie, xwalk = load_token_parser(
    '../dicts/sentiment_dicts/liwc/{}'.format(filename)
)

In [50]:
# Tally the categories matched by each probe word.
# NOTE(review): 'fettet' does not look like a Serbian word, and the recorded
# output of this cell is an empty Counter -- confirm the probe word.
Counter(
    category
    for word in ['fettet']
    for category in parse_token(word, trie)
)

Counter()