/
utils.py
79 lines (65 loc) · 2.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 10 12:52:44 2016
@author: salo
"""
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
# Single module-level Porter stemmer instance; tokenize() passes it to
# stem_tokens() so one stemmer object is reused across calls.
stemmer = PorterStemmer()
#stop = stopwords.words("english")
def stem_tokens(tokens, stemmer):
    """
    Stem every token in ``tokens`` with the supplied stemmer.

    Adapted from
    http://stackoverflow.com/questions/26126442/combining-text-stemming-and-
    removal-of-punctuation-in-nltk-and-scikit-learn

    Parameters
    ----------
    tokens : iterable
        Tokens to stem; each one is passed to ``stemmer.stem``.
    stemmer : object
        Any object exposing a ``stem(token)`` method.

    Returns
    -------
    list
        One ``stemmer.stem(token)`` result per input token, in input order.
        An empty input yields an empty list.
    """
    return [stemmer.stem(token) for token in tokens]
def tokenize(text):
    """
    Split ``text`` into word tokens and return the stemmed tokens.

    The text is tokenized with ``word_tokenize`` and the resulting tokens
    are stemmed via ``stem_tokens`` using the module-level ``stemmer``.

    Approach taken from
    http://stackoverflow.com/questions/26126442/combining-text-stemming-and-
    removal-of-punctuation-in-nltk-and-scikit-learn

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        Stemmed tokens, in the order produced by the tokenizer.
    """
    return stem_tokens(word_tokenize(text), stemmer)
def cogpo_columns(columns):
    """
    Select the known column headers that are present in ``columns``.

    Parameters
    ----------
    columns : container
        Anything supporting the ``in`` operator (e.g. a list of column
        names); each known header is looked up in it.

    Returns
    -------
    dict
        Maps every known header found in ``columns`` to its short name from
        the table below. Entries follow the table's order, not the order of
        ``columns``; headers not in the table are ignored.
    """
    # (column header, short name) pairs, in the order they are reported.
    known_headers = (
        ("Paradigm Class", "ParadigmClass"),
        ("Behavioral Domain", "BehavioralDomain"),
        ("Diagnosis", "Diagnosis"),
        ("Stimulus Modality", "StimModality"),
        ("Stimulus Type", "StimType"),
        ("Response Modality", "RespModality"),
        ("Response Type", "RespType"),
        ("Instructions", "Instruction"),
        ("Context", "Context"),
    )
    present = {}
    for header, short_name in known_headers:
        if header in columns:
            present[header] = short_name
    return present
def clean_str(str_):
    """
    Normalise a raw label string into a compact identifier.

    Substitutions are applied strictly in the order listed below: spaces and
    apostrophes are removed, ``(Overt)``/``(Covert)`` become dotted suffixes,
    ``Stroop-`` becomes ``Stroop.``, remaining hyphens and slashes are
    removed, and finally any parenthesised text still present is deleted.

    Parameters
    ----------
    str_ : str
        Label text to clean.

    Returns
    -------
    str
        The cleaned label.
    """
    # Order matters: the "(Overt)"/"(Covert)" and "Stroop-" rewrites must run
    # before the generic '-' removal and before parentheses are stripped.
    substitutions = (
        (' ', ''),
        ("'", ''),
        ('(Overt)', '.Overt'),
        ('(Covert)', '.Covert'),
        ('Stroop-', 'Stroop.'),
        ('-', ''),
        ('/', ''),
    )
    cleaned = str_
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    # Drop whatever parenthesised segments are left after the rewrites above.
    return re.sub(r'\([^)]*\)', '', cleaned)
def get_label_parents(df, column, dimension):
    """
    Create full list of labels (and their parents) in DataFrame column.

    Every non-null cell of ``df[column]`` is split on ``'| '`` into
    individual labels and each label is normalised with ``clean_str``. For
    each dotted label, all of its ancestors are appended as well
    (``A.B.C`` contributes ``A.B`` and ``A``). Duplicates are kept. Finally
    every entry is prefixed with ``"<dimension>."``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label column.
    column : str
        Name of the column to read; non-null cells must be strings
        (``str.split`` is called on them).
    dimension : str
        Prefix prepended to every returned label.

    Returns
    -------
    list of str
        The cleaned labels first (in cell order), followed by their parents
        one hierarchy level at a time, all prefixed with ``dimension``.
    """
    # Cells are consumed exactly as stored. No Series.apply step is needed:
    # apply returns a new Series instead of modifying in place, so calling it
    # without using the result would only waste work.
    cells = df[column].dropna().tolist()
    labels = [clean_str(label) for cell in cells for label in cell.split('| ')]

    # Climb the dotted hierarchy one level per pass: each pass strips the
    # last component from every label that still has one, and stops once no
    # label in the current level contains a '.'.
    level = labels
    while level:
        level = [item.rsplit('.', 1)[0] for item in level if '.' in item]
        labels.extend(level)

    return ['{0}.{1}'.format(dimension, label) for label in labels]