In [1]:
# import nltk
# nltk.download()

In [16]:
# Import packages and read from ./data repo
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
corpus_root = 'data/drug_data'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
ids = wordlists.fileids()

# use three documents as samples
corp = {}
for x in range(0,10):
    corp["d{0}".format(x)]=wordlists.words(ids[x])

In [25]:
#corp["d0"]

In [4]:
print(stopwords.readme())

# Get the frequency distribution for each document
fd = {}
for x in range(0,10):
    fd["fd{0}".format(x)]=nltk.FreqDist(corp["d{0}".format(x)])

# the words that only shown once
print(
    len(fd['fd1'].hapaxes()),
    len(fd['fd2'].hapaxes()),
    len(fd['fd3'].hapaxes()),
    )

Stopwords Corpus

This corpus contains lists of stop words for several languages.  These
are high-frequency grammatical words which are usually ignored in text
retrieval applications.

They were obtained from:
http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/

The English list has been augmented
https://github.com/nltk/nltk_data/issues/22

The German list has been corrected
https://github.com/nltk/nltk_data/pull/49

A Kazakh list has been added
https://github.com/nltk/nltk_data/pull/52


(1059, 1135, 994)


In [5]:
# most common words before cleaning
print(fd['fd1'].most_common(10))
print(fd['fd2'].most_common(10))
print(fd['fd3'].most_common(10))

[(u'.', 1312), (u',', 986), (u'(', 545), (u'of', 381), (u'and', 344), (u'the', 307), (u'sexual', 285), (u'1', 250), (u'\u2013', 231), (u')', 228)]
[(u'.', 912), (u',', 736), (u'and', 382), (u'of', 358), (u'(', 352), (u'the', 291), (u'harassment', 211), (u'sexual', 194), (u')', 186), (u'a', 153)]
[(u'.', 1150), (u',', 561), (u'and', 323), (u'of', 313), (u'(', 292), (u'the', 279), (u'-', 146), (u').', 135), (u'in', 133), (u'to', 131)]


In [26]:
my_stopwords = set(['the', 'a', '.', 'studies', 'students', 'schools', 'prevalence', 'national', 'general', 'whether', 'statistically', 'probabilities', 'covered', 'subject', 'associated', 'indicate', 'testing', 'research'])  # we can define our own stop words in here

In [27]:
# Returns a set of word-frequency pairs chosen based on given limitations
# param@words: list of words from input documents
# param@freq: frequency distributions for each word from words
# param@min_len: the minimum length of a word to be chosen
# param@min_freq: the minimum times a word appear in the doc
def feature_extractor(words, freq, min_len, min_freq, my_stop=set()):
    return(set([(w.lower(), freq.get(w))
           for w in set(words)
           if len(w) >= min_len
           and freq.get(w) >= min_freq
           and w not in stopwords.words(fileids='english')
           and w not in my_stopwords]))
    # I want to add a filter to check whether the 'w' is a English word (with Lexical data)

def print_size(feature, id):
    print(' '.join(['Choosen', str(len(feature)), 'words from document', str(id)]))

# The numerical parameters in here can be tuned
f = {}
for x in range(0,10):
    f["f{0}".format(x)]=feature_extractor(corp['d{0}'.format(x)], fd['fd{0}'.format(x)], 7, 7, my_stopwords)

print_size(f['f1'], 1)
print_size(f['f2'], 2)
print_size(f['f3'], 3)

UnicodeDecodeError: 'utf8' codec can't decode byte 0xb1 in position 22: invalid start byte

In [13]:
# Returns a dictionary of word-frequency pairs from the union of several feature sets
def union_choosen_features(setlist):
#     if (not all([type(arg) is set for arg in args])):
#          raise ValueError('Input must be sets of word-frequency pairs.')
#     if (len(args) < 2):
#         raise ValueError('At least 2 arguments needed.')
    N = len(setlist)  # number of documents
    result = dict()  # word:frequency pairs
    for i in range(N):
        for w in setlist[i]:  # access each feature set
            if w[0] in result.keys():
                result[w[0]] += w[1]
            else:
                result[w[0]] = w[1]
    return result
       
union_features = union_choosen_features(f.values())
print_size(union_features, '\"union features\"')
sorted(union_features.items(), key=lambda x: x[1], reverse=True)

Choosen 311 words from document "union features"


[(u'harassment', 1198),
 (u'victimization', 633),
 (u'bullying', 470),
 (u'research', 263),
 (u'violence', 223),
 (u'journal', 222),
 (u'reported', 181),
 (u'bullied', 172),
 (u'climate', 165),
 (u'adolescents', 141),
 (u'american', 130),
 (u'ethnicity', 125),
 (u'orientation', 122),
 (u'harassed', 113),
 (u'experience', 111),
 (u'sexually', 106),
 (u'behavior', 106),
 (u'lesbian', 100),
 (u'distressing', 98),
 (u'experiences', 94),
 (u'bisexual', 93),
 (u'females', 92),
 (u'student', 92),
 (u'relationships', 90),
 (u'significant', 88),
 (u'perpetration', 87),
 (u'factors', 85),
 (u'university', 81),
 (u'adolescent', 77),
 (u'support', 76),
 (u'differences', 76),
 (u'psychology', 74),
 (u'heterosexual', 72),
 (u'identity', 70),
 (u'different', 67),
 (u'physical', 67),
 (u'response', 66),
 (u'experienced', 65),
 (u'attitudes', 64),
 (u'transgender', 64),
 (u'outcomes', 61),
 (u'significantly', 60),
 (u'internet', 60),
 (u'context', 60),
 (u'espelage', 59),
 (u'mitchell', 58),
 (u'distre

In [183]:
# The way to assign a sequence of variables (you can safely ignore this part)
for i in range(3):
    vars()[''.join(['arg', str(i)])] = ' '.join(['Integer', str(i)])
print(arg0, arg1, arg2)

Integer 0 Integer 1 Integer 2


In [20]:
all_data = {}
def main():
    #create the dictionary
    keys = ["Drug", "Harrass", "Bullying", "Vandalism"]
    paths = ["data/drug_data", "data/SH_data", "data/bully_data", "data/vandalism_data"]
    for path in paths:
        print(path)
        z = 0
        corpus_root = path
        wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
        ids = wordlists.fileids()

        # collect all the documents from the folder
        corp = {}
        for x in range(0,10):
            corp["d{0}".format(x)]=wordlists.words(ids[x])

        fd = {}
        for x in range(0,10):
            fd["fd{0}".format(x)]=nltk.FreqDist(corp["d{0}".format(x)])
        
        f = {}
        for x in range(0,10):
            f["f{0}".format(x)]=feature_extractor(corp["d{0}".format(x)], fd['fd{0}'.format(x)], 7, 7, my_stopwords)
        
        union_features = union_choosen_features(f.values())
        print_size(union_features, '\"union features\"')
        print(keys[z])
        all_data[keys[z]] = sorted(union_features.items(), key=lambda x: x[1], reverse=True)
        z+=1
main()  
print(all_data['Drug'])

data/drug_data
Choosen 443 words from document "union features"
data/SH_data
Choosen 311 words from document "union features"
data/bully_data
Choosen 274 words from document "union features"
data/vandalism_data
Choosen 340 words from document "union features"
[(u'vandalism', 684), (u'behavior', 184), (u'student', 182), (u'residence', 166), (u'education', 100), (u'teachers', 97), (u'research', 94), (u'community', 87), (u'journal', 75), (u'discipline', 70), (u'environment', 70), (u'university', 70), (u'project', 70), (u'property', 67), (u'analysis', 66), (u'activity', 60), (u'teacher', 59), (u'content', 56), (u'treatment', 53), (u'illinois', 53), (u'significantly', 52), (u'downloaded', 52), (u'parents', 52), (u'control', 51), (u'actions', 50), (u'programs', 50), (u'college', 48), (u'individual', 46), (u'relationship', 46), (u'process', 46), (u'reported', 44), (u'butterworth', 38), (u'however', 37), (u'evidence', 37), (u'alcohol', 36), (u'results', 35), (u'incidence', 35), (u'punishment',

In [24]:
all_data.keys()

['Drug']