In [190]:
# Import packages and read from ./data repo
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
# Directory (relative to the notebook) that holds the plain-text documents
corpus_root = 'data'
# Raw string: '\.' is a regex escape, not a Python string escape. The non-raw
# form triggers an "invalid escape sequence" warning on current Python versions.
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
ids = wordlists.fileids()

# use three documents as samples
d1 = wordlists.words(ids[0])
d2 = wordlists.words(ids[1])
d3 = wordlists.words(ids[2])

# Show the README that ships with the NLTK stopwords corpus
print(stopwords.readme())

Stopwords Corpus

This corpus contains lists of stop words for several languages.  These
are high-frequency grammatical words which are usually ignored in text
retrieval applications.

They were obtained from:
http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/

The English list has been augmented
https://github.com/nltk/nltk_data/issues/22

The German list has been corrected
https://github.com/nltk/nltk_data/pull/49

A Kazakh list has been added
https://github.com/nltk/nltk_data/pull/52




In [29]:
# Build one frequency distribution per sample document
fd1, fd2, fd3 = (nltk.FreqDist(doc) for doc in (d1, d2, d3))

# Number of hapaxes (words that appear exactly once) in each document
print(*(len(dist.hapaxes()) for dist in (fd1, fd2, fd3)))

727 1001 976


In [192]:
# Ten most frequent tokens of each document, before any cleaning
print(*(dist.most_common(10) for dist in (fd1, fd2, fd3)), sep='\n')

[('.', 497), ('the', 229), (',', 226), ('of', 186), ('-', 161), ('to', 122), ('in', 118), ('and', 113), ('students', 99), ('use', 85)]
[('.', 840), (',', 461), ('(', 303), (')', 233), ('and', 218), ('of', 202), ('1', 187), ('the', 175), ('4', 136), ('-', 133)]
[('.', 677), (',', 605), ('1', 298), (')', 266), ('of', 215), ('testing', 207), ('and', 185), ('(.', 178), ('-', 149), ('use', 146)]


In [None]:
# Extra, user-defined stop words applied on top of the NLTK list; extend as needed
my_stopwords = {'the', 'a', '.'}

In [176]:
# Returns a set of (lowercased word, frequency) pairs chosen based on given limitations
# param@words: list of words from input documents
# param@freq: frequency distributions for each word from words
# param@min_len: the minimum length of a word to be chosen
# param@min_freq: the minimum times a word appear in the doc
# param@my_stop: optional iterable of extra stop words to exclude (default: none)
def feature_extractor(words, freq, min_len, min_freq, my_stop=None):
    # None replaces the mutable set() default; the argument is now actually
    # used (the previous version silently read the global my_stopwords instead).
    extra_stop = set(my_stop) if my_stop is not None else set()

    # Build the combined stop list once instead of re-reading the NLTK list
    # for every candidate word.
    stop = set(stopwords.words(fileids='english')) | extra_stop

    # TODO: add a filter to check whether 'w' is an English word (with lexical data)
    chosen = set()
    for w in set(words):
        if len(w) < min_len:
            continue
        count = freq.get(w, 0)  # a word missing from freq counts as 0, not None
        if count < min_freq:
            continue
        lowered = w.lower()
        # The emitted word is lowercased, so the lowercased form is checked too;
        # otherwise a capitalised stop word would slip through as a feature.
        if w in stop or lowered in stop:
            continue
        chosen.add((lowered, count))
    return chosen

def print_size(feature, id):
    """Print how many entries `feature` holds, labelled with the document `id`."""
    print(f'Choosen {len(feature)!s} words from document {id!s}')

# Tunable thresholds: minimum word length 7 and minimum frequency 7
f1, f2, f3 = (
    feature_extractor(doc, dist, 7, 7, my_stopwords)
    for doc, dist in ((d1, fd1), (d2, fd2), (d3, fd3))
)

# Report how many words were selected from each document
print_size(f1, 1)
print_size(f2, 2)
print_size(f3, 3)

Choosen 52 words from document 1
Choosen 55 words from document 2
Choosen 57 words from document 3


In [182]:
# Merges several feature sets into one word -> summed frequency dictionary
def union_choosen_features(*args):
    """Combine sets of (word, frequency) pairs into a single dict.

    Frequencies of a word that occurs in more than one pair are added up.

    Raises:
        ValueError: if any argument is not exactly a `set`, or if fewer
            than two arguments are supplied.
    """
    for candidate in args:
        if type(candidate) is not set:
            raise ValueError('Input must be sets of word-frequency pairs.')
    if len(args) < 2:
        raise ValueError('At least 2 arguments needed.')

    totals = {}  # word -> accumulated frequency
    for feature_set in args:
        for pair in feature_set:
            word, count = pair[0], pair[1]
            if word in totals:
                totals[word] += count
            else:
                totals[word] = count
    return totals
        
# Merge the per-document feature sets into one word -> total frequency dict
union_features = union_choosen_features(f1, f2, f3)
print_size(union_features, '"union features"')
# Display the merged features, most frequent first. The previous version
# sorted `all_features`, a name that is never defined in this notebook.
sorted(union_features.items(), key=lambda x: x[1], reverse=True)

Choosen 121 words from document "union features"


[('testing', 251),
 ('students', 228),
 ('substance', 164),
 ('student', 145),
 ('schools', 116),
 ('marijuana', 109),
 ('reported', 62),
 ('extracurricular', 61),
 ('alcohol', 57),
 ('activities', 47),
 ('illicit', 47),
 ('national', 40),
 ('prevalence', 38),
 ('adolescent', 35),
 ('subject', 35),
 ('athlete', 34),
 ('results', 33),
 ('predominant', 32),
 ('education', 32),
 ('polysubstance', 32),
 ('treatment', 30),
 ('classes', 29),
 ('control', 28),
 ('population', 27),
 ('nonathlete', 27),
 ('institute', 27),
 ('associated', 26),
 ('substances', 25),
 ('significantly', 25),
 ('covered', 25),
 ('participation', 24),
 ('journal', 24),
 ('ethnicity', 23),
 ('effects', 23),
 ('significant', 22),
 ('measures', 22),
 ('associations', 21),
 ('perceived', 21),
 ('research', 21),
 ('districts', 20),
 ('drinking', 20),
 ('depressive', 20),
 ('symptoms', 19),
 ('consequences', 19),
 ('athletes', 19),
 ('general', 19),
 ('somatic', 18),
 ('evidence', 18),
 ('covariates', 18),
 ('american', 18

In [183]:
# Side note (safe to skip): creating numbered variables arg0..arg2 dynamically
# by writing into the namespace dictionary returned by vars()
for i in range(3):
    vars()['arg%d' % i] = 'Integer %d' % i
print(arg0, arg1, arg2)

Integer 0 Integer 1 Integer 2
