-
Notifications
You must be signed in to change notification settings - Fork 0
/
patent_word_extraction.py
74 lines (58 loc) · 2.17 KB
/
patent_word_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import multiprocessing as mp
import nltk
from nltk.corpus import stopwords
import os
import re
# Use the standard nltk English stopwords. Probe the stopwords corpus itself (not an unrelated corpus) so that
# the download is triggered exactly when the data that is needed is missing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # corpus not installed yet: fetch it once, then load it
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
# words that are too common in patent text to be informative
domain_specific_stop_words = {'reference', 'references'}
stop_words |= domain_specific_stop_words
# file that receives one line of extracted words per patent document
output_file = 'all_patent_docs.txt'
# Characters deleted from every token before filtering: punctuation plus hyphens.
_STRIPPED_CHARS = str.maketrans('', '', "(),[]{}!.?@+_:;'$`&-")


def extract_words(filename, q):
    '''Extract the valid words of one patent document and push them onto the queue.

    Each line is split on tabs after leading whitespace has been removed; the token is taken from the second
    column, and lines with fewer than two columns are skipped. The token is lower-cased and stripped of
    punctuation and hyphens. It is kept only if it is not a stopword, doesn't contain any numeric digit and is at
    least four characters long.

    Args:
        filename: path of the patent text file to read.
        q: queue-like object; one string holding the kept words separated by single spaces is passed to q.put().
    '''
    words = []
    with open(filename, 'r') as f:
        for line in f:
            columns = line.lstrip().split('\t')
            if len(columns) < 2:
                continue
            # On a two-column line the token still carries the line terminator. Strip it so it can neither
            # defeat the stopword/length filters nor inject a line break into the one-line-per-document output.
            word = columns[1].strip().lower().translate(_STRIPPED_CHARS)
            if len(word) < 4 or word in stop_words or any(char.isdigit() for char in word):
                continue
            words.append(word)
    # join once instead of repeated string concatenation; an empty document yields ''
    q.put(' '.join(words))
def listener(q):
    '''Drain the queue into the output file, one line per message, until the 'kill' message arrives.

    The output file is opened for writing (truncating any previous content). Every message received before
    'kill' is converted to a string, written followed by a newline, and flushed immediately.
    '''
    with open(output_file, 'w') as sink:
        # two-argument iter() keeps calling q.get() and stops as soon as it returns the sentinel
        for document in iter(q.get, 'kill'):
            sink.write('%s\n' % (document,))
            sink.flush()
# Run the pipeline only when this file is executed as a script. Under the 'spawn' start method every worker
# process re-imports this module, so creating the pool at import time would make the workers try to start
# pools of their own.
if __name__ == '__main__':
    # set up queue
    manager = mp.Manager()
    q = manager.Queue()
    # The listener occupies one worker for the whole run, so give the pool one extra process; with a
    # single-process pool no worker would be left for extract_words and starmap would never return.
    pool = mp.Pool(mp.cpu_count() + 1)
    # activate listener
    watcher = pool.apply_async(listener, (q,))
    # get complete list of filenames where the patent documents reside
    patent_filenames = []
    for directory in os.listdir():
        # skip plain files that merely share the 'patents' prefix; os.listdir() would fail on them
        if directory.startswith('patents') and os.path.isdir(directory):
            patent_filenames.extend([(os.path.join(directory, filename), q) for filename in os.listdir(directory)
                                     if filename.endswith('.nlp')])
    try:
        # assign filenames in parallel to each of the worker processes
        pool.starmap(extract_words, patent_filenames)
    finally:
        # terminate listener, even if a worker failed, so that join() cannot wait on it forever
        q.put('kill')
        pool.close()
        pool.join()
    # re-raise any exception the listener hit (e.g. the output file could not be written)
    watcher.get()