-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing_truecase.py
59 lines (49 loc) · 2.23 KB
/
preprocessing_truecase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import random
import subprocess
import sys
def preprocessing_input(filename):
    """Truecase the word column of a CoNLL-style file with the AllenNLP truecaser.

    Reads ``<filename>_cased.txt`` (one token per line with at least four
    whitespace-separated columns; any shorter line ends the current sentence),
    writes the lowercased sentences, one per line, to
    ``<dir>/truecaser/<base>_truecase_input.txt``, runs ``allennlp predict`` on
    that file, and finally writes ``<filename>_truecased.txt`` where the first
    column of every token is replaced by the truecased word.

    Args:
        filename: Path prefix of the dataset using '/' separators, for example
            'conll/train/conll_train'.

    Raises:
        subprocess.CalledProcessError: If the ``allennlp`` command exits with a
            non-zero status.
        ValueError: If the truecaser output does not line up with the parsed
            input (different number of sentences, or of words in a sentence).
    """
    # parse the (cased) input file
    with open(filename + '_cased.txt', 'r') as infile:
        doc = _parse_sentences(infile)

    # create a text file to be truecased (one sentence per line)
    # for example, if filename is 'conll/train/conll_train' then
    # truecase_prefix is 'conll/train/truecaser/conll_train'
    truecase_prefix = _truecaser_prefix(filename)
    truecase_dir = os.path.dirname(truecase_prefix)
    if truecase_dir:
        # The truecaser working directory may not exist yet.
        os.makedirs(truecase_dir, exist_ok=True)
    truecase_input = truecase_prefix + '_truecase_input.txt'
    truecase_output = truecase_prefix + '_truecase_output.txt'
    with open(truecase_input, 'w') as outfile:
        for sentence in doc:
            outfile.write(" ".join([token[0].lower() for token in sentence]) + "\n")

    # truecase that file. This takes time.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check=True stops us from merging a missing or
    # stale output file when the truecaser fails.
    subprocess.run(
        [
            "allennlp", "predict", "wiki-truecaser-model.tar.gz",
            truecase_input,
            "--output-file", truecase_output,
            "--include-package", "mylib",
            "--use-dataset-reader",
            "--predictor", "truecaser-predictor",
            "--silent",
        ],
        check=True,
    )

    # replace token[0] (i.e. the word) in each token with the truecased word
    with open(truecase_output, 'r') as infile:
        truecased_lines = infile.readlines()
    _apply_truecased_words(doc, truecased_lines)

    # write that to a text file: one token per line, blank line after each sentence
    with open(filename + '_truecased.txt', 'w') as outfile:
        for sentence in doc:
            outfile.writelines(" ".join(token) + "\n" for token in sentence)
            outfile.write("\n")


def _parse_sentences(lines):
    """Group token lines into sentences (lists of column lists).

    A line with at least four whitespace-separated columns is a token; any
    other line closes the current sentence. Empty sentences (e.g. from
    consecutive blank lines) are skipped so that every sentence maps to exactly
    one non-empty line of truecaser input.
    """
    doc = []
    sentence = []
    for line in lines:
        columns = line.split()
        if len(columns) >= 4:  # line with data
            sentence.append(columns)
        elif sentence:  # end of a non-empty sentence
            doc.append(sentence)
            sentence = []
    if sentence:
        doc.append(sentence)
    return doc


def _truecaser_prefix(filename):
    """Return the path prefix inside the sibling 'truecaser/' directory.

    'a/b/name' -> 'a/b/truecaser/name'; a bare 'name' -> 'truecaser/name'
    (relative, rather than the root-anchored '/truecaser/name').
    """
    directory, separator, basename = filename.rpartition('/')
    return directory + separator + 'truecaser/' + basename


def _apply_truecased_words(doc, truecased_lines):
    """Overwrite the word column of every token in doc with its truecased form.

    The document marker is always restored to '-DOCSTART-' regardless of how
    the truecaser cased it. Raises ValueError on any sentence/word count
    mismatch instead of silently writing misaligned words.
    """
    if len(truecased_lines) != len(doc):
        raise ValueError(
            "truecaser returned %d lines for %d sentences"
            % (len(truecased_lines), len(doc))
        )
    for i_line, (sentence, line) in enumerate(zip(doc, truecased_lines)):
        words = line.strip().split(" ")
        if len(words) != len(sentence):
            raise ValueError(
                "truecaser returned %d words for the %d tokens of sentence %d"
                % (len(words), len(sentence), i_line)
            )
        for token, word in zip(sentence, words):
            if word.lower() != '-docstart-':
                token[0] = word  # replace the word with the truecased version
            else:
                token[0] = '-DOCSTART-'
if __name__ == '__main__':
    # Truecase each dataset split in turn; the files of a split share the
    # path prefix conll/<split>/conll_<split> (e.g. 'conll/train/conll_train').
    for split_name in ('train', 'dev', 'test'):
        split_prefix = 'conll/' + split_name + '/conll_' + split_name
        preprocessing_input(split_prefix)