-
Notifications
You must be signed in to change notification settings - Fork 7
/
Make_Law_Label.py
99 lines (84 loc) · 2.55 KB
/
Make_Law_Label.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#coding=utf8
import os,sys
import json
import jieba
import torch
from utils.parse_data import *
# Switch read by makelaw(): when true, tokens are passed through
# stopword_remover() before being converted.
stopWords = True

# Load the whitespace-separated stop-word list once at import time; the
# resulting set is what stopword_remover() checks membership against.
with open('./data/stop_words.txt', 'r') as stops:
    s = stops.read()
stop_words_from_file = set(s.split())
total_stop_words_set = stop_words_from_file
def stopword_remover(line):
    """Return the tokens of ``line`` that survive stop-word filtering.

    A token is discarded when any of the following holds (checked in this
    order):

    * it is a member of ``total_stop_words_set``;
    * ``is_number`` accepts it;
    * it is a non-ASCII token that is at most one character long.

    :param line: iterable of string tokens
    :return: list of the tokens that were kept, in their original order
    """
    kept = []
    for token in line:
        if token in total_stop_words_set:
            continue
        if is_number(token):
            continue
        if not is_ascii(token) and len(token) <= 1:
            continue
        kept.append(token)
    return kept
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty string is considered ASCII because it has no offending
    character.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True
def is_number(s):
    """Return True when ``s`` can be read as a number.

    Two checks are tried in turn: ``float(s)`` must succeed, or failing
    that, ``unicodedata.numeric(s)`` must succeed (which only accepts a
    single character that carries a numeric value). If neither succeeds
    the result is False.
    """
    try:
        float(s)
    except ValueError:
        pass
    else:
        return True

    import unicodedata
    try:
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True
def makelaw():
    """
    Read the law list and the law texts and turn them into padded tensors.

    Inputs (all paths are relative to the working directory):

    * ``./data/newlaw.txt``   - one law id per line
    * ``./data/my_dict.txt``  - JSON object mapping a law id (as a string)
      to its 1-based parent class number
    * ``./data/rulenew.txt``  - one ``id<TAB>text`` record per line
    * ``./data/word_dict_10w.pkl`` - vocabulary handed to ``data_helper.Vocab``

    Each matching law text is segmented with jieba, optionally filtered with
    ``stopword_remover`` (when the module flag ``stopWords`` is true), cut to
    its first 20 tokens and converted with ``Vocab.transform_raw``.

    :return:law_text (LongTensor; every row right-padded with 0 to the
        longest converted text)
    :return:law_length (LongTensor with the unpadded length of each row)
    :return:law_order (list of int law ids, in the order they were matched)
    :return:parent2law (list of 10 lists; entry ``p - 1`` holds the sorted
        int law ids whose parent class is ``p``)
    """
    filelaw = "./data/newlaw.txt"
    filetext = "./data/rulenew.txt"
    wordHelper = data_helper.Vocab("./data/word_dict_10w.pkl")
    with open('./data/my_dict.txt', 'r') as f:
        my_dict = json.loads(f.read().rstrip())

    # One bucket per parent class; the class numbers in my_dict are 1-based.
    parent2law = [[] for _ in range(10)]
    law = []
    # Context manager so the handle is closed even if a lookup below raises.
    with open(filelaw, 'r') as input_lines:
        for line in input_lines:
            line = line.rstrip()
            law.append(line)
            parent2law[my_dict[str(line)] - 1].append(int(line))
    # NOTE(review): `law` holds strings, so this sort is lexicographic. The
    # sequential matching below only finds every law if rulenew.txt lists
    # its ids in that same order - confirm against the data files.
    law.sort()
    for part in parent2law:
        part.sort()

    i = 0  # position in `law` of the next id we are waiting for
    law_text = []
    law_length = []
    law_order = []
    with open(filetext, 'r') as text:
        for line in text:
            # Stop once every wanted law has been matched; this also
            # protects against an empty law list (law[0] would not exist).
            if i == len(law):
                break
            line = line.rstrip()
            index, raw_text = line.split('\t')
            if index != law[i]:
                continue
            words = list(jieba.cut(raw_text))
            seg_fact = stopword_remover(words) if stopWords else words
            # Only the first 20 tokens of each law text are converted.
            trans_text = wordHelper.transform_raw(seg_fact[0:20])
            law_text.append(trans_text)
            law_length.append(len(trans_text))
            law_order.append(int(index))
            i += 1

    # default=0 keeps an empty match set from raising ValueError.
    maxlen = max(law_length, default=0)
    law_text = [item + [0] * (maxlen - len(item)) for item in law_text]
    return torch.LongTensor(law_text), torch.LongTensor(law_length), law_order, parent2law
# Script entry point: build the law tensors when the file is executed
# directly (the returned values are discarded here).
if __name__ == "__main__":
    makelaw()