-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocess.py
153 lines (132 loc) · 5.34 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import pandas as pd
import re
import numpy as np
import jieba
def transform_data_style(path, output_path='../data/wechat_original_test2.csv',
                         encoding='utf-8'):
    """Convert a line-oriented scoring report into a table and save it as CSV.

    The report is read as consecutive records of eight lines each. Within a
    record only the first four lines are used:

    * line 0: the second whitespace-separated token is the test id;
    * line 1: the last four tokens are integers; ``words_count`` is their
      sum and ``errors_count`` is that sum minus the first of the four
      (presumably correct/substitution/deletion/insertion counts -- TODO
      confirm against the tool that produces the report);
    * line 2: REF text, every token after the first joined without spaces;
    * line 3: HYP text, built the same way.

    Args:
        path: Path of the report file to read.
        output_path: Where the resulting table is written as CSV (no index
            column). Defaults to the location the original script used.
        encoding: Text encoding of the report file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``REF``, ``HYP``,
        ``words_count``, ``errors_count`` and ``WER``
        (``errors_count / words_count``).
    """
    with open(path, encoding=encoding) as file:
        lines = file.read().split('\n')
    # A file ending in a newline yields one trailing empty string; drop only
    # that, so a file without a final newline does not lose a real line.
    if lines and lines[-1] == '':
        lines.pop()

    test_id = []
    ref = []
    hyp = []
    errors_count = []
    words_count = []
    for i, line in enumerate(lines):
        field = i % 8  # position of this line inside its 8-line record
        if field == 0:
            test_id.append(line.split()[1])
        elif field == 1:
            counts = [int(token) for token in line.split()[-4:]]
            total = sum(counts)
            words_count.append(total)
            errors_count.append(total - counts[0])
        elif field == 2:
            ref.append(''.join(line.split()[1:]))
        elif field == 3:
            hyp.append(''.join(line.split()[1:]))

    original_data = pd.DataFrame(
        {
            'id': test_id,
            'REF': ref,
            'HYP': hyp,
            'words_count': words_count,
            'errors_count': errors_count,
        },
        columns=['id', 'REF', 'HYP', 'words_count', 'errors_count'],
    )
    original_data['WER'] = (
        original_data['errors_count'] / original_data['words_count']
    )
    print(original_data.head())
    original_data.to_csv(output_path, index=False)
    return original_data
# def load_testdata():
# data = pd.read_csv('../data/wechat_original_test2.csv')
# data_id = pd.concat([data.id,data.id],axis=0)
# data = pd.concat([data.REF,data.HYP],axis=0)
# return data,data_id
def load_testdata(path='../data/top_stories.xlsx'):
    """Load the test comments and their ``uin`` values from an Excel file.

    Args:
        path: Excel workbook to read. Defaults to the file the original
            script used. The sheet must contain ``comment`` and ``uin``
            columns.

    Returns:
        A tuple ``(comment, uin)`` of the two ``pandas.Series`` taken from
        the workbook.
    """
    # `index=None` was dropped: it is not a `read_excel` parameter and makes
    # current pandas raise TypeError.
    data = pd.read_excel(path)
    return data.comment, data.uin
# def load_data():
# neg=pd.read_excel('../data/neg.xls',header=None,index=None)
# pos=pd.read_excel('../data/pos.xls',header=None,index=None)
# # print(neg.head(),pos.head())
# data = pd.concat([neg[0],pos[0]])
# # print(data.head())
# print(data.apply(len).mean())
# y = np.concatenate((np.ones(len(pos),dtype=int), np.zeros(len(neg),dtype=int)))
# # print(y)
# # add extra training data
# train = pd.read_excel('../data/wechat_team_train.xlsx')
# train_data = train[:399][train.Label!= 2].comment
# train_label = train[:399][train.Label!=2].Label.values
# data = pd.concat([data,train_data])
# y = np.concatenate((y,train_label))
# print('train data: ',data.shape,len(y))
# return data,y
def load_data():
    """Assemble the training texts and their binary labels.

    Noise comments (from ``get_noise``) are labelled 0 and clean comments
    (from ``get_clean``) are labelled 1. The first 1500 rows of
    ``../data/wechat_team_train.xlsx`` are appended as extra training data;
    rows whose ``label`` is missing are treated as label 1.

    Returns:
        A tuple ``(data, y)`` where ``data`` is a ``pandas.Series`` of
        comment texts and ``y`` is a NumPy array of labels of equal length.
    """
    noise = get_noise()  # label 0
    clean = get_clean()  # label 1
    data = pd.concat([noise, clean])
    y = np.concatenate((
        np.zeros(len(noise), dtype=int),
        np.ones(len(clean), dtype=int),
    ))

    # Extra training data. Work on an explicit copy of the slice: assigning
    # to `train[:1500].label` only modifies a temporary object, so the
    # filled labels would never reach `y`.
    train = pd.read_excel('../data/wechat_team_train.xlsx')
    extra = train.iloc[:1500].copy()
    extra['label'] = extra['label'].fillna(1)

    data = pd.concat([data, extra['comment']])
    y = np.concatenate((y, extra['label'].values))
    print('train data: ', data.shape, len(y))
    return data, y
def dataPrepos(text, stopkey):
    """Tokenize ``text``, drop stop words and keep selected parts of speech.

    Args:
        text: The string to segment with jieba's part-of-speech tagger.
        stopkey: Container of stop words; tokens found in it are discarded.

    Returns:
        The surviving words joined by single spaces.
    """
    # `import jieba` alone does not load the `posseg` submodule, so import
    # it explicitly before use.
    import jieba.posseg

    # Part-of-speech flags to keep.
    pos = {'n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd'}
    return ' '.join(
        token.word
        for token in jieba.posseg.cut(text)
        if token.word not in stopkey and token.flag in pos
    )
def preprocess_data(data):
    """Clean, tokenize and stop-word-filter a Series of Chinese texts.

    Steps, applied to every element:

    1. remove every character outside the range U+4E00..U+9FFF;
    2. remove the listed punctuation characters;
    3. segment with ``jieba.cut`` and join the tokens with spaces;
    4. collapse runs of whitespace into a single space;
    5. drop tokens listed in ``../data/chinese_stopwords.txt``.

    The result is also written to ``../data/preprocess_data.csv``.

    Args:
        data: ``pandas.Series`` of strings.

    Returns:
        The processed ``pandas.Series`` (space-separated tokens).
    """
    # Clean data: keep only characters in U+4E00..U+9FFF.
    data = data.apply(lambda x: re.sub(u"[^\u4E00-\u9FFF]", "", x))

    # Remove punctuation. The characters must be escaped and wrapped in a
    # character class; used verbatim as a pattern they do not form a valid
    # regular expression and `re` raises an error.
    punctuation = r"""!?。"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    punctuation_re = re.compile('[' + re.escape(punctuation) + ']')
    data = data.apply(lambda x: punctuation_re.sub("", x))

    # Tokenize.
    data = data.apply(lambda x: ' '.join(jieba.cut(x)))
    data = data.apply(lambda x: re.sub(r'\s{2,}', ' ', x))

    # Delete stop words; a set gives constant-time membership tests.
    with open('../data/chinese_stopwords.txt', 'r', encoding='utf-8') as file:
        stopwords = set(file.read().split('\n'))
    data = data.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords)
    )

    # Save data.
    data.to_csv('../data/preprocess_data.csv', index=False)
    return data
# noise data
def get_noise(path='../data/wechat_pay_mp.json',
              output_path='../data/noise_data.csv'):
    """Load the noise comments from a JSON dump and save them as CSV.

    The JSON file must contain a list of objects, each holding the comment
    text under ``item['_source']['comment']``.

    Args:
        path: JSON file to read. Defaults to the file the original script
            used.
        output_path: Where the comments are written as CSV (no index
            column).

    Returns:
        A ``pandas.Series`` named ``comment`` with one entry per JSON item.
    """
    # Use a context manager so the handle is closed, and decode as UTF-8.
    with open(path, encoding='utf-8') as file:
        items = json.load(file)
    comments = [item['_source']['comment'] for item in items]
    noise = pd.DataFrame({'comment': comments})
    noise.comment.to_csv(output_path, index=False)
    return noise.comment
# clean data
def get_clean():
    """Load the clean comments from the Excel sheet and save them as CSV.

    Reads ``../data/wechat_pay.xlsx``, rebuilds its ``comment`` column on a
    fresh default index, writes it to ``../data/clean_data.csv`` without an
    index column, and returns it.

    Returns:
        A ``pandas.Series`` named ``comment``.
    """
    sheet = pd.read_excel('../data/wechat_pay.xlsx')
    comment_values = sheet.comment.values
    frame = pd.DataFrame({'comment': comment_values})
    comments = frame.comment
    comments.to_csv('../data/clean_data.csv', index=0)
    return comments
# Script entry point: build the training set when run directly.
if '__main__' == __name__:
    load_data()