-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
102 lines (92 loc) · 3.23 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
const path = require("path");
const fs = require("fs");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const Segment = require("segment");
// Matches one character in U+4E00–U+9FA5, the range this tool treats as Chinese.
// Global so that String#match returns every occurrence on a line.
const chineseRegex = /[\u4e00-\u9fa5]/g;
// Matches a single-quoted literal whose content consists solely of characters
// from that range. Global so that String#match returns every literal on a line.
const matchCnStringLiteral = /'[\u4e00-\u9fa5]+'/g;
// Matches a `//` comment up to the end of the line, or a `/* ... */` comment.
// Deliberately NOT global: this regex is used as a yes/no predicate, and a
// global regex remembers `lastIndex` between `.test()` calls, which makes the
// check on one line depend on where the previous line happened to match.
const commentRegex = /\/\/.*|\/\*[\s\S]*?\*\//;
// Matches `console.` and everything after it on the line. Non-global for the
// same reason as commentRegex.
const consoleRegex = /console\..*/;
// Shared tokenizer instance used for every doSegment() call in this file.
// NOTE(review): useDefault() presumably loads the library's bundled
// dictionaries and modules — confirm against the `segment` package docs.
const segment = new Segment();
segment.useDefault();
/**
 * Builds the CSV records for one line of a source file.
 *
 * A line that contains a comment or a `console.` call is ignored entirely.
 * Otherwise the line yields one record for all of its Chinese characters
 * joined together, followed by one record per single-quoted Chinese literal.
 *
 * @param {String} line Text of the line (without the line terminator).
 * @param {Number} lineNumber 1-based line number within the file.
 * @param {String} filePath Path of the file the line belongs to.
 * @returns {Object[]} Records shaped `{ char, line, column, file }`.
 */
function collectCnRecords(line, lineNumber, filePath) {
  // String#search ignores the `g` flag and `lastIndex`, so the shared regexes
  // cannot carry state from one line to the next the way `.test()` can.
  // Known limitation: each line is checked on its own, so a block comment that
  // spans several lines is not recognised as a comment.
  if (line.search(commentRegex) !== -1 || line.search(consoleRegex) !== -1) {
    return [];
  }

  const toWords = (text) =>
    segment.doSegment(text, { simple: true, stripPunctuation: true });
  const records = [];

  const chineseMatches = line.match(chineseRegex);
  if (chineseMatches) {
    records.push({
      char: toWords(chineseMatches.join("")),
      line: lineNumber,
      // Column of the first Chinese character. Looking up the segmented word
      // list with indexOf() fails as soon as there is more than one word,
      // because the list is coerced to a comma-joined string.
      column: line.search(chineseRegex) + 1,
      file: filePath,
    });
  }

  const stringLiteralMatches = line.match(matchCnStringLiteral);
  if (stringLiteralMatches) {
    // Matches are returned left to right and never overlap, so resuming the
    // search after the previous literal locates each one exactly.
    let searchFrom = 0;
    for (const literal of stringLiteralMatches) {
      const quoteIndex = line.indexOf(literal, searchFrom);
      searchFrom = quoteIndex + literal.length;
      records.push({
        char: toWords(literal.slice(1, -1)),
        line: lineNumber,
        // +1 to step over the opening quote, +1 for a 1-based column.
        column: quoteIndex + 2,
        file: filePath,
      });
    }
  }

  return records;
}

/**
 * Recursively scans a directory and writes a CSV record for every piece of
 * Chinese text found in its files.
 *
 * Every regular file is read as UTF-8 and processed line by line. The records
 * of one file are written with a single `writeRecords` call, in line order.
 *
 * @param {String} directory Directory to traverse.
 * @param {Object} csvWriter Writer exposing `writeRecords(records)`, as
 *   created by csv-writer.
 * @param {String[]} [avoidFiles=[]] Base names to skip. A matching directory
 *   is not descended into and a matching file is not read.
 * @returns {Promise<void>} Resolves once the whole tree has been processed.
 */
async function calculateCnNode(directory, csvWriter, avoidFiles = []) {
  if (avoidFiles.includes(path.basename(directory))) return;

  const entries = await fs.promises.readdir(directory);
  for (const entry of entries) {
    // Skip excluded names up front so that files are honoured as well as
    // directories, and excluded entries are never stat'ed or read.
    if (avoidFiles.includes(entry)) continue;

    const entryPath = path.join(directory, entry);
    const stats = await fs.promises.stat(entryPath);
    if (stats.isDirectory()) {
      await calculateCnNode(entryPath, csvWriter, avoidFiles);
    } else if (stats.isFile()) {
      const fileContent = await fs.promises.readFile(entryPath, "utf-8");
      const lines = fileContent.split(/\r?\n/);
      const records = [];
      for (let lineIndex = 0; lineIndex < lines.length; lineIndex++) {
        records.push(
          ...collectCnRecords(lines[lineIndex], lineIndex + 1, entryPath)
        );
      }
      if (records.length > 0) {
        await csvWriter.writeRecords(records);
      }
    }
    console.log("当前已完成一个分词操作~");
  }
}
// CLI arguments: <input directory> <output CSV path> [base names to skip...]
const input = process.argv[2];
const output = process.argv[3];
const avoidArray = process.argv.slice(4);

/**
 * Script entry point: validates the CLI arguments, creates the CSV writer and
 * scans the input directory.
 *
 * @returns {Promise<void>} Resolves when the scan has finished.
 * @throws {Error} When the input directory or output path is missing.
 */
async function main() {
  if (!input || !output) {
    throw new Error(
      "Usage: node index.js <inputDir> <outputCsv> [namesToSkip...]"
    );
  }

  const csvWriter = createCsvWriter({
    path: output,
    header: [
      { id: "char", title: "Character" },
      { id: "line", title: "Line" },
      { id: "column", title: "Column" },
      { id: "file", title: "File" },
    ],
  });

  // Initial empty write, kept from the original flow.
  // NOTE(review): presumably this makes the header row exist even when the
  // scan finds nothing — confirm against the csv-writer docs.
  await csvWriter.writeRecords([]);
  await calculateCnNode(input, csvWriter, avoidArray);
  console.log("CSV file written successfully");
}

// Report any failure and exit non-zero instead of leaving the rejection
// unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});