-
Notifications
You must be signed in to change notification settings - Fork 53
/
text.ts
165 lines (155 loc) · 5.05 KB
/
text.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import * as _ from 'lodash';
import * as sw from 'stopword';
import { Type1DMatrix } from '../types';
import { WordTokenizer } from '../utils/nlp';
import { validateMatrix1D } from '../utils/validation';
import { ENGLISH_STOP_WORDS } from './stop_words';
/**
 * The CountVectorizer provides a simple way to both tokenize a collection
 * of text documents and build a vocabulary of known words, but also
 * to encode new documents using that vocabulary.
 *
 * @example
 * import { CountVectorizer } from 'machinelearn/feature_extraction';
 *
 * const cv = new CountVectorizer();
 * const corpus = ['deep learning ian good fellow learning jason shin shin', 'yoshua bengio'];
 * const vocabCounts = cv.fit_transform(corpus);
 * console.log(vocabCounts); // [ [ 0, 1, 1, 1, 1, 1, 2, 2, 0 ], [ 1, 0, 0, 0, 0, 0, 0, 0, 1 ] ]
 * console.log(cv.vocabulary); // { bengio: 0, deep: 1, fellow: 2, good: 3, ian: 4, jason: 5, learning: 6, shin: 7, yoshua: 8 }
 * console.log(cv.getFeatureNames()); // [ 'bengio', 'deep', 'fellow', 'good', 'ian', 'jason', 'learning', 'shin', 'yoshua' ]
 *
 * const newVocabCounts = cv.transform(['ian good fellow jason duuog']);
 * console.log(newVocabCounts); // [ [ 0, 0, 1, 1, 1, 1, 0, 0, 0 ] ]
 */
export class CountVectorizer {
  /**
   * Flat mapping from each vocabulary token to its column index in the
   * count vectors. Empty until a corpus has been fitted.
   */
  public vocabulary: object = {};
  /**
   * Sorted, de-duplicated vocabulary tokens; the position of a token is its
   * column index. Undefined until a corpus has been fitted.
   * @ignore
   */
  private internalVocabulary: Type1DMatrix<string>;

  /**
   * Learn a vocabulary dictionary of all tokens in the raw documents.
   * @param {string[]} doc - An array of strings
   * @returns {CountVectorizer} This instance, to allow chaining
   */
  public fit(doc: Type1DMatrix<string> = null): this {
    validateMatrix1D(doc);
    // Only the vocabulary is needed here; the count matrix is not computed.
    this.learnVocabulary(doc);
    return this;
  }

  /**
   * Learn the vocabulary from the documents and return their
   * document-term count matrix in one step.
   * @param {string[]} doc - An array of strings
   * @returns {number[][]} One row per document, one column per vocabulary token
   */
  public fit_transform(doc: Type1DMatrix<string> = null): number[][] {
    validateMatrix1D(doc);
    this.learnVocabulary(doc);
    return this.countVocab(doc);
  }

  /**
   * Transform documents to document-term matrix.
   * Extract token counts out of raw text documents using the vocabulary
   * learned by fit or fit_transform. Tokens that are not part of the
   * vocabulary are ignored.
   * @param {string[]} doc - An array of strings
   * @returns {number[][]} One row per document, one column per vocabulary token
   * @throws {Error} If no vocabulary has been fitted yet
   */
  public transform(doc: Type1DMatrix<string> = null): number[][] {
    validateMatrix1D(doc);
    if (!this.internalVocabulary) {
      // Fail with a descriptive message instead of a TypeError raised while
      // iterating an undefined vocabulary.
      throw new Error('You must fit a document first before you can transform documents!');
    }
    return this.countVocab(doc);
  }

  /**
   * Array mapping from feature integer indices to feature name
   * @returns {Object} The sorted vocabulary tokens, indexed by column
   * @throws {Error} If no vocabulary has been fitted yet
   */
  public getFeatureNames(): object {
    if (!this.internalVocabulary) {
      throw new Error('You must fit a document first before you can retrieve the feature names!');
    }
    return this.internalVocabulary;
  }

  /**
   * @ignore
   * Builds the vocabulary for the given documents and stores both of its
   * representations on the instance.
   * @param doc - Documents to learn the vocabulary from
   */
  private learnVocabulary(doc: Type1DMatrix<string>): void {
    const { internalVocabulary, pubVocabulary } = this.buildVocabulary(doc);
    this.vocabulary = pubVocabulary;
    this.internalVocabulary = internalVocabulary;
  }

  /**
   * Build a tokenizer/vectorizer
   * @returns {(x: string) => string[]} Function that turns one text into its
   * tokens with stop word removal enabled
   */
  private buildAnalyzer(): (x: string) => string[] {
    return (x) => this.preprocess(x, { removeSW: true });
  }

  /**
   * Collects the tokens of every document, then sorts and de-duplicates them
   * to come up with token: index pairs.
   * @param doc - Documents to extract the vocabulary from
   * @returns The sorted unique tokens (internalVocabulary) and the
   * token -> index lookup object (pubVocabulary)
   */
  private buildVocabulary(
    doc: Type1DMatrix<string>,
  ): {
    internalVocabulary: string[];
    pubVocabulary: object;
  } {
    const analyze = this.buildAnalyzer();
    const tokenizedDocs: string[][] = _.map(doc, (text: string) => analyze(text));
    const allTokens: string[] = _.flatten(tokenizedDocs);
    const internalVocabulary: string[] = _.uniq(_.sortBy(allTokens, (token) => token));

    const pubVocabulary: { [token: string]: number } = {};
    internalVocabulary.forEach((token, index) => {
      // Define each token as a flat own property. `_.set` would interpret the
      // token as a property path (e.g. "a.b" becomes a nested object) and,
      // in recent lodash releases, skips keys such as "constructor".
      Object.defineProperty(pubVocabulary, token, {
        configurable: true,
        enumerable: true,
        value: index,
        writable: true,
      });
    });

    return {
      internalVocabulary,
      pubVocabulary,
    };
  }

  /**
   * @ignore
   * Counting number of vocab occurrences in the tokens of each sentence
   * ['yoshua', 'bengio', 'deep', 'learning'] = vocabulary
   * ['yoshua', 'bengio'] => tokens
   * results in
   * [1, 1, 0, 0]
   * @param doc - Documents to count vocabulary tokens in
   * @returns One count vector per document, ordered like the vocabulary
   */
  private countVocab(doc: Type1DMatrix<string>): number[][] {
    const analyze = this.buildAnalyzer();
    const docVocabCounts: number[][] = [];
    for (const sentence of doc) {
      // Tally every token once so each vocabulary entry is a single lookup
      // instead of a rescan of the whole token list.
      const tokenCounts = new Map<string, number>();
      for (const token of analyze(sentence)) {
        tokenCounts.set(token, (tokenCounts.get(token) || 0) + 1);
      }
      const sentenceCounts: number[] = [];
      for (const vocab of this.internalVocabulary) {
        sentenceCounts.push(tokenCounts.get(vocab) || 0);
      }
      docVocabCounts.push(sentenceCounts);
    }
    return docVocabCounts;
  }

  /**
   * @ignore
   * preprocess a line of text by applying
   * 1) splitting on single spaces and, when requested, removing English stop words
   * 2) tokenization of the re-joined text with WordTokenizer
   * @param text - Raw text of one document
   * @param { boolean } removeSW - Whether stop words are removed before tokenizing
   * @returns {string[]} The tokens produced by the tokenizer
   */
  private preprocess(text: string, { removeSW = false }): string[] {
    const tokenizer = new WordTokenizer();
    let tokens = text.split(' ');
    if (removeSW) {
      tokens = sw.removeStopwords(tokens, ENGLISH_STOP_WORDS);
    }
    return tokenizer.tokenize(tokens.join(' '));
  }
}