This repository has been archived by the owner on Nov 26, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 26
/
generate-index.js
179 lines (166 loc) · 5.99 KB
/
generate-index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
'use strict'
const lunr = require('lunr')
const cheerio = require('cheerio')
const { decode } = require('html-entities')
/**
 * Generate a Lunr index.
 *
 * Loads the HTML of every page, drops the pages that must not be indexed,
 * extracts the searchable content of the remaining ones and adds it to a
 * Lunr index. Every section heading that carries an id is additionally
 * indexed as its own entry whose ref is `<page url>#<heading id>`.
 *
 * @memberof generate-index
 *
 * @param {Object} playbook - The configuration object for Antora (not read by this function).
 * @param {Array<File>} pages - The publishable pages to map.
 * @param {Object} contentCatalog - The Antora content catalog; only queried when
 *   `DOCSEARCH_INDEX_VERSION` is `latest`.
 * @param {Object} [env] - Command line environment variables. Keys read here:
 *   `DOCSEARCH_INDEX_VERSION` (`latest` restricts indexing to the latest version of
 *   each component) and `DOCSEARCH_LANGS` (comma-separated Lunr language codes,
 *   defaults to `en`).
 * @returns {Object} An empty object when there are no pages, otherwise
 *   `{ index, store }` where `index` is the Lunr index and `store` maps each
 *   page URL to its extracted document.
 */
function generateIndex (playbook, pages, contentCatalog, env) {
  if (!pages.length) return {}

  // Tolerate a missing env object instead of failing on the first property read
  const settings = env || {}
  const indexOnlyLatest = settings.DOCSEARCH_INDEX_VERSION === 'latest'
  const languages = parseLanguages(settings.DOCSEARCH_LANGS)

  const documents = pages
    .map((page) => ({ page, $: cheerio.load(page.contents.toString()) }))
    .filter(({ page, $ }) => isIndexable(page, $, contentCatalog, indexOnlyLatest))
    .map(({ page, $ }) => toDocument(page, $))

  loadLunrLanguages(languages)

  // Construct the lunr index from the composed content
  const lunrIndex = lunr(function () {
    const builder = this
    if (languages.length > 1) {
      builder.use(lunr.multiLanguage(...languages))
    } else if (!languages.includes('en')) {
      builder.use(lunr[languages[0]])
    }
    builder.ref('url')
    builder.field('title', { boost: 10 })
    builder.field('name')
    builder.field('text')
    builder.field('component')
    builder.metadataWhitelist = ['position']
    for (const doc of documents) {
      builder.add(doc)
      // Each heading becomes its own entry so a hit can link straight to the section
      for (const title of doc.titles) {
        builder.add({
          title: title.text,
          url: `${doc.url}#${title.id}`
        })
      }
    }
  })

  // Map of Lunr ref (the page URL) to document
  const documentsStore = {}
  for (const doc of documents) {
    documentsStore[doc.url] = doc
  }

  return {
    index: lunrIndex,
    store: documentsStore
  }
}

// Turns the comma-separated DOCSEARCH_LANGS value into a list of language codes (defaults to English)
function parseLanguages (value) {
  const languages = String(value || '')
    .split(',')
    .map((language) => language.trim())
    .filter(Boolean)
  return languages.length ? languages : ['en']
}

// Registers the lunr-languages plugins required by the requested languages (no-op for English only)
function loadLunrLanguages (languages) {
  if (languages.length <= 1 && languages.includes('en')) return
  if (languages.length > 1 && typeof lunr.multiLanguage === 'undefined') {
    // required, otherwise lunr.multiLanguage will be undefined
    require('lunr-languages/lunr.multi')(lunr)
  }
  // required, to load additional languages
  require('lunr-languages/lunr.stemmer.support')(lunr)
  for (const language of languages) {
    if (language === 'ja' && typeof lunr.TinySegmenter === 'undefined') {
      require('lunr-languages/tinyseg')(lunr) // needed for Japanese support
    }
    if (language === 'th' && typeof lunr.wordcut === 'undefined') {
      lunr.wordcut = require('lunr-languages/wordcut') // needed for Thai support
    }
    if (language !== 'en' && typeof lunr[language] === 'undefined') {
      require(`lunr-languages/lunr.${language}`)(lunr)
    }
  }
}

// Tells whether a page may be indexed: not marked "noindex" and, when requested, part of the latest version
function isIndexable (page, $, contentCatalog, indexOnlyLatest) {
  const metaRobotNoIndex = $('meta[name=robots]').attr('content') === 'noindex'
  const attributes = page.asciidoc && page.asciidoc.attributes
  const pageNoIndex = Boolean(attributes) && attributes.noindex === ''
  if (metaRobotNoIndex || pageNoIndex) return false
  if (!indexOnlyLatest) return true
  const component = contentCatalog.getComponent(page.src.component)
  const thisVersion = contentCatalog.getComponentVersion(component, page.src.version)
  return thisVersion === component.latest
}

// Extracts the indexable content (title, section headings, body text, metadata) from a loaded page
function toDocument (page, $) {
  // Fetch just the article content, so we don't index the TOC and other on-page text
  const article = $('article.doc')

  // Headings are pulled out of the body text to improve search results
  const $h1 = $('h1', article)
  const documentTitle = $h1.first().text()
  $h1.remove()

  const titles = []
  const $headings = $('h2,h3,h4,h5,h6', article)
  $headings.each((_, heading) => {
    const $title = $(heading)
    const id = $title.attr('id')
    // A heading without an id cannot be linked to, so it is not indexed on its own
    if (id) titles.push({ text: $title.text(), id })
  })
  $headings.remove()

  // Pagination links hold the titles of other pages and would pollute the index
  $('nav.pagination', article).remove()

  // Decode entities, strip anything that looks like a tag, then normalize whitespace
  const text = decode(article.text())
    .replace(/<[^>]+>/g, '')
    .replace(/\s+/g, ' ')
    .trim()

  // Return the indexable content, organized by type
  return {
    text,
    title: documentTitle,
    component: page.src.component,
    version: page.src.version,
    name: page.src.stem,
    url: page.pub.url,
    titles
  }
}
// Wraps a generated index in the file descriptor Antora uses to publish it as a
// site asset: a script that hands the serialized index to window.antoraLunr.init
function createIndexFile (index) {
  const serializedIndex = JSON.stringify(index)
  const script = `window.antoraLunr.init(${serializedIndex})`
  const contents = Buffer.from(script)
  const stem = 'search-index'
  const path = 'search-index.js'
  const url = '/search-index.js'
  return {
    mediaType: 'text/javascript',
    contents,
    src: { stem },
    out: { path },
    pub: { url, rootPath: '' }
  }
}
// The module itself is the index generator, so `require(...)` returns generateIndex directly
module.exports = generateIndex
// createIndexFile is exposed as a property of that exported function
module.exports.createIndexFile = createIndexFile