This repository has been archived by the owner on Jun 27, 2019. It is now read-only.
/
metascraper.js
99 lines (85 loc) · 3.09 KB
/
metascraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// get link metadata
// TODO: add more services and use the metascraper to fill some metadata on the article
const metascraper = require('metascraper').load([
require('metascraper-author')(),
require('metascraper-date')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-logo')(),
require('metascraper-clearbit-logo')(),
require('metascraper-logo-favicon')(),
require('metascraper-publisher')(),
require('metascraper-title')(),
require('metascraper-url')(),
require('metascraper-youtube')(),
]);
const got = require('got');
const _ = require('lodash');
const getMetadata = async (targetUrl, app) => {
const { body: html, url } = await got(targetUrl);
app.debug(`getMetadata - getting metadata for ${url}`);
// app.debug(html);
// app.debug(html);
const metadata = await metascraper({ html, url });
// app.debug(metadata);
app.debug(`getMetadata - got metadata for ${url}`);
return metadata;
};
module.exports = function () {
return function (hook) {
return new Promise(async (resolve, reject) => {
if (hook.data === undefined) {
return reject(new Error('No data found for metascraper.'));
}
// Check required fields
if (!hook.data || !hook.data.content) {
return resolve(hook);
}
if (!hook.data.meta) {
hook.data.meta = {};
}
let promises = [];
let embedds = {};
try {
// find links
const youtubeRegex = new RegExp(/(?:(?:https?:)?\/\/)?(?:www\.)?youtu(?:be\.com\/(?:watch\?(?:.*?&(?:amp;)?)*v=|v\/|embed\/)|\.be\/)([\w\-]+)(?:(?:&(?:amp;)?|\?)[\w\?=]*)*/, 'ig'); // eslint-disable-line
const youtubeLinks = youtubeRegex.exec(hook.data.content);
// html links
const linkRegex = new RegExp(/<a\s[^>]*href=\"([^\"]*)\"[^>]*>([^<]*)<\/a>/, 'ig') // eslint-disable-line
let match;
while (match = linkRegex.exec(hook.data.content)) {
const url = match[1];
hook.app.debug(url);
// skip if url already exists
if (!_.isEmpty(embedds[url])) {
continue;
}
// here you could scrape the url for metadata
// hook.app.debug(match);
promises.push(new Promise(async (resolve) => {
try {
const metadata = await getMetadata(url, hook.app);
embedds[url] = metadata;
return resolve(metadata);
} catch (err) {
hook.app.error('FAILED TO GRAB THE LINK');
return resolve();
}
}));
}
if (youtubeLinks.length >= 2) {
hook.data.teaserImg = `https://img.youtube.com/vi/${youtubeLinks[1]}/hqdefault.jpg`;
hook.data.meta = Object.assign(hook.data.meta || {}, { hasVideo: true });
}
} catch (err) {} // eslint-disable-line
return Promise.all(promises)
.then(() => {
hook.app.debug('embedds:');
hook.app.debug(embedds);
hook.data.meta.embedds = embedds;
hook.app.debug('FINISHED!');
resolve(hook);
});
});
};
};