-
Notifications
You must be signed in to change notification settings - Fork 0
/
oldscrape.js
72 lines (59 loc) · 2.58 KB
/
oldscrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/**
 * Scrapes the first Discudemy listing page and resolves every course on it
 * to its final coupon link.
 *
 * Steps:
 *   1. Fetch the listing page and collect each `.card-header` element.
 *   2. Fetch the course pages in batches of `concurrencyLimit`, reading the
 *      intermediate link, H1 heading, image source, description and price.
 *   3. Fetch each intermediate link, one at a time, and read the `href` of
 *      `#couponLink`.
 *
 * Courses that lack a card `href`, an intermediate link, or a coupon link
 * are skipped rather than aborting the whole run.
 *
 * @returns {Promise<Array<{link: string, h1: string, image: (string|undefined), desc: string, price: string}>>}
 *   One entry per course whose coupon link was found, in listing order.
 * @throws Rethrows (after logging) any error raised by a request or by parsing.
 */
async function scrapeData() {
  try {
    const url = "https://www.discudemy.com/all/1"; // Update the URL here
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);
    const courseElements = $(".card-header").toArray();

    // Fetches one course page and extracts its fields; resolves to null when
    // the listing card carries no href to follow.
    const scrapeCourse = async (element) => {
      const page = $(element).attr("href");
      if (!page) {
        return null;
      }
      const courseResponse = await axios.get(page);
      const coursePage = cheerio.load(courseResponse.data);
      const courseLink = coursePage(
        'body > div[class^="ui container"] > div > section > div:nth-child(5) > div > a'
      ).attr("href");
      const image = coursePage(
        'body > div[class^="ui container"] > div > section > div.ui.center.aligned.attached.segment > amp-img'
      ).attr("src"); // Update the image selector
      const h1 = coursePage("#description-text > h1").text();
      // Drop the child elements so only the segment's own text remains.
      const desc = coursePage("div.ui.attached.segment")
        .clone()
        .children()
        .remove()
        .end()
        .text()
        .trim();
      const price = coursePage(
        "body > div.ui.container.item-f > div > section > div:nth-child(4) > p:nth-child(4) > span"
      ).text();
      return { courseLink, h1, image, desc, price };
    };

    // An async function starts its request as soon as it is called, so the
    // calls themselves must be batched: mapping every element up front would
    // fire all requests at once and make the limit meaningless.
    const concurrencyLimit = 5;
    const courseData = [];
    for (let i = 0; i < courseElements.length; i += concurrencyLimit) {
      const batch = courseElements.slice(i, i + concurrencyLimit);
      const batchResult = await Promise.all(
        batch.map((element) => scrapeCourse(element))
      );
      courseData.push(...batchResult);
    }

    // Process the course links to fetch the final data
    const links = [];
    for (const course of courseData) {
      // Nothing to follow: the card had no href or the page had no link.
      if (!course || !course.courseLink) {
        continue;
      }
      const { courseLink, h1, image, desc, price } = course;
      const courseLinkResponse = await axios.get(courseLink);
      const courseLinkPage = cheerio.load(courseLinkResponse.data);
      const link = courseLinkPage("#couponLink").attr("href");
      if (link) {
        links.push({ link, h1, image, desc, price });
      }
    }

    console.log("Scraped data:", links); // Log the scraped data
    console.log("Scraped data updated");
    return links;
  } catch (error) {
    console.error(error);
    throw error;
  }
}