forked from gfrmin/hong_kong_company_records
/
scraper.R
89 lines (72 loc) · 3.39 KB
/
scraper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
library(data.table)
library(rvest)
library(lubridate)
library(stringr)
# Remove every newline, tab and carriage-return character from each element
# of `x`. Despite the name, no slash characters are removed: the pattern
# targets the control characters written as "\n", "\t" and "\r".
#
# Args:
#   x: character vector to clean.
# Returns: the result of str_replace_all() with those characters deleted.
deleteslashes <- function(x) {
  controlchars <- "[\n\t\r]"
  str_replace_all(x, controlchars, "")
}
# Split a combined company-name string into an English and a Chinese part.
#
# The English part is everything up to and including the chosen marker; the
# markers are tried in this priority order: "-THE-", "-The-", "Limited",
# "LTD", "LIMITED". The Chinese part is what remains of `names` once the
# English text has been removed.
#
# If no English part can be extracted (no marker present, or nothing precedes
# the marker), the English part is NA and the Chinese part keeps the original
# text. Previously the Chinese part also became NA in that case, because an
# NA pattern was handed to str_replace(), so the name was lost altogether.
#
# Args:
#   names: the combined name; the callers in this file pass a single string.
# Returns: an unnamed list of two elements, English part first and Chinese
#   part second, both passed through deleteslashes().
extractnames <- function(names) {
  markers <- c("-THE-", "-The-", "Limited", "LTD", "LIMITED")
  namesenglish <- rep(NA_character_, length(names))
  for (marker in markers) {
    # The markers contain no regex metacharacters, so a fixed match is
    # equivalent to the regex match used for the extraction below.
    if (length(grep(marker, names, fixed = TRUE)) == 1) {
      namesenglish <- str_extract(names, paste0(".+", marker))
      break
    }
  }
  # Only strip the English text where some was actually found.
  nameschinese <- names
  found <- !is.na(namesenglish)
  if (any(found)) {
    nameschinese[found] <- str_replace(
      names[found], fixed(namesenglish[found]), ""
    )
  }
  return(list(deleteslashes(namesenglish), deleteslashes(nameschinese)))
}
# Fetch the record page for one company number and append what it contains
# to `crdata`.
#
# Args:
#   crno:       company number; it is zero-padded to 7 digits to build the
#               query URL and to address the new row by key.
#   crdata:     data.table of the records collected so far.
#   retry_wait: seconds to pause before the single retry that happens when
#               the first response contains an <img> element (default 50).
#               NOTE(review): the <img> presumably marks a captcha or
#               throttling page -- confirm against the live site.
# Returns: `crdata` with one extra row for this company (keyed on `crno`),
#   or `crdata` unchanged when the site reports no matching record or the
#   page contains no result cells.
# Side effects: prints progress, performs HTTP requests and, before a retry,
#   checkpoints `crdata` to "scrapelist.Rdata" under the name `scrapelist`.
scrape <- function(crno, crdata, retry_wait = 50) {
  crnostr <- sprintf("%07d", crno)
  baseurl <- "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
  url <- paste0(baseurl, crnostr)
  print(c("trying", crnostr))
  # read_html() replaces rvest's html(), which has been deprecated and later
  # removed from the package.
  page <- read_html(url, encoding = "utf-8")
  if (length(html_nodes(page, "img")) > 0) {
    print(c("sleeping, then trying again", crnostr))
    # Checkpoint the table that was passed in instead of relying on a global
    # variable; the object name stays `scrapelist` so load() still works.
    scrapelist <- crdata
    save(scrapelist, file = "scrapelist.Rdata")
    Sys.sleep(retry_wait)
    page <- read_html(url, encoding = "utf-8")
  }
  pagebody <- html_node(page, "body")
  if (!is.null(pagebody)) {
    # The Chinese text is the site's "no matching record" message.
    if (any(grepl("沒有紀錄與輸入的查詢資料相符", html_text(pagebody)))) {
      print("NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!")
      return(crdata)
    }
  }
  tds <- html_text(html_nodes(page, "td:nth-child(2)"))
  if (length(tds) == 0 || is.na(tds[1])) {
    # Without result cells the new row would get an NA company number, which
    # later breaks the resume logic (max() of crno becomes NA), so skip it.
    print(c("no record data found, skipping", crnostr))
    return(crdata)
  }
  record <- list(
    crno = tds[1],
    companytype = deleteslashes(tds[2]),
    incorporationdate = dmy(tds[3]),
    status = deleteslashes(tds[4]),
    windingupmode = deleteslashes(tds[5]),
    dissolutiondate = dmy(tds[6]),
    registeravailable = deleteslashes(tds[7])
  )
  crdata <- rbindlist(list(crdata, record), fill = TRUE)
  setkey(crdata, crno)
  companyname <- str_replace(deleteslashes(html_text(html_node(page, "td tr:nth-child(2) td"))), "公司名稱:", "")
  companynameextract <- extractnames(companyname)
  crdata[crnostr, currentnameenglish := companynameextract[[1]]]
  crdata[crnostr, currentnamechinese := companynameextract[[2]]]
  crdata[crnostr, remarks := deleteslashes(html_text(html_node(page, ".sameasbody")))]
  crdata[crnostr, note := str_replace(html_text(html_node(page, "tr:nth-child(10) td")), fixed("重要事項:\r\n\t\t\t\t\t"), "")]
  # Each ".data" node is read as one name-history entry: the first 10
  # characters are parsed as a date, the rest is split into English/Chinese.
  namenodes <- html_nodes(page, ".data")
  # seq_along() iterates zero times when there is no history; 1:length()
  # would have iterated over 1 and 0.
  for (idx in seq_along(namenodes)) {
    historytext <- html_text(namenodes[idx])
    crdata[crnostr, paste0("namesdate", idx) := dmy(str_sub(historytext, 1, 10))]
    historytext <- str_sub(historytext, 11)
    historyextract <- extractnames(historytext)
    crdata[crnostr, paste0("namesenglish", idx) := historyextract[[1]]]
    crdata[crnostr, paste0("nameschinese", idx) := historyextract[[2]]]
  }
  return(crdata)
}
# ---- Driver: start or resume the crawl ----
# Reload earlier progress when a checkpoint file exists; otherwise seed the
# table with a placeholder row so the `crno`/`companytype` columns and the
# key are present.
if (file.exists("scrapelist.Rdata")) {
  load("scrapelist.Rdata")
} else {
  scrapelist <- data.table(crno = "0", companytype = "blah")
  setkey(scrapelist, crno)
}
# Highest company number to query.
maxcr <- 2120960
# Resume from the highest company number already stored. NA values are
# ignored so that a single unparsable `crno` cannot turn the start into NA.
mincr <- scrapelist[, max(as.numeric(crno), na.rm = TRUE)]
if (!is.finite(mincr)) {
  mincr <- 0
}
# `mincr:maxcr` counts DOWN when mincr > maxcr, so only loop when there is
# actually something left to fetch.
if (mincr <= maxcr) {
  for (crnonumber in (mincr:maxcr)) {
    # A missing key yields NA for companytype, i.e. "not scraped yet".
    if (all(is.na(scrapelist[sprintf("%07d", crnonumber), companytype]))) {
      scrapelist <- scrape(crnonumber, scrapelist)
      # Checkpoint only after a scrape attempt; when the number is skipped
      # the table is unchanged and re-saving it would be redundant.
      save(scrapelist, file = "scrapelist.Rdata")
    }
  }
}