In [None]:
# Imports
library(rvest)
library(dplyr)
library(writexl)
library(readxl)

In [None]:
#' Build the ranking table for one page of the SEQM site.
#'
#' Parses the first `<table>` of the document, drops its first column and
#' appends a `Link` column holding the `href` of every anchor found inside
#' that same table.
#'
#' @param site_html Parsed HTML document, as returned by `read_html()`.
#' @return The parsed table (without its first column) plus a `Link` column.
#'   Signals an error when the page has no table or when the number of links
#'   does not match the number of rows.
generate_table <- function(site_html) {

    tables <- html_nodes(site_html, 'table')
    if (length(tables) == 0) {
        stop('No <table> element found in the page', call. = FALSE)
    }

    # Rows and links are both taken from the first table only, so that they
    # cannot get out of step when the page contains more than one table
    first_table <- tables[[1]]
    seqm <- html_table(first_table)

    # Ignore the first column (a negative index also works for a
    # single-column table, where seq(2, ncol) would count backwards)
    seqm <- seqm[-1]

    # Link of each person
    links <- html_attr(html_nodes(first_table, 'a'), 'href')

    # Fail loudly instead of letting the assignment recycle or misalign links
    if (length(links) != nrow(seqm)) {
        stop(
            'Found ', length(links), ' links for ', nrow(seqm), ' table rows',
            call. = FALSE
        )
    }

    # Add links to seqm's table
    seqm[['Link']] <- links

    seqm
}

In [None]:
# Bootstrap the crawl: download the first ranking page and collect the text
# of the anchors matched by the XPath below into `pages_to_visit` (used later
# as the `pagina` value of each page URL).
# Errors are silenced; on failure `start_crawler` is a "try-error" object.
start_crawler <- try(
    {
        # Entry page of the owners ranking
        start_url <- 'https://seqm.com.br/ranking/pessoa/proprietarios'

        # Download and parse the page
        site <- read_html(start_url)

        # Text of every anchor under the list located by absolute XPath
        pages_to_visit <- html_text(
            html_nodes(
                site,
                xpath = '/html/body/div[2]/div[2]/main/section/div/ul/li/a'
            )
        )
    },
    silent = TRUE
)

In [None]:
# Crawl the ranking pages, merge them into a single table and save it.
# Uses `start_crawler` / `pages_to_visit` from the start-up step and
# generate_table() to parse each page.
if (inherits(start_crawler, "try-error")) {
    print('Some error happens')
    # Show the underlying failure instead of hiding it
    message(conditionMessage(attr(start_crawler, "condition")))
} else {

    # The original script visited only the first two pagination entries; that
    # cap is preserved here.
    # NOTE(review): raise `max_pages` (e.g. to Inf) to crawl every entry --
    # confirm whether the cap was intentional.
    max_pages <- 2

    # head() never pads with NA when fewer than `max_pages` entries exist,
    # unlike pages_to_visit[c(1, 2)]
    pages <- head(pages_to_visit, max_pages)

    # Maximum number of tries per page before it is skipped
    max_tries <- 4

    # Make sure the backup folder exists, otherwise every backup write fails
    backup_dir <- file.path(getwd(), 'Log_user')
    dir.create(backup_dir, showWarnings = FALSE, recursive = TRUE)

    # One slot per page; a slot stays NULL when the page is skipped
    page_tables <- vector("list", length(pages))

    # Run for each url page
    for (i in seq_along(pages)) {
        for (attempt in seq_len(max_tries)) {
            result <- try({
                # Generate url to search
                page_url <- paste0(
                    'https://seqm.com.br/ranking/pessoa/proprietarios?pagina=',
                    pages[i]
                )

                # Get html from url
                page_html <- read_html(page_url)

                # Create table
                page_table <- generate_table(page_html)

                # Save bkp file
                write_xlsx(
                    page_table,
                    path = file.path(
                        backup_dir,
                        paste0('seqm_user_bkp', i, '.xlsx')
                    )
                )

                page_table
            }, silent = TRUE)

            # Wait 2 seconds after every request, including failed ones, so
            # that retries do not hit the server back to back
            Sys.sleep(2)

            if (!inherits(result, "try-error")) {
                page_tables[[i]] <- result
                break
            }

            message(
                'Page ', pages[i], ' (try ', attempt, '/', max_tries,
                ') failed: ', conditionMessage(attr(result, "condition"))
            )
        }
    }

    # Keep only the pages that were actually scraped
    page_tables <- Filter(Negate(is.null), page_tables)

    if (length(page_tables) == 0) {
        # Do not overwrite an existing result file with an empty sheet
        message('No page could be scraped; seqm_user.xlsx was not written')
    } else {
        # Bind once instead of growing the frame inside the loop; rev() keeps
        # the original ordering, where each new page was put on top
        seqm_final <- do.call(rbind, rev(page_tables))

        # Generate an ID for each user: 6th "/"-separated piece of the link
        seqm_final$Id <- vapply(
            seqm_final$Link,
            function(link) strsplit(link, '/')[[1]][6],
            character(1),
            USE.NAMES = FALSE
        )

        # Drop duplicates
        seqm_final <- unique(seqm_final)

        # Change the colname
        colnames(seqm_final)[colnames(seqm_final) == "Proprietário"] <- "Proprietario"

        # Save file
        write_xlsx(seqm_final, path = file.path(getwd(), 'seqm_user.xlsx'))
    }
}

In [None]:
# Merge the freshly crawled users into the accumulated history file.
# Result: `seqm_updated` holds every row of the new crawl plus the historical
# rows whose Id does not appear in the new crawl.

# Read files
seqm <- read_xlsx(file.path(getwd(), 'seqm_user.xlsx'))
updated_path <- file.path(getwd(), 'seqm_user_updated.xlsx')

if (file.exists(updated_path)) {
    seqm_updated <- read_xlsx(updated_path)

    # Updating data: drop historical rows that the new crawl replaces
    seqm_updated <- seqm_updated %>% filter(!Id %in% seqm$Id)

    # Concat data frames
    seqm_updated <- rbind(seqm_updated, seqm)
} else {
    # First run: the history file is only ever produced by this script, so
    # there is nothing to merge yet
    seqm_updated <- seqm
}

In [None]:
# Write the merged table to seqm_user_updated.xlsx in the working directory
updated_file <- paste0(getwd(), '/seqm_user_updated.xlsx')
write_xlsx(seqm_updated, path = updated_file)