In [None]:
# Imports
library(rvest)
library(dplyr)
library(writexl)
library(readxl)

In [None]:
# Load the spreadsheet listing the routes (column Link) to crawl;
# the file is looked up in the current working directory
seqm_user <- read_xlsx(paste0(getwd(), '/seqm_user_updated.xlsx'))

# Build the result table for one downloaded page.
#
# Args:
#   site_html: parsed HTML document of the page (parse_url() passes the
#     value returned by read_html()).
#   url: "/"-separated path of the page (parse_url() passes its route);
#     the 6th piece of the split is stored in the Id_User column.
#
# Returns the first <table> of the page without its first column, extended
# with the character columns AnimalLink and CriadorLink and with Id_User.
generate_table <- function(site_html, url) {
    tables <- html_nodes(site_html, 'table')

    # Only the first table of the page is used, and its first column is dropped
    seqm <- html_table(tables)[[1]]
    seqm <- seqm[2:ncol(seqm)]

    # Every href found inside the tables, in document order
    hrefs <- html_attr(html_nodes(tables, 'a'), 'href')

    # Links containing 'animal' go to one column, all remaining links to the other.
    # Each group must have exactly one entry per table row, otherwise the
    # column assignment below raises an error.
    is_animal <- grepl('animal', hrefs)
    seqm$AnimalLink <- as.character(hrefs[is_animal])
    seqm$CriadorLink <- as.character(hrefs[!is_animal])

    # Store the user id taken from the path
    seqm$Id_User <- strsplit(url, '/')[[1]][6]

    seqm
}

# Download one result page of a route and turn it into a table.
#
# Args:
#   route: path appended to 'https://seqm.com.br' (the crawler passes values
#     of seqm_user$Link).
#   page_number: value sent in the 'pagina' query parameter.
#   user: label that is only used in the name of the backup file.
#
# Returns a list with:
#   seqm: the table built by generate_table() for this page.
#   pages_to_visit: text of the pagination links found on the page; the
#     crawler uses its length as the number of pages of the route.
#
# Side effect: the table is also written to
# <working dir>/Log_user_detail/seqm_user_detail_bkp<user>_<page_number>.xlsx
# (the folder is created when it does not exist yet).
#
# Errors from the download, the parsing or the file write are not handled
# here; they propagate to the caller.
parse_url <- function(route,page_number,user) {

    # Create url to search
    url <- paste0('https://seqm.com.br', route, '?pagina=', page_number)

    # Get html from site
    site <- read_html(url)

    # Pagination links.
    # NOTE(review): this is an absolute XPath, so a layout change on the site
    # yields zero matches (and therefore zero extra pages) instead of an error.
    pages_to_visit <- site %>%
        html_nodes(xpath = '/html/body/div[2]/div[2]/main/section/div/ul/li/a') %>%
        html_text()

    # Create table
    seqm <- generate_table(site, route)

    # Make sure the backup folder exists: without it write_xlsx() fails for
    # every page and the crawler would silently discard all results.
    backup_dir <- file.path(getwd(), 'Log_user_detail')
    dir.create(backup_dir, showWarnings = FALSE, recursive = TRUE)

    # Save bkp file
    backup_file <- paste0('seqm_user_detail_bkp', user, '_', page_number, '.xlsx')
    write_xlsx(seqm, path = file.path(backup_dir, backup_file))

    list(seqm = seqm, pages_to_visit = pages_to_visit)
}

In [None]:
# Empty data frame to store the final result
seqm_final <- data.frame()

# Maximum number of attempts per page before the page is skipped
max_tries <- 4

# Seconds to wait before a failed page is requested again, so that a failing
# server is not hit in a tight loop
retry_wait_secs <- 1

# Number of routes to crawl, counted from the top of seqm_user. The value is
# capped at the number of available routes so that short inputs do not
# produce NA routes. Set max_routes to Inf to crawl every route.
max_routes <- 2
n_routes <- min(max_routes, length(seqm_user$Link))

# Fetch one page with parse_url() and prepend its table to `accumulated`.
#
# The request is repeated up to `max_tries` times. Every failure (download,
# parsing, backup write or rbind) is reported with message().
#
# Returns a list with `combined` (page table on top of `accumulated`) and
# `pages_to_visit` (as returned by parse_url()), or NULL when all attempts
# failed.
crawl_page <- function(route, page_number, user, accumulated,
                       max_tries = 4, retry_wait_secs = 1) {
    for (attempt in seq_len(max_tries)) {
        outcome <- tryCatch({
            parsed <- parse_url(route, page_number, user)
            list(
                combined = rbind(parsed$seqm, accumulated),
                pages_to_visit = parsed$pages_to_visit
            )
        }, error = function(e) {
            message(sprintf(
                'Attempt %d/%d failed for route %s, page %s: %s',
                attempt, max_tries, route, page_number, conditionMessage(e)
            ))
            NULL
        })

        if (!is.null(outcome)) {
            return(outcome)
        }

        # No pause after the last attempt: nothing will be retried
        if (attempt < max_tries) {
            Sys.sleep(retry_wait_secs)
        }
    }

    NULL
}

# For each route
for (i in seq_len(n_routes)) {
    route <- seqm_user$Link[i]

    # First page: it also tells how many pages the route has
    first_page <- crawl_page(route, 1, i, seqm_final, max_tries, retry_wait_secs)

    # Without the first page the number of pages is unknown: skip the route
    if (is.null(first_page)) {
        message(sprintf('Skipping route %s: first page could not be read', route))
        next
    }
    seqm_final <- first_page$combined

    # Remaining pages (2 .. number of pagination links); empty when the
    # route has fewer than two pages
    n_pages <- length(first_page$pages_to_visit)
    for (j in seq_len(n_pages)[-1]) {
        sub_page <- crawl_page(route, j, i, seqm_final, max_tries, retry_wait_secs)

        # A page that keeps failing is skipped, the rest of the route goes on
        if (is.null(sub_page)) {
            message(sprintf('Skipping page %s of route %s', j, route))
            next
        }
        seqm_final <- sub_page$combined
    }

    # Drop duplicates
    seqm_final <- unique(seqm_final)
}

In [None]:
# Derive the Id column: the 4th "/"-separated piece of each AnimalLink
# (NA when a link has fewer than four pieces)
seqm_final$Id <- unlist(lapply(seqm_final$AnimalLink, function(link) {
    strsplit(link, '/')[[1]][4]
}))

# Write the complete result to the working directory
write_xlsx(seqm_final, path = paste0(getwd(), '/seqm_user_detail.xlsx'))

In [None]:
# Merge the freshly crawled rows into the accumulated export.
# Rows of the previous export whose Id appears in the new crawl are replaced
# by the new rows; all other previous rows are kept.
detail_path <- paste0(getwd(), '/seqm_user_detail.xlsx')
updated_path <- paste0(getwd(), '/seqm_user_detail_updated.xlsx')

# Read the new crawl
seqm <- read_xlsx(detail_path)

if (file.exists(updated_path)) {
    # Keep only the previous rows that were not crawled again
    seqm_updated <- read_xlsx(updated_path)
    seqm_updated <- seqm_updated %>% filter(!Id %in% seqm$Id)

    # Concat data frames (previous rows first, new rows last)
    seqm_updated <- rbind(seqm_updated, seqm)
} else {
    # First run: there is no previous export yet, so the new crawl is the
    # whole result
    seqm_updated <- seqm
}

# Save file
write_xlsx(seqm_updated, path = updated_path)