In [None]:
# Imports
library(rvest)
library(dplyr)
library(writexl)
library(readxl)

# 1. Common functions

In [None]:
# Remove every file located directly inside `folder`.
#
# Args:
#   folder: folder name, relative to the current working directory.
#
# Returns:
#   The status code of unlink() (invisibly).
delete_files <- function(folder) {
    # Absolute location of the folder
    target_dir <- paste0(getwd(), '/', folder)

    # Full path of each entry found in the folder
    targets <- list.files(target_dir, full.names = TRUE)

    # Non-recursive removal: sub-directories are left in place
    unlink(targets)
}

# Add a `Link` column to `seqm` holding the href of every anchor found
# inside the tables of `site_html`.
#
# Args:
#   site_html: parsed page, as returned by read_html().
#   seqm:      table extracted from the same page.
#
# Returns:
#   `seqm` with the extra `Link` column.
generate_link_user <- function(site_html,seqm) {
    # Anchors nested in <table> elements
    # (assumes one anchor per table row - TODO confirm against the site)
    table_nodes <- html_nodes(site_html, 'table')
    anchor_nodes <- html_nodes(table_nodes, 'a')
    hrefs <- html_attr(anchor_nodes, 'href')

    # Store the hrefs next to the table data
    seqm['Link'] <- hrefs

    seqm
}

# Add the animal link, the breeder ("criador") link and the owning user id
# to a detail table.
#
# Args:
#   site_html: parsed page, as returned by read_html().
#   seqm:      table extracted from the same page.
#   route:     route of the page; its sixth '/'-separated piece is stored
#              as the user id.
#
# Returns:
#   `seqm` with the extra columns `AnimalLink`, `CriadorLink` and `Id_User`.
generate_link_user_detail <- function(site_html,seqm,route) {

    # Every href found in anchors nested inside the page's tables
    hrefs <- html_attr(html_nodes(html_nodes(site_html, 'table'), 'a'), 'href')

    # hrefs mentioning 'animal' are animal links, all others breeder links
    is_animal <- grepl('animal', hrefs)

    # Store both groups as character columns
    seqm$AnimalLink <- as.character(hrefs[is_animal])
    seqm$CriadorLink <- as.character(hrefs[!is_animal])

    # Same user id for every row of this page
    route_parts <- strsplit(route, '/')[[1]]
    seqm$Id_User <- route_parts[6]

    seqm
}

# Build the data table for one crawled page.
#
# Args:
#   site_html: parsed page, as returned by read_html().
#   route:     route of the page (only used for detail pages).
#   kind:      1 for the user listing, any other value for a user detail
#              page.
#
# Returns:
#   The first table of the page without its first column, plus the link
#   columns added by generate_link_user() or generate_link_user_detail().
generate_table <- function(site_html,route,kind=1) {

    # Parse every <table> of the page and keep the first one
    parsed_tables <- html_table(html_nodes(site_html, 'table'))
    first_table <- parsed_tables[[1]]

    # Drop the first column
    trimmed <- first_table[2:ncol(first_table)]

    # Detail pages need the animal/breeder links and the user id
    if (kind != 1) {
        return(generate_link_user_detail(site_html, trimmed, route))
    }

    generate_link_user(site_html, trimmed)
}

# Download one page, turn it into a table and write a backup copy of it.
#
# Args:
#   route:       route appended to 'https://seqm.com.br'.
#   page_number: value sent as the `pagina` query parameter.
#   kind:        1 for the user listing, any other value for a user detail
#                page.
#   user:        user id used in the backup file name; only read when
#                `kind` is not 1.
#
# Returns:
#   A list with `seqm` (the page table) and `pages_to_visit` (the text of
#   the pagination links found on the page).
parse_url <- function(route,page_number,kind=1,user) {

    # Address of the requested page
    page_url <- paste0('https://seqm.com.br', route, '?pagina=', page_number)

    # Download and parse the page
    site <- read_html(page_url)

    # Text of the pagination links
    pager_links <- html_nodes(site, xpath = '/html/body/div[2]/div[2]/main/section/div/ul/li/a')
    pages_to_visit <- html_text(pager_links)

    # Table with the page data
    seqm <- generate_table(site, route, kind)

    # Backup file: one per listing page, or one per user and page
    bkp_path <- if (kind == 1) {
        paste0(getwd(), '/Log_user/seqm_user_bkp', page_number, '.xlsx')
    } else {
        paste0(getwd(), '/Log_user_detail/seqm_user_detail_bkp_', user, '_', page_number, '.xlsx')
    }
    write_xlsx(seqm, path = bkp_path)

    list(seqm = seqm, pages_to_visit = pages_to_visit)
}

# Merge a freshly crawled backup workbook into the accumulated workbook.
#
# Rows of the accumulated file whose `Id` also appears in the backup are
# dropped and the backup rows are appended after the remaining ones, so the
# newest crawl wins for every `Id` present in both files.
#
# Args:
#   file_bkp:     backup workbook; the string is appended to getwd(), so it
#                 must start with '/' (e.g. '/seqm_user_bkp.xlsx').
#   file_updated: accumulated workbook, same convention.
#
# Returns:
#   The kept accumulated rows followed by the backup rows. When
#   `file_updated` does not exist yet (first run) the backup data is
#   returned unchanged instead of failing.
update_data <- function(file_bkp,file_updated) {

    bkp_path <- paste0(getwd(), file_bkp)
    updated_path <- paste0(getwd(), file_updated)

    # Fresh data
    seqm <- read_xlsx(bkp_path)

    # First run: nothing accumulated yet, the fresh data is the result
    if (!file.exists(updated_path)) {
        return(seqm)
    }

    # Previously accumulated data
    seqm_updated <- read_xlsx(updated_path)
    if (!'Id' %in% names(seqm_updated)) {
        stop("Column 'Id' not found in ", updated_path, call. = FALSE)
    }

    # Keep only the accumulated rows that were not crawled again
    is_stale <- seqm_updated[['Id']] %in% seqm[['Id']]
    seqm_updated <- seqm_updated[!is_stale, , drop = FALSE]

    # Accumulated rows first, fresh rows last
    rbind(seqm_updated, seqm)
}

# 2. SEQM User

In [None]:
# Crawl the owners ranking and merge the result into seqm_user.xlsx.
# Everything that depends on the crawl runs inside the success branch, so a
# failed start no longer falls through to code that needs `seqm_final`.

# Delete files
delete_files('Log_user')

# Try to start crawler
start_crawler <- try({
    # Start url to get data
    start_url <- 'https://seqm.com.br/ranking/pessoa/proprietarios'

    # Get html from url
    site <- read_html(start_url)

    # Text of the pagination links, used as page numbers below
    pages_to_visit <- site %>%
        html_nodes(xpath = '/html/body/div[2]/div[2]/main/section/div/ul/li/a') %>%
        html_text()
}, silent = TRUE)

if (inherits(start_crawler, "try-error")) {
    print('Some error happens')
} else {

    # Empty data frame
    seqm_final <- data.frame()

    # Count of consecutive errors for the current page
    error_count <- 0

    # Only the first two listed pages are crawled. head() keeps that limit
    # but, unlike indexing with c(1,2), never produces NA entries when the
    # site lists fewer than two pages.
    pages <- head(pages_to_visit, 2)

    # Run for each url page
    i <- 1
    while (i <= length(pages)) {
        error <- try({

            # Parse url
            seqm <- parse_url('/ranking/pessoa/proprietarios', pages[i])

            # Concat dataframes (newest page first)
            seqm_final <- rbind(seqm$seqm, seqm_final)

        }, silent = TRUE)

        # If some error happens try the same page again
        if (inherits(error, "try-error")) {
            error_count <- error_count + 1
        } else {
            i <- i + 1
            error_count <- 0
        }

        # Maximum of 4 tries, then give up on this page
        if (error_count == 4) {
            error_count <- 0
            i <- i + 1
        }

        # Wait 2 seconds before the next request, including retries
        Sys.sleep(2)
    }

    # Drop duplicates
    seqm_final <- unique(seqm_final)

    # Change the colname
    colnames(seqm_final)[colnames(seqm_final) == "Proprietário"] <- "Proprietario"

    # Generate an ID for each user: sixth '/'-separated piece of the link.
    # Skipped when nothing was collected, because there is no Link column.
    if (nrow(seqm_final) > 0) {
        seqm_final$Id <- vapply(
            strsplit(seqm_final$Link, '/'),
            function(parts) parts[6],
            character(1),
            USE.NAMES = FALSE
        )
    }

    # Save file
    write_xlsx(seqm_final, path = paste0(getwd(), '/seqm_user_bkp.xlsx'))

    # Update data
    seqm_final <- update_data('/seqm_user_bkp.xlsx', '/seqm_user.xlsx')

    # Save file
    write_xlsx(seqm_final, path = paste0(getwd(), '/seqm_user.xlsx'))
}

# 3. SEQM User Detail

In [None]:
# Load the user table; its Link and Id columns drive the detail crawl
seqm_user <- read_xlsx(paste0(getwd(), '/seqm_user.xlsx'))

In [None]:
# Crawl the detail pages of each user and merge the result with
# seqm_user_detail.xlsx (the merged table is left in `seqm_final`).

# Delete files
delete_files('Log_user_detail')

# Empty data frame to store the final result
seqm_final <- data.frame()

# Count of consecutive errors for the current user
error_count <- 0

# Only the first five users are crawled. head() keeps that limit but,
# unlike indexing with c(1,2,3,4,5), never produces NA routes when the
# user table has fewer than five rows.
user_links <- head(seqm_user$Link, 5)

# For each route
i <- 1
while (i <= length(user_links)) {
    # First page
    start_crawler <- try({

        # Parse an url
        result <- parse_url(user_links[i], 1, 2, seqm_user$Id[i])

        # Concat dataframes (newest page first)
        seqm_final <- rbind(result$seqm, seqm_final)

    }, silent = TRUE)

    # If some error happens try the same user again
    if (inherits(start_crawler, "try-error")) {
        error_count <- error_count + 1
    } else {
        # Count of consecutive errors for the current page
        sub_error_count <- 0

        # Remaining pages of this user.
        # NOTE(review): the loop index is sent as the page number, whereas
        # the user crawler sends the pagination text - confirm both agree.
        j <- 2
        while (j <= length(result$pages_to_visit)) {
            sub_crawler <- try({

                # Parse an url
                sub_result <- parse_url(user_links[i], j, 2, seqm_user$Id[i])

                # Concat dataframes
                seqm_final <- rbind(sub_result$seqm, seqm_final)

            }, silent = TRUE)

            # If some error happens try the same page again
            if (inherits(sub_crawler, "try-error")) {
                sub_error_count <- sub_error_count + 1
            } else {
                j <- j + 1
                sub_error_count <- 0
            }

            # Maximum of 4 tries, then give up on this page
            if (sub_error_count == 4) {
                j <- j + 1
                sub_error_count <- 0
            }
        }

        i <- i + 1
        error_count <- 0
    }

    # Maximum of 4 tries, then give up on this user
    if (error_count == 4) {
        i <- i + 1
        error_count <- 0
    }
}

# Drop duplicates once; the result matches de-duplicating on every iteration
seqm_final <- unique(seqm_final)

# Generate an ID for each row: fourth '/'-separated piece of the animal
# link. Skipped when nothing was collected, because there is no AnimalLink
# column.
if (nrow(seqm_final) > 0) {
    seqm_final$Id <- vapply(
        strsplit(seqm_final$AnimalLink, '/'),
        function(parts) parts[4],
        character(1),
        USE.NAMES = FALSE
    )
}

# Save file
write_xlsx(seqm_final, path = paste0(getwd(), '/seqm_user_detail_bkp.xlsx'))

# Update data
seqm_final <- update_data('/seqm_user_detail_bkp.xlsx', '/seqm_user_detail.xlsx')

In [None]:
# Write the merged detail table to the working directory
write_xlsx(x = seqm_final, path = paste0(getwd(), '/seqm_user_detail.xlsx'))