# scrape web
zagor

# libraries
Try not to attach libraries globally; instead, call functions with the `package::function` syntax.

In [1]:
library(magrittr)

# define the base URL

In [2]:
# First listing page; later pages are addressed relative to this URL
base_url <- "https://projekt-spoznaj.si/videoposnetki-in-predstavitve/fair/"

# with rvest

## functions

### find the total number of pages
*_Note_*: page 1 has no page number in its URL (it is the base URL itself)

In [3]:
# Count the listing pages by following the "Next" pagination link.
#
# Page 1 is served at `base_url` itself; page n (n > 1) at
# `<base_url>page/<n>/`.
#
# Args:
#   base_url: character(1), URL of the first listing page (ending in "/").
#
# Returns:
#   numeric(1): the number of the last page that was read successfully.
#   Returns 1 (with a warning) if even the first page could not be read.
find_total_pages <- function(base_url) {
    current_page <- 1
    last_page <- 1

    repeat {
        # Page 1 has no number in its URL
        page_url <- if (current_page == 1) {
            base_url
        } else {
            paste0(base_url, "page/", current_page, "/")
        }

        # Read the webpage; report (rather than silently swallow) a failure
        webpage <- tryCatch(
            rvest::read_html(page_url),
            error = function(e) {
                warning(
                    "Could not read ", page_url, ": ", conditionMessage(e),
                    call. = FALSE
                )
                NULL
            }
        )
        if (is.null(webpage)) {
            break
        }

        # Only count a page once it has actually been read
        last_page <- current_page

        # A "Next" link means at least one more page follows
        next_links <- rvest::html_nodes(webpage, "a.next.page-numbers")
        next_href <- rvest::html_attr(next_links, "href")
        if (length(next_href) == 0) {
            break
        }

        current_page <- current_page + 1
    }

    last_page
}


### scrape titles and dates

In [4]:
# Scrape one listing page into a table of lecture dates and titles.
#
# Args:
#   url: URL of the listing page to read.
#
# Returns:
#   A data.table with columns `Date` (Date, parsed from a "(d. m. yyyy)"
#   fragment of each title; NA where no such fragment is found) and
#   `Lecture_Title` (the title with that fragment removed and trimmed).
scrape_page <- function(url) {
    page <- rvest::read_html(url)

    # Raw title text of every lecture link on the page
    title_nodes <- rvest::html_nodes(page, ".aiovg-title a.aiovg-link-title")
    raw_titles <- rvest::html_text(title_nodes)

    # Date: take the "(d. m. yyyy)" fragment and drop the parentheses
    date_text <- stringr::str_extract(raw_titles, "\\(\\d{1,2}\\. \\d{1,2}\\. \\d{4}\\)")
    date_text <- stringr::str_remove_all(date_text, "[()]")

    # Title: remove the date fragment together with any whitespace before it
    title_text <- stringr::str_remove(raw_titles, "\\s*\\(\\d{1,2}\\. \\d{1,2}\\. \\d{4}\\)")

    data.table::data.table(
        Date = as.Date(date_text, format = "%d. %m. %Y"),
        Lecture_Title = stringr::str_trim(title_text)
    )
}

## scrape

In [5]:
# Discover how many listing pages exist, build their URLs, scrape them all
total_pages <- find_total_pages(base_url)

# Pages 2..total_pages. `2:total_pages` would count DOWN to c(2, 1) when there
# is only one page; seq_len(...)[-1] is empty in that case, and sprintf()
# (unlike paste0()) returns a zero-length result for a zero-length argument.
later_pages <- seq_len(total_pages)[-1]
urls <- c(base_url, sprintf("%spage/%d/", base_url, later_pages))

lectures_dt <- data.table::rbindlist(lapply(urls, scrape_page))

## write results

In [6]:
# Write the scraped table to ../output/R-lecture-titles-dates.xlsx
fp <- file.path("..", "output")
fn <- "R-lecture-titles-dates.xlsx"

# Make sure the target directory exists before writing (no-op if it does)
dir.create(fp, recursive = TRUE, showWarnings = FALSE)

openxlsx::write.xlsx(lectures_dt, file.path(fp, fn))

# with xml2

### find the total number of pages

In [None]:
# Count the listing pages by following the "Next" pagination link
# (httr + xml2 variant).
#
# Page 1 is served at `base_url` itself; page n (n > 1) at
# `<base_url>page/<n>/`.
#
# Args:
#   base_url: character(1), URL of the first listing page (ending in "/").
#
# Returns:
#   numeric(1): the number of the last page that was fetched with HTTP
#   status 200. Returns 1 (with a warning) if even the first page failed.
find_total_pages <- function(base_url) {
    current_page <- 1
    last_page <- 1

    # XPath equivalent of the CSS selector "a.next.page-numbers": matches both
    # class tokens regardless of their order or of other classes present
    class_attr <- "concat(' ', normalize-space(@class), ' ')"
    next_xpath <- paste0(
        "//a[contains(", class_attr, ", ' next ') and ",
        "contains(", class_attr, ", ' page-numbers ')]"
    )

    repeat {
        # Page 1 has no number in its URL
        page_url <- if (current_page == 1) {
            base_url
        } else {
            paste0(base_url, "page/", current_page, "/")
        }

        # Fetch content; a connection failure ends the walk with a warning
        response <- tryCatch(
            httr::GET(page_url),
            error = function(e) {
                warning(
                    "Could not fetch ", page_url, ": ", conditionMessage(e),
                    call. = FALSE
                )
                NULL
            }
        )
        if (is.null(response)) {
            break
        }

        status <- httr::status_code(response)
        if (status != 200) {
            warning(
                "Stopping at ", page_url, ": HTTP status ", status,
                call. = FALSE
            )
            break
        }

        # Only count a page once it has actually been fetched
        last_page <- current_page

        webpage <- xml2::read_html(httr::content(response, as = "text"))

        # No "Next" link: this was the last page
        next_button <- xml2::xml_find_first(webpage, next_xpath)
        if (is.na(next_button)) {
            break
        }

        current_page <- current_page + 1
    }

    last_page
}

### scrape titles and dates

In [None]:
# Scrape one listing page into a table of lecture dates and titles
# (httr + xml2 variant).
#
# Args:
#   url: URL of the listing page to fetch.
#
# Returns:
#   A data.table with columns `Date` (Date, parsed from a "(d. m. yyyy)"
#   fragment of each title; NA where no such fragment is found) and
#   `Lecture_Title` (the title with that fragment removed and trimmed).
#   If the page cannot be fetched or does not answer with HTTP status 200,
#   a warning is raised and a zero-row table with the same columns is returned.
scrape_page <- function(url) {
    empty_result <- data.table::data.table(
        Date = as.Date(character()),
        Lecture_Title = character()
    )

    # Fetch the page; keep going (with a warning) if the request itself fails
    response <- tryCatch(
        httr::GET(url),
        error = function(e) {
            warning(
                "Skipping ", url, ": ", conditionMessage(e),
                call. = FALSE
            )
            NULL
        }
    )
    if (is.null(response)) {
        return(empty_result)
    }

    status <- httr::status_code(response)
    if (status != 200) {
        # Make dropped pages visible instead of losing them silently
        warning("Skipping ", url, ": HTTP status ", status, call. = FALSE)
        return(empty_result)
    }

    # Parse
    webpage <- xml2::read_html(httr::content(response, as = "text"))

    # Titles
    titles_nodes <- xml2::xml_find_all(
        webpage,
        "//div[contains(@class, 'aiovg-title')]//a[contains(@class, 'aiovg-link-title')]"
    )
    lecture_titles <- xml2::xml_text(titles_nodes)

    # One pattern for the "(d. m. yyyy)" fragment, shared by both steps below
    date_pattern <- "\\(\\d{1,2}\\. \\d{1,2}\\. \\d{4}\\)"

    # Dates: extract the fragment, strip the parentheses, convert to Date
    date_text <- stringr::str_extract(lecture_titles, date_pattern)
    date_text <- stringr::str_remove_all(date_text, "[()]")
    dates <- as.Date(date_text, format = "%d. %m. %Y")

    # Clean titles: drop the fragment plus any whitespace before it, then trim
    cleaned_titles <- stringr::str_trim(
        stringr::str_remove(lecture_titles, paste0("\\s*", date_pattern))
    )

    data.table::data.table(Date = dates, Lecture_Title = cleaned_titles)
}

### scrape

In [10]:
# Discover how many listing pages exist, build their URLs, scrape them all,
# previewing each intermediate result
total_pages <- find_total_pages(base_url)
head(total_pages)

# Pages 2..total_pages. `2:total_pages` would count DOWN to c(2, 1) when there
# is only one page; seq_len(...)[-1] is empty in that case, and sprintf()
# (unlike paste0()) returns a zero-length result for a zero-length argument.
later_pages <- seq_len(total_pages)[-1]
urls <- c(base_url, sprintf("%spage/%d/", base_url, later_pages))
head(urls)

lectures_dt <- data.table::rbindlist(lapply(urls, scrape_page))
head(lectures_dt)

Date,Lecture_Title
<date>,<chr>
2023-06-23,Odprta znanost v Evropskem raziskovalnem prostoru
2023-09-07,"Odprta znanost v Evropskem raziskovalnem prostoru, ponovitev"
2023-09-14,Nacionalna zakonodaja na področju odprte znanosti
2023-09-21,Slovenska skupnost odprte znanosti
2023-09-28,Možnosti in ugodnosti pri odprtem objavljanju za slovensko akademsko skupnost
2023-10-12,Platforma za odprte objave Open Research Europe


# session info

In [7]:
# Record the R version, platform, locale, and attached/loaded package versions
# so the run can be reproduced
sessionInfo()

R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=English_United Kingdom.utf8 
[2] LC_CTYPE=English_United Kingdom.utf8   
[3] LC_MONETARY=English_United Kingdom.utf8
[4] LC_NUMERIC=C                           
[5] LC_TIME=English_United Kingdom.utf8    

time zone: Europe/Ljubljana
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] magrittr_2.0.3

loaded via a namespace (and not attached):
 [1] zip_2.3.1         crayon_1.5.3      vctrs_0.6.5       httr_1.4.7       
 [5] cli_3.6.3         rlang_1.1.4       stringi_1.8.4     data.table_1.16.0
 [9] jsonlite_1.8.8    glue_1.7.0        selectr_0.4-2     htmltools_0.5.8.1
[13] IRdisplay_1.1     IRkernel_1.3.2    fansi_1.0.6       evaluate_0.24.0  
[17] fastmap_1.2.0     base64enc_0.1-3   openxlsx_4.2.7    lifecycle_1.0.4 