In [1]:
library(tidyverse)
library(magrittr) # better handling of pipes
library(purrr) # to work with lists and map functions
library(glue) # to paste strings
library(stringr) # to hand strings
library(rvest) # rvest makes scraping easier
library(polite) # polite is the "polite" version of rvest
library(xml2) # makes it easier to work with HTML and XML from R

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: 'magrittr'


The following object is masked from 'package:purrr':

    set_names


The following object is masked from 'package:tidyr':

    extract



Attaching package: 'rvest'


The following object is masked from 'package:readr':

    guess_encoding




In [6]:
# Download and parse the AudioCulture index page for the "Person" category
audio_culture_page = read_html("https://www.audioculture.co.nz/music_index?category=Person")

# Display text of every element with class "skippy-col-link" (the profile names)
Profiles = html_text(html_nodes(audio_culture_page, ".skippy-col-link"))

# The href attribute of those same elements, in the same order as Profiles
Links = html_attr(html_nodes(audio_culture_page, ".skippy-col-link"), "href")

# Lookup table pairing each profile name with its link
df = data.frame(Profiles, Links)

# Builds a table of NZ music profiles and albums that spell out a given string.
#
# Every letter or digit of `cafe_name` (all other characters are stripped) is
# matched, case-insensitively, against the first character of the names in the
# global `Profiles` vector. For each character one matching profile is chosen
# at random and one album is chosen at random from that profile's discography
# (fetched with get_discography()).
#
# Args:
#   cafe_name: String whose alphanumeric characters drive the selection.
#
# Returns:
#   A data frame with one row per alphanumeric character of `cafe_name` and
#   the columns Character, Profile, Album, Featuring and Year.
#   Profile is NA when no profile name starts with the character; Album,
#   Featuring and Year are NA when none of the tried profiles had an album.
#   When `cafe_name` has no alphanumeric characters the data frame has the
#   same columns and zero rows.
create_profile_list = function(cafe_name) {
    # Maximum number of profiles tried per character before giving up
    max_attempts = 5

    # One-row entry used whenever no album can be reported for a character
    empty_row = function(character, profile) {
        data.frame(Character = character,
                   Profile = profile,
                   Album = NA_character_,
                   Featuring = NA_character_,
                   Year = NA_real_)
    }

    # Fetches a discography; a failed fetch is reported as a warning and
    # counted as an empty discography so one bad page does not abort the run
    fetch_discography = function(profile) {
        tryCatch(
            get_discography(profile),
            error = function(e) {
                warning("Could not fetch discography for '", profile, "': ",
                        conditionMessage(e), call. = FALSE)
                data.frame()
            }
        )
    }

    # Builds the one-row entry for a single character of the input string
    pick_album = function(char) {
        # Both cases of the character go into the REGEX character class
        upper_char = toupper(char)
        lower_char = tolower(char)

        # Profiles whose name starts with the character
        match_list = grep(glue('^[{upper_char}{lower_char}]'), Profiles, value = TRUE)

        # If no match, create empty entry for the character
        if (length(match_list) == 0) {
            return(empty_row(upper_char, NA_character_))
        }

        # Distinct random candidates, so an attempt is never wasted on
        # re-fetching a profile that was already found to be empty
        candidate_index = sample.int(length(match_list),
                                     min(max_attempts, length(match_list)))
        candidates = match_list[candidate_index]

        for (profile_match in candidates) {
            profile_discography = fetch_discography(profile_match)

            if (nrow(profile_discography) >= 1) {
                # Randomly selects one album (row) from the discography
                album_index = sample.int(nrow(profile_discography), 1)
                album = profile_discography[album_index, , drop = FALSE]

                return(data.frame(Character = upper_char,
                                  Profile = profile_match,
                                  album,
                                  row.names = NULL))
            }
        }

        # Every tried profile had an empty discography: report the last one tried
        empty_row(upper_char, profile_match)
    }

    # Removes special symbols and whitespace from string
    cleaned_string = gsub("([^A-Za-z0-9])+", "", cafe_name)

    # Splits string into single characters for iteration
    char_list = strsplit(cleaned_string, "")[[1]]

    # Nothing to look up: return the usual columns with zero rows
    if (length(char_list) == 0) {
        return(empty_row(NA_character_, NA_character_)[0, , drop = FALSE])
    }

    # Build every row first and bind once instead of growing a data frame
    # with rbind() on each iteration
    profile_rows = lapply(char_list, pick_album)
    profile_list = do.call(rbind, profile_rows)

    return(profile_list)
}

# Fetches the discography of a single AudioCulture profile.
#
# The profile is looked up in the global `df` table (columns Profiles and
# Links). An exact name match is preferred; if there is none, the first
# profile containing `name` as a literal substring is used. `name` is never
# interpreted as a regular expression.
#
# Args:
#   name: Profile name to look up.
#
# Returns:
#   A data frame with the columns Album, Featuring and Year, containing only
#   the entries whose Year is a number >= 1000. Empty Album / Featuring
#   strings are replaced with NA.
#
# Errors:
#   Stops when no profile matches `name` or the matched profile has no link.
#   Errors raised by read_html() (e.g. network failures) are not caught.
get_discography = function(name) {
    # Exact match first, so a short name cannot resolve to a longer one
    matches = which(df$Profiles == name)

    # Fall back to a literal substring match; fixed() keeps characters such as
    # "(", "*" or "." in a name from being read as REGEX syntax
    if (length(matches) == 0) {
        matches = which(str_detect(df$Profiles, fixed(name)))
    }

    if (length(matches) == 0) {
        stop("No profile matching '", name, "' was found", call. = FALSE)
    }

    name_link = df$Links[matches[1]]

    # Without this check glue() would silently build a URL containing "NA"
    if (is.na(name_link)) {
        stop("Profile '", name, "' has no link", call. = FALSE)
    }

    profile_discography_page = read_html(glue('https://www.audioculture.co.nz{name_link}/discography'))

    # NOTE(review): the three vectors are assumed to line up one-to-one per
    # album; nothing here verifies that the page yields equal counts - confirm
    Album     = profile_discography_page %>% html_nodes("[class='header']") %>% html_text()
    Featuring = profile_discography_page %>% html_nodes("[class='body']")   %>% html_text()
    Year      = profile_discography_page %>% html_nodes("[class='year']")   %>% html_text() %>% as.numeric()

    # Replaces empty entries with NA
    Album[Album == ""] = NA_character_
    Featuring[Featuring == ""] = NA_character_

    # Data frame of the given profile's discography
    discography_df = data.frame(Album, Featuring, Year)

    # filter() drops rows where the condition is FALSE or NA, so this removes
    # both years below 1000 and years that could not be parsed as a number
    return(discography_df %>% filter(Year >= 1000))
}

In [8]:
# Example run: one random profile and album per letter of the cafe name
create_profile_list(cafe_name = "Vick's Cafe")

Character,Profile,Album,Featuring,Year
<chr>,<chr>,<chr>,<chr>,<dbl>
V,Val Murphy,My Way Of Singing,Val Murphy,1965
I,Ike Metekingi,Hippy Hippy Shake,The Maori Hi Five,1962
C,Crowded House,Into Temptation,Crowded House,1988
K,Kim Willoughby,Look Around Again/ Have You Ever Seen The Rain?,Kim Willoughby,1990
S,Sina,Boy,Sina*,1998
C,Creation,Can't Help Myself,Creation (7),1972
A,Ardijah,The Best: PolyFonk,Ardijah,2010
F,"Four Fours, The",From The Bottom Of My Heart / Stingray,Four Fours,1966
E,Evasive Action,Imminent Impact,Evasive Action,2008
