In [1]:
library(tidyverse)
library(magrittr) # better handling of pipes
library(purrr) # to work with lists and map functions
library(glue) # to paste strings
library(stringr) # to hand strings
library(rvest) # rvest makes scraping easier
library(polite)

-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.2.0     v purrr   0.3.2
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   0.8.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Attaching package: 'magrittr'

The following object is masked from 'package:purrr':

    set_names

The following object is masked from 'package:tidyr':

    extract


Attaching package: 'glue'

The following object is masked from 'package:dplyr':

    collapse

Loading required package: xml2

Attaching package: 'rvest'

The following object is masked from

In [47]:
# Years to scrape: `from` is the first year, `to` is the last year (inclusive).
years <- seq(from = 1950, to = 2019, by = 1)

In [3]:

#' Grab the movie links listed on the Rotten Tomatoes "best of" page of each
#' requested year.
#'
#' NOTE(review): the name shadows `utils::data()`; it is kept because the rest
#' of the notebook calls `data(years)`.
#'
#' @param years Vector of years. Each value is substituted into the `year`
#'   query parameter of the "bestofrt" URL, and one page is downloaded per
#'   value.
#' @return An unnamed list with one element per entry of `years` (in the same
#'   order). Each element is the character vector of `href` attributes of the
#'   `.articleLink` nodes found inside `.allow-overflow` nodes on that page.
#'   An empty `years` gives an empty list.
data <- function(years) {
    # lapply() builds the result in one pass instead of growing a list with
    # append() on every iteration; unname() keeps the result unnamed, exactly
    # as the loop-built list was.
    lapply(unname(years), function(year) {
        url_titles <- glue("https://www.rottentomatoes.com/top/bestofrt/?year={year}")
        read_html(url_titles) %>%
            html_nodes(".allow-overflow") %>%
            html_nodes(".articleLink") %>%
            html_attr("href")
    })
}



In [51]:
# Collect the movie links for every year in `years` using the notebook's own
# data() function (one list element per year), then display the result.
movie_links <- data(years)
movie_links

In [67]:
#' Scrape the scores and metadata of one Rotten Tomatoes movie page.
#'
#' @param link Path of the movie page, as found in the `href` values collected
#'   by `data()`; it is appended to "https://www.rottentomatoes.com".
#' @return A one-row tibble with the columns "Movie Title" (chr),
#'   "Critic Score (%)" (dbl), "Audience Score (%)" (dbl), "Movie Rating"
#'   (chr), "Box Office ($)" (dbl), "Runtime (m)" (dbl), "Genre" (list holding
#'   a character vector of genres) and "Studio" (chr). A field that is not
#'   present on the page is `NA` of the column's type, so rows from different
#'   pages always combine cleanly.
get_info <- function(link) {
    movie_base_page <- glue("https://www.rottentomatoes.com{link}")
    web_data <- read_html(movie_base_page)

    # Percentages shown in the scoreboard; the first is used as the critic
    # score and the second as the audience score below.
    movie_reviews <- web_data %>%
        html_nodes(".js-scoreboard-container") %>%
        html_nodes(".mop-ratings-wrap__percentage") %>%
        html_text() %>%
        trimws()

    # `[1]` turns "no title node" into NA instead of a zero-length column and
    # guarantees exactly one row per movie.
    movie_title <- web_data %>%
        html_nodes(".mop-ratings-wrap__title--top") %>%
        html_text()
    movie_title <- movie_title[1]

    # Supplementary data: read label and value row by row. html_node() returns
    # exactly one result per row (missing -> NA), so labels and values cannot
    # drift out of alignment when a row lacks one of the two nodes.
    meta_rows <- web_data %>%
        html_nodes(".content-meta.info") %>%
        html_nodes(".meta-row.clearfix")
    meta_labels <- meta_rows %>%
        html_node(".meta-label") %>%
        html_text() %>%
        trimws()
    meta_values <- meta_rows %>%
        html_node(".meta-value") %>%
        html_text() %>%
        trimws()

    # Value of the first row carrying `label`, or NA when the page has no such
    # row. A direct lookup avoids reshaping and tolerates duplicate labels.
    meta_field <- function(label) {
        hit <- match(label, meta_labels)
        if (is.na(hit)) NA_character_ else meta_values[[hit]]
    }

    # Keep only digits and dots before converting; a missing field stays NA.
    keep_number <- function(text) {
        as.numeric(gsub("[^0-9\\.]", "", text))
    }

    rating <- meta_field("Rating:")
    studio <- meta_field("Studio:")
    runtime <- keep_number(meta_field("Runtime:"))
    box_office <- keep_number(meta_field("Box Office:"))

    # Strip every whitespace character (not only " " and "\n", which left
    # stray padding around some genres) and split on commas. strsplit() always
    # returns a list, so "Genre" is a list column even when the field is NA.
    genre <- meta_field("Genre:") %>%
        str_replace_all("\\s", "") %>%
        strsplit(",")

    tibble(
        "Movie Title" = movie_title,
        "Critic Score (%)" = readr::parse_number(movie_reviews[1]),
        "Audience Score (%)" = readr::parse_number(movie_reviews[2]),
        "Movie Rating" = rating,
        "Box Office ($)" = box_office,
        "Runtime (m)" = runtime,
        "Genre" = genre,
        "Studio" = studio
    )
}

# Base Test
# get_info(movie_links[[1]][[1]])


# Scrape every linked movie page and stack the per-movie tibbles into one.
# `movie_links` holds one character vector of links per year, so it is
# flattened first. The rows are gathered in a list and bound with a single
# rbind() call; calling rbind() inside the loop re-copied the whole
# accumulated tibble for every movie.
movie_pages <- unlist(movie_links, use.names = FALSE)
movie_rows <- lapply(movie_pages, get_info)
all_df <- if (length(movie_rows) > 0) {
    do.call(rbind, movie_rows)
} else {
    tibble()  # nothing scraped: same empty tibble the loop started from
}



Movie Title,Critic Score (%),Audience Score (%),Movie Rating,Box Office ($),Runtime (m),Genre,Studio
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<list>,<chr>
Black Panther,96,79,"PG-13 (for prolonged sequences of action violence, and a brief rude gesture)",501105037,135,"Action&Adventure , Drama , ScienceFiction&Fantasy",Marvel Studios


In [68]:
# Display the combined tibble of all scraped movies.
all_df

Movie Title,Critic Score (%),Audience Score (%),Movie Rating,Box Office ($),Runtime (m),Genre,Studio
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<list>,<chr>
Black Panther,96,79,"PG-13 (for prolonged sequences of action violence, and a brief rude gesture)",501105037,135,"Action&Adventure , Drama , ScienceFiction&Fantasy",Marvel Studios
Mission: Impossible - Fallout,97,88,"PG-13 (for violence and intense sequences of action, and for brief strong language)",,147,"Action&Adventure, Drama , Mystery&Suspense",Paramount Pictures
BlacKkKlansman,96,82,"R (for language throughout, including racial epithets, and for disturbing/violent material and some sexual references)",,135,"Comedy, Drama",Focus Features
Spider-Man: Into the Spider-Verse,97,93,"PG (for frenetic sequences of animated action violence, thematic elements, and mild language)",,100,"Action&Adventure , Animation , Kids&Family , ScienceFiction&Fantasy",Sony Pictures
Roma,96,71,"R (for graphic nudity, some disturbing images, and language)",,135,Drama,Netflix
A Star Is Born,90,79,"R (for language throughout, some sexuality/nudity and substance abuse)",,135,Drama,Warner Bros. Pictures
Eighth Grade,99,83,R (for language and some sexual material),,94,Comedy,A24
A Quiet Place,95,83,PG-13 (for terror and some bloody images),,90,"Drama , Horror , Mystery&Suspense",Paramount Pictures
Can You Ever Forgive Me?,98,81,"R (for language including some sexual references, and brief drug use)",,107,"Comedy, Drama",Fox Searchlight Pictures
Paddington 2,100,88,PG (for some action and mild rude humor),,105,"Animation , Comedy , Kids&Family",Warner Bros. Pictures
