# Installation

In [3]:
# install.packages("magrittr")
# install.packages("purrr")
# install.packages("glue")
# install.packages("stringr")
# install.packages("rvest")
# remotes::install_github("dmi3kno/polite")

In [1]:
library(tidyverse)
library(magrittr) # better handling of pipes
library(purrr) # to work with lists and map functions
library(glue) # to paste strings
library(stringr) # to hand strings
library(rvest) # rvest makes scraping easier
library(polite)
library(httr)
library(jsonlite)

-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.2.1 --
[32mv[39m [34mggplot2[39m 3.2.0     [32mv[39m [34mpurrr  [39m 0.3.2
[32mv[39m [34mtibble [39m 2.1.3     [32mv[39m [34mdplyr  [39m 0.8.3
[32mv[39m [34mtidyr  [39m 0.8.3     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.3.1     [32mv[39m [34mforcats[39m 0.4.0
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: 'magrittr'

The following object is masked from 'package:purrr':

    set_names

The following object is masked from 'package:tidyr':

    extract


Attaching package: 'glue'

The following object is masked from 'package:dplyr':

    collapse

Loading required package: xml2

Attaching package: 'rvest'

The following object is masked from

### Titles from Rotten Tomatoes

In [3]:

#Here is where we select the amount of years to scrape
years <- seq(2018, 2019, 1)

#This function returns movie links for the years provided
data <- function(years){
    movie_links <- NULL #Resetting movie links in case of a previous save
    movie_links <- list()
    for(year in years){
        #Within the for loop we get the correct links from the html, for each year provided
        url_titles <- glue("https://www.rottentomatoes.com/top/bestofrt/?year={year}")
        base_page <- read_html(url_titles)

        movie_links <- append(movie_links, list(base_page %>%
            html_nodes(".allow-overflow") %>%
            html_nodes(".articleLink") %>%
            html_attr("href")
            ))}
    return(movie_links)
    }
movie_links <- data(years) #Perform the funtion on the years

In [15]:
#This function returns movie titles from the links provided.
get_movie_titles <- function(link){
    movie_base_page <- glue("https://www.rottentomatoes.com{link}") #Scraping the movie pages
    web_data <- movie_base_page %>%
        read_html()
    #grab movie title from the HTML
    movie_title <- web_data %>%
        html_nodes(".mop-ratings-wrap__title--top") %>%
        html_text()
    processed_data <- tibble("Movie Title" = movie_title)
    return(processed_data)
}
#puts all tibbles together into one 
all_df <- tibble()
    for (decade in movie_links){
      for (link in decade){
        temp <- get_movie_titles(link)
        all_df <- rbind(all_df, temp)
    }
}
all_df

Movie Title
<chr>
Black Panther
Mission: Impossible - Fallout
BlacKkKlansman
Spider-Man: Into the Spider-Verse
Roma
A Star Is Born
Eighth Grade
A Quiet Place
Can You Ever Forgive Me?
Paddington 2


### Api Querying

In [2]:
get_api_urls <- function(movie_title_df) {
    l <- nrow(movie_title_df)
    api_urls <- vector("list", 1) #initialising an empty list to add the urls too.
    j <- 1

    for (title in movie_title_df) { 
        api_urls[[j]] <- title %>% 
        str_replace_all(" ", "_") %>%
        tolower() %>% 
        sprintf("http://www.omdbapi.com/?t=%s&apikey=88925226", .) #Here we are inserting the titles to search into the api urls to request
        j <- j + 1
    }
    return(api_urls)
}


ERROR: Error in nrow(movie_title_df): object 'all_df' not found


In [24]:
api_urls[[1]]#Here are the api's urls we request data from

In [21]:
get_api_results <- function(api_urls) {
    api_results <- vector("list", length(api_urls)) #Empty list to insert that movie data into.
    i <- 1
    for (url in api_urls[[1]]) {
        api_response <- fromJSON(url)
        error_ressult = tryCatch({ #Errors can come up when an api call has returned an incorrect piece of data. Eg. when it finds a tv show with a similar name to a movie
            movie_data <- data.frame(api_response)[1,]
            if (ncol(movie_data) == 26) { 
                api_results[[i]] <- movie_data
                i <- i + 1
             } 
        }, error = function(e) { #Here we can print the error if we need too
#             print(e)
        }, finally = {
        })
    }
    
    return(api_results)
}



In [4]:
get_api_data_df <- function(api_results) {
    #Here we aare combining the dataframes
    from <- 2 #Can skip the first entry
    to <- length(api_results)

    api_data_df <- data.frame(api_results[1]) #create the first dataframe
    for (i in from:to) {
        temp <- data.frame(api_results[i]) #create a temporary dataframe
        api_data_df <- rbind(api_data_df,temp) #Using rbind the data can be merge vertically
    }
    return(api_data_df)
}



ERROR: Error in get_api_data_df(api_results): object 'api_results' not found


In [40]:
# api_urls <- get_api_urls(all_df)
# api_results <- get_api_results(api_urls)
# movie_data_df <- get_api_data_df(api_results)

all_df %>% get_api_urls() %>% get_api_results() %>% get_api_data_df()

length(movie_data_df)
movie_data_df

Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,...,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
Black Panther,2018,PG-13,16 Feb 2018,134 min,"Action, Adventure, Sci-Fi",Ryan Coogler,"Ryan Coogler, Joe Robert Cole, Stan Lee (based on the Marvel comics by), Jack Kirby (based on the Marvel Comics by)","Chadwick Boseman, Michael B. Jordan, Lupita Nyong'o, Danai Gurira","T'Challa, heir to the hidden but advanced kingdom of Wakanda, must step forward to lead his people into a new future and must confront a challenger from his country's past.",...,88,7.3,538936,tt1825683,movie,15 May 2018,"$501,105,037",Marvel Studios,https://www.marvel.com/movies/black-panther,True
Mission: Impossible - Fallout,2018,PG-13,27 Jul 2018,147 min,"Action, Adventure, Thriller",Christopher McQuarrie,"Bruce Geller (based on the television series created by), Christopher McQuarrie","Tom Cruise, Henry Cavill, Ving Rhames, Simon Pegg","Ethan Hunt and his IMF team, along with some familiar allies, race against time after a mission gone wrong.",...,86,7.8,247489,tt4912910,movie,04 Dec 2018,,Paramount Pictures,https://www.missionimpossible.com/,True
BlacKkKlansman,2018,R,10 Aug 2018,135 min,"Biography, Crime, Drama",Spike Lee,"Charlie Wachtel, David Rabinowitz, Kevin Willmott, Spike Lee, Ron Stallworth (based on the book by)","Alec Baldwin, John David Washington, Isiah Whitlock Jr., Robert John Burke","Ron Stallworth, an African American police officer from Colorado Springs, CO, successfully manages to infiltrate the local Ku Klux Klan branch with the help of a Jewish surrogate who eventually becomes its leader. Based on actual events.",...,83,7.5,160869,tt7349662,movie,06 Nov 2018,,Focus Features,http://www.focusfeatures.com/blackkklansman,True
Spider-Man: Into the Spider-Verse,2018,PG,14 Dec 2018,117 min,"Animation, Action, Adventure, Family, Sci-Fi","Bob Persichetti, Peter Ramsey, Rodney Rothman","Phil Lord (screenplay by), Rodney Rothman (screenplay by), Phil Lord (story by)","Shameik Moore, Jake Johnson, Hailee Steinfeld, Mahershala Ali","Teen Miles Morales becomes Spider-Man of his reality, crossing his path with five counterparts from other dimensions to stop a threat for all realities.",...,87,8.4,257039,tt4633694,movie,26 Feb 2019,,Sony Pictures,http://www.intothespiderverse.movie/,True
Roma,2018,R,21 Nov 2018,135 min,Drama,Alfonso Cuarón,Alfonso Cuarón,"Yalitza Aparicio, Marina de Tavira, Diego Cortina Autrey, Carlos Peralta",A year in the life of a middle-class family's maid in Mexico City in the early 1970s.,...,96,7.8,111965,tt6155172,movie,,,,,True
A Star Is Born,2018,R,05 Oct 2018,136 min,"Drama, Music, Romance",Bradley Cooper,"Eric Roth (screenplay by), Bradley Cooper (screenplay by), Will Fetters (screenplay by), Moss Hart (based on the 1954 screenplay by), John Gregory Dunne (based on the 1976 screenplay by), Joan Didion (based on the 1976 screenplay by), Frank Pierson (based on the 1976 screenplay by), William A. Wellman (based on a story by), Robert Carson (based on a story by)","Lady Gaga, Bradley Cooper, Sam Elliott, Andrew Dice Clay",A musician helps a young singer find fame as age and alcoholism send his own career into a downward spiral.,...,88,7.7,271609,tt1517451,movie,19 Feb 2019,,Warner Bros. Pictures,http://www.astarisbornmovie.com/,True
Eighth Grade,2018,R,03 Aug 2018,93 min,"Comedy, Drama",Bo Burnham,Bo Burnham,"Elsie Fisher, Josh Hamilton, Emily Robinson, Jake Ryan",An introverted teenage girl tries to survive the last week of her disastrous eighth grade year before leaving to start high school.,...,89,7.4,47799,tt7014006,movie,25 Sep 2018,,A24,,True
A Quiet Place,2018,PG-13,06 Apr 2018,90 min,"Drama, Horror, Sci-Fi",John Krasinski,"Bryan Woods (screenplay by), Scott Beck (screenplay by), John Krasinski (screenplay by), Bryan Woods (story by), Scott Beck (story by)","Emily Blunt, John Krasinski, Millicent Simmonds, Noah Jupe","In a post-apocalyptic world, a family is forced to live in silence while hiding from monsters with ultra-sensitive hearing.",...,82,7.5,319499,tt6644200,movie,10 Jul 2018,,Paramount Pictures,https://www.facebook.com/AQuietPlaceMovie,True
Can You Ever Forgive Me?,2018,R,02 Nov 2018,106 min,"Biography, Comedy, Crime, Drama",Marielle Heller,"Nicole Holofcener (screenplay by), Jeff Whitty (screenplay by)","Melissa McCarthy, Richard E. Grant, Dolly Wells, Ben Falcone","When Lee Israel falls out of step with current tastes, she turns her art form to deception.",...,87,7.1,33776,tt4595882,movie,,,Fox Searchlight Pictures,http://www.foxsearchlight.com/canyoueverforgiveme/,True
Paddington 2,2017,PG,12 Jan 2018,103 min,"Adventure, Comedy, Family, Fantasy",Paul King,"Paul King, Simon Farnaby, Michael Bond (""Paddington Bear"" created by), Jon Croker (additional material), Simon Stephenson (additional material)","Michael Gambon, Imelda Staunton, Ben Whishaw, Madeleine Harris","Paddington, now happily settled with the Brown family and a popular member of the local community, picks up a series of odd jobs to buy the perfect present for his Aunt Lucy's 100th birthday, only for the gift to be stolen.",...,88,7.8,48368,tt4468740,movie,,,,,True


### Titles from The Numbers

In [5]:
mylist <- list() #Create a list to house all the urls 
for (page in seq(1:56)) { #loop through to get to 5001 adding it to the end of the url 
    #adde the url to the list
    mylist <- c(mylist,as.character(paste0("https://www.the-numbers.com/movie/budgets/all/" , ((page - 1)* 100) + 1)))
}
#sanity check
mylist[1]

In [6]:
#Might take a while to run
movie_titles_list <- list() #Create list to house the lists of nodes
for(i in mylist){
    page_html <- read_html(i) #read in the url and turn it into html
    table_nodes <- page_html %>% html_nodes("table") %>% html_table() # Get the table in the html and make a list row out of it
    movie_titles_list <- append(movie_titles_list,table_nodes) #Append the table nodes to the list
}
#Check that something was obtained
length(movie_titles_list) #Should 56 

In [7]:
#Setting from and setting to, it is bad practice to have magic numbers in code and can lead to mistakes
from <- 2
to <- length(movie_titles_list)
df <- data.frame(movie_titles_list[1]) #create the first dataframe
for (i in from:to) { #start from 2 as use has been processed already
    temp <- data.frame(movie_titles_list[i]) #create a temporary dataframe
    df <- rbind(df,temp) #Using rbind the data can be merge vertically
}

In [8]:
names(df)[names(df) == 'Var.1'] <- 'Rank' # change the variable to something useful, found how to do this from https://stackoverflow.com/questions/7531868/how-to-rename-a-single-column-in-a-data-frame second answer. 
#The following functions have used this link as a reference: https://stackoverflow.com/questions/31944103/convert-currency-with-commas-into-numeric
df$ProductionBudget <- as.numeric(gsub('[$,]', '', df$ProductionBudget))
df$DomesticGross <- as.numeric(gsub('[$,]', '', df$DomesticGross))
df$WorldwideGross <- as.numeric(gsub('[$,]', '', df$WorldwideGross))
df <- na_if(df, 0)
df %>% select(Movie)


Movie
<chr>
Avatar
Pirates of the Caribbean: On Stranger Tides
Avengers: Endgame
Dark Phoenix
Avengers: Age of Ultron
Star Wars Ep. VIII: The Last Jedi
Star Wars Ep. VII: The Force Awakens
Avengers: Infinity War
Pirates of the Caribbean: At World’s End
Justice League


In [23]:
df %>% select(Movie)  %>% as_tibble() %>% get_api_urls() %>% get_api_results() %>% get_api_data_df()

ERROR: Error in open.connection(con, "rb"): HTTP error 401.


In [24]:
df %>% select(Movie) %>% head(10) %>% as_tibble() %>% get_api_urls() %>% get_api_results() %>% get_api_data_df()

Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,...,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
Avatar,2009,PG-13,18 Dec 2009,162 min,"Action, Adventure, Fantasy, Sci-Fi",James Cameron,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang",A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home.,...,83,7.8,1059860,tt0499549,movie,22 Apr 2010,"$749,700,000",20th Century Fox,http://www.avatarmovie.com/,True
Pirates of the Caribbean: On Stranger Tides,2011,PG-13,20 May 2011,136 min,"Action, Adventure, Fantasy",Rob Marshall,"Ted Elliott (screenplay), Terry Rossio (screenplay), Ted Elliott (screen story), Terry Rossio (screen story), Ted Elliott (characters), Terry Rossio (characters), Stuart Beattie (characters), Jay Wolpert (characters), Tim Powers (novel)","Johnny Depp, Penélope Cruz, Geoffrey Rush, Ian McShane","Jack Sparrow and Barbossa embark on a quest to find the elusive fountain of youth, only to discover that Blackbeard and his daughter are after it too.",...,45,6.6,453586,tt1298650,movie,18 Oct 2011,"$241,063,875",Walt Disney Pictures,http://Disney.com/Pirates,True
Avengers: Endgame,2019,PG-13,26 Apr 2019,181 min,"Action, Adventure, Sci-Fi","Anthony Russo, Joe Russo","Christopher Markus (screenplay by), Stephen McFeely (screenplay by), Stan Lee (based on the Marvel comics by), Jack Kirby (based on the Marvel comics by), Jim Starlin (Thanos, Gamora & Drax created by), Jack Kirby (Groot created by)","Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth","After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",...,78,8.6,561813,tt4154796,movie,30 Jul 2019,,Marvel Studios,,True
Dark Phoenix,2019,PG-13,07 Jun 2019,113 min,"Action, Adventure, Sci-Fi",Simon Kinberg,"Simon Kinberg, Stan Lee (comic book created by), Jack Kirby (comic book created by), Chris Claremont (story ""The Dark Phoenix Saga""), John Byrne (story ""The Dark Phoenix Saga""), Dave Cockrum (story ""The Dark Phoenix Saga"")","James McAvoy, Michael Fassbender, Jennifer Lawrence, Nicholas Hoult",Jean Grey begins to develop incredible powers that corrupt and turn her into a Dark Phoenix. Now the X-Men will have to decide if the life of a team member is worth more than all of humanity.,...,43,6.0,58485,tt6565702,movie,03 Sep 2019,,20th Century Fox,https://movies.disney.com/dark-phoenix,True
Avengers: Age of Ultron,2015,PG-13,01 May 2015,141 min,"Action, Adventure, Sci-Fi",Joss Whedon,"Joss Whedon, Stan Lee (based on the Marvel comics by), Jack Kirby (based on the Marvel comics by), Joe Simon (character created by: Captain America), Jack Kirby (character created by: Captain America), Jim Starlin (character created by: Thanos)","Robert Downey Jr., Chris Hemsworth, Mark Ruffalo, Chris Evans","When Tony Stark and Bruce Banner try to jump-start a dormant peacekeeping program called Ultron, things go horribly wrong and it's up to Earth's mightiest heroes to stop the villainous Ultron from enacting his terrible plan.",...,66,7.3,680834,tt2395427,movie,02 Oct 2015,"$429,113,729",Walt Disney Pictures,http://marvel.com/avengers,True
Avengers: Infinity War,2018,PG-13,27 Apr 2018,149 min,"Action, Adventure, Sci-Fi","Anthony Russo, Joe Russo","Christopher Markus (screenplay by), Stephen McFeely (screenplay by), Stan Lee (based on the Marvel comics by), Jack Kirby (based on the Marvel comics by), Joe Simon (Captain America created by), Jack Kirby (Captain America created by), Steve Englehart (Star-Lord created by), Steve Gan (Star-Lord created by), Bill Mantlo (Rocket Raccoon created by), Keith Giffen (Rocket Raccoon created by), Jim Starlin (Thanos, Gamora and Drax created by), Stan Lee (Groot created by), Larry Lieber (Groot created by), Jack Kirby (Groot created by), Steve Englehart (Mantis created by), Don Heck (Mantis created by)","Robert Downey Jr., Chris Hemsworth, Mark Ruffalo, Chris Evans",The Avengers and their allies must be willing to sacrifice all in an attempt to defeat the powerful Thanos before his blitz of devastation and ruin puts an end to the universe.,...,68,8.5,706537,tt4154756,movie,14 Aug 2018,"$664,987,816",Walt Disney Pictures,http://marvel.com/movies/movie/223/avengers_infinity_war,True
Justice League,2017,PG-13,17 Nov 2017,120 min,"Action, Adventure, Fantasy, Sci-Fi",Zack Snyder,"Jerry Siegel (Superman created by), Joe Shuster (Superman created by), Chris Terrio (story by), Zack Snyder (story by), Chris Terrio (screenplay by), Joss Whedon (screenplay by), Gardner Fox (Justice League of America created by), Bob Kane (Batman created by), Bill Finger (Batman created by), William Moulton Marston (Wonder Woman created by), Jack Kirby (Fourth World created by)","Ben Affleck, Henry Cavill, Amy Adams, Gal Gadot","Fueled by his restored faith in humanity and inspired by Superman's selfless act, Bruce Wayne enlists the help of his new-found ally, Diana Prince, to face an even greater enemy.",...,45,6.4,339190,tt0974015,movie,13 Mar 2018,"$227,032,490",Warner Bros. Pictures,http://www.justiceleaguethemovie.com/,True
