# Import Libraries

In [43]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests as rq
import pygal
from rotten_tomatoes_scraper.rt_scraper import MovieScraper
from IPython.display import display, HTML

# Scraping

## IMDb top 250 films scraping

In [2]:
# build the url of every search-result page of the IMDb top 250 listing;
# the `start` offset runs 1, 51, 101, 151, 201 (steps of 50, stopping at 250)
pages_urls = [
    f'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start={first_rank}&ref_=adv_nxt'
    for first_rank in range(1, 251, 50)
]

In [104]:
# show the generated search-page urls as a sanity check
pages_urls

['https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=1&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=51&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=101&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=151&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=201&ref_=adv_nxt']

In [3]:
# download the raw html of every search page listed in pages_urls
pages_content = []

# request the English (US) localisation of each page
headers = {"Accept-Language": "en-US,en;q=0.5"}

for page in pages_urls:
    # a timeout keeps one stalled connection from hanging the whole cell
    response = rq.get(page, headers=headers, timeout=30)
    if response.status_code != 200:
        # name the failing page and its status so gaps in pages_content are traceable
        print(f"Error fetching page (status {response.status_code}): {page}")
    else:
        pages_content.append(response.content)

In [41]:
def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else None


# collect the html node of every film listed on the fetched pages
film_infos = []

for content in pages_content:
    soup = bs(content, "html.parser")
    film_infos.extend(soup.find_all("div", class_="lister-item mode-advanced"))

# one row per film: [title, year, runtime, genre, certificate, rating, director]
film_data = []

# extract the fields of each film from its html node
# NOTE: titles are stored exactly as scraped; this cell does not lowercase them
# or replace spaces with underscores
for info in film_infos:
    title = (info.find("h3", class_="lister-item-header").findChildren()[1]).get_text(strip=True)
    # slice from the end so any text in front of the final "(YYYY)" group is ignored
    year = (info.h3.findChildren()[2]).get_text(strip=True)[-5:-1]

    details = info.find("p", class_="text-muted")
    # look the runtime up by class instead of by position, so a missing sibling
    # (e.g. no certificate) cannot shift it onto the wrong element
    # NOTE(review): assumes the runtime span carries class "runtime", like the
    # "genre" and "certificate" spans next to it - confirm against the page markup
    runtime_text = _text_or_none(details.find("span", class_="runtime"))
    # keep only the leading number ("96 min" -> "96"), without a trailing space
    runtime = runtime_text.split()[0] if runtime_text else None
    genre = _text_or_none(details.find("span", class_="genre"))
    # the certificate span may be absent; record None instead of raising
    certificate = _text_or_none(info.find("span", class_="certificate"))
    rating = (info.find("div", class_="ratings-bar").find("div", class_='inline-block ratings-imdb-rating').findChildren()[1]).get_text(strip=True)
    director = info.find_all('p', class_="")[0].a.text.strip()

    film_data.append([title, year, runtime, genre, certificate, rating, director])

# build the dataframe once, after all rows are collected; this also defines
# top250_film (as an empty frame) when nothing was scraped
top250_film = pd.DataFrame(film_data, columns=['title', 'year', 'runtime', 'genre', 'certificate', 'rating', 'director'])


In [42]:
# show the scraped table (the rendered output elides the middle rows)
top250_film

Unnamed: 0,title,year,runtime,genre,certificate,rating,director
0,The Shawshank Redemption,1994,142,Drama,R,9.3,Frank Darabont
1,The Godfather,1972,175,"Crime, Drama",R,9.2,Francis Ford Coppola
2,The Dark Knight,2008,152,"Action, Crime, Drama",PG-13,9.0,Christopher Nolan
3,The Godfather: Part II,1974,202,"Crime, Drama",R,9.0,Francis Ford Coppola
4,12 Angry Men,1957,96,"Crime, Drama",Approved,9.0,Sidney Lumet
...,...,...,...,...,...,...,...
245,Mr. Smith Goes to Washington,1939,129,"Comedy, Drama",Passed,8.1,Frank Capra
246,Gone with the Wind,1939,238,"Drama, History, Romance",Passed,8.1,Victor Fleming
247,It Happened One Night,1934,105,"Comedy, Romance",Passed,8.1,Frank Capra
248,The Passion of Joan of Arc,1928,114,"Biography, Drama, History",Passed,8.1,Carl Theodor Dreyer


----------------------------------------

## Rotten Tomatoes tomatometer and audience score scraping

In [40]:
# NOTE(review): the titles rendered below are lowercased, underscore-separated and
# have "the" removed, yet no visible cell applies that transformation - confirm
# where it happens so the notebook still works after Restart & Run All
top250_film

Unnamed: 0,title,year,runtime,genre,certificate,rating,director
0,shawshank_redemption,1994,142,Drama,R,9.3,Frank Darabont
1,godfather,1972,175,"Crime, Drama",R,9.2,Francis Ford Coppola
2,dark_knight,2008,152,"Action, Crime, Drama",PG-13,9.0,Christopher Nolan
3,godfather:_part_ii,1974,202,"Crime, Drama",R,9.0,Francis Ford Coppola
4,12_angry_men,1957,96,"Crime, Drama",Approved,9.0,Sidney Lumet
...,...,...,...,...,...,...,...
245,mr._smith_goes_to_washington,1939,129,"Comedy, Drama",Passed,8.1,Frank Capra
246,gone_with_wind,1939,238,"Drama, History, Romance",Passed,8.1,Victor Fleming
247,it_happened_one_night,1934,105,"Comedy, Romance",Passed,8.1,Frank Capra
248,passion_of_joan_of_arc,1928,114,"Biography, Drama, History",Passed,8.1,Carl Theodor Dreyer


In [None]:
# query Rotten Tomatoes once per film title and print whatever metadata comes back
for film_title in top250_film['title']:
    scraper = MovieScraper(movie_title=film_title)
    # extract_metadata() yielding 404 is treated as "film not found"
    lookup_failed = scraper.extract_metadata() == 404
    if lookup_failed:
        print('No film found')
        continue
    print(scraper.metadata)