In [21]:
!pip install BeautifulSoup4



In [22]:
# I want to scrape images from a website

from bs4 import BeautifulSoup
import requests
import urllib.request
import os

# On the website I want all the images from the page
# I want to save them in a folder called "images"

# Fetch and parse the blog's front page.
website = "https://moviebarcode.tumblr.com/"
# Without a timeout requests waits forever if the server never answers.
page = requests.get(website, timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')


In [23]:
import json
def get_imdb_info(post, timeout=30) -> tuple:
    """Look up movie details for the first IMDb link found in a post.

    Args:
        post: Parsed post element; only ``post.find_all("a")`` is used.
        timeout: Seconds to wait for the IMDb page before the request fails.

    Returns:
        The result of ``get_movie_info`` for the linked page, or ``None``
        when the post has no IMDb link or fetching/parsing raises.
    """
    for link in post.find_all("a"):
        # Anchors without an href must not abort the scan with a KeyError.
        href = link.get("href") or ""
        if "www.imdb.com" not in href:
            continue
        # Drop the href.li redirect prefix, if present, to get the bare URL.
        imdb_url = href.replace("https://href.li/?", "")
        try:
            # A timeout keeps one unresponsive request from stalling the run.
            movie_page = requests.get(imdb_url, timeout=timeout)
            movie_soup = BeautifulSoup(movie_page.content, 'html.parser')
            return get_movie_info(movie_soup, imdb_url)
        except Exception as e:
            # Best effort: report the problem and let the caller skip the post.
            print(f"Error: {e}")
            return None
    # No IMDb link in this post.
    return None

def get_movie_info(movie_soup, imdb_url) -> tuple:
    """Extract title, runtime, rating and genres from a parsed movie page.

    The data is read from the JSON inside the ``<script id="__NEXT_DATA__">``
    tag, under ``props.pageProps.aboveTheFoldData``.

    Args:
        movie_soup: Parsed page; only ``find("script", id=...)`` is used.
        imdb_url: URL the page was fetched from; returned unchanged.

    Returns:
        ``(title, runtime_seconds, rating, genres, imdb_url)`` where
        ``genres`` is a comma-separated string, or ``None`` when the script
        tag is missing, empty, not valid JSON, or lacks an expected key.
    """
    script = movie_soup.find("script", id="__NEXT_DATA__")
    if not script:
        return None
    try:
        # Parsing lives inside the try so an empty or malformed script tag
        # returns None like every other failure instead of raising.
        movie_details = json.loads(script.contents[0])
        above_the_fold = movie_details["props"]["pageProps"]["aboveTheFoldData"]
        runtime = above_the_fold["runtime"]["seconds"]
        titleText = above_the_fold["originalTitleText"]["text"]
        rating = above_the_fold["ratingsSummary"]["aggregateRating"]
        genre = ", ".join(entry["text"] for entry in above_the_fold["genres"]["genres"])
        return titleText, runtime, rating, genre, imdb_url
    except Exception as e:
        print(f"Error: {e}")
        return None

def get_photo(post, movie_title, redownload=False, images_folder="./images") -> str:
    """Download the first image of a post and return the local file path.

    The file is named after the image's ``alt`` text with a ``.jpg`` suffix.

    Args:
        post: Parsed post element; only ``post.find("img")`` is used.
        movie_title: Unused; kept so existing callers keep working.
        redownload: When true, fetch the image even if the file already
            exists. When false, an existing file is reused.
        images_folder: Directory for the images; created if missing.

    Returns:
        The image path, or ``None`` when the post has no usable ``<img>``
        (missing tag, ``src`` or ``alt``) or the download fails.
    """
    photo = post.find("img")
    # find() yields None for posts without an image.
    if photo is None or not (photo.has_attr('src') and photo.has_attr('alt')):
        return None

    # Make the alt text usable as a file name: no path separators, and no
    # line breaks or tabs (they would also split the row in the TSV output).
    photo_name = photo['alt'].replace('/', '')
    for control_char in "\r\n\t":
        photo_name = photo_name.replace(control_char, ' ')
    image_path = f"{images_folder}/{photo_name}.jpg"

    # Reuse a previously downloaded file unless a fresh copy was requested.
    if not redownload and os.path.isfile(image_path):
        return image_path

    try:
        os.makedirs(images_folder, exist_ok=True)
        print(photo['src'], image_path)
        urllib.request.urlretrieve(photo['src'], image_path)
        return image_path
    except Exception as e:
        # Best effort: a failed download must not stop the whole scrape.
        print(f"Error {e}")
        return None

def scrape_page_for_movies(soup, output_file, website):
    """Append one tab-separated row per movie post on a page to ``output_file``.

    For every ``<div class="post">`` the IMDb details are looked up with
    ``get_imdb_info`` and the image is saved with ``get_photo``. Posts for
    which no IMDb details are available are skipped.

    Args:
        soup: Parsed page; only ``find_all("div", class_="post")`` is used.
        output_file: Path of the TSV file. It is created with a header line
            if missing (including its parent directory) and appended to
            otherwise.
        website: URL of the scraped page, written as the last column.
    """
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8 is set explicitly because the platform default may not be able
    # to encode every movie title.
    if not os.path.isfile(output_file):
        with open(output_file, "w", encoding="utf-8") as f:
            # The header uses the same tab separator as the data rows.
            print("#title", "runtime", "rating", "genre", "image_path", "imdb_url", "soup_url", sep="\t", file=f)

    with open(output_file, 'a', encoding="utf-8") as f:
        for post in soup.find_all("div", class_="post"):
            title = "Error"  # placeholder for the message if the lookup itself fails
            try:
                info = get_imdb_info(post)
                if info is None:
                    print("Skipping post: no IMDb details found")
                    continue
                title, runtime, rating, genre, imdb_url = info
                filename = get_photo(post, title)
                print(title, runtime, rating, genre, filename, imdb_url, website, sep="\t", file=f)
            except Exception as e:
                # Best effort: one broken post must not stop the page.
                print(f"Error {title} {e}")


In [29]:
# Scrape every page of the blog and append the results to one TSV file.
# The blog has at most 217 pages.

startpage = 101 # raise this to resume after an interrupted run (e.g. after a 404)
endpage = 217
output_file = "./output/movies.tsv"
for pagenum in range(startpage, endpage + 1):
    website = f"https://moviebarcode.tumblr.com/page/{pagenum}"
    print(f"Scraping page {website}")
    # Without a timeout a single unanswered request would stall the loop forever.
    page = requests.get(website, timeout=30)
    if not page.ok:
        # An error page contains no posts; report it instead of parsing it silently.
        print(f"Skipping page {website}: HTTP {page.status_code}")
        continue
    soup = BeautifulSoup(page.content, 'html.parser')
    scrape_page_for_movies(soup, output_file, website)

Scraping page https://moviebarcode.tumblr.com/page/101
https://64.media.tumblr.com/e49cc6152bddcfcfb9414306e730280d/tumblr_mw67eksbUZ1qhtovio1_1280.jpg ./images/Pixar Shorts: Jack-Jack Attack (2005).jpg
https://64.media.tumblr.com/3ad966724f7d07dd1deea30ddfb0be60/tumblr_mv4jpdPof91qhtovio1_1280.jpg ./images/Zardoz (1974).jpg
https://64.media.tumblr.com/56dc7b911a2d2e63d9f8a28361277ac7/tumblr_mv5222ioPB1qhtovio1_1280.jpg ./images/Paradies: Hoffnung  Paradise: Hope (2013).jpg
https://64.media.tumblr.com/99b206b58d4643180cc109cbe2ec1d01/tumblr_mvt4s18JQF1qhtovio1_1280.jpg ./images/The Complete Danger 5 (2011)
Season One.jpg
https://64.media.tumblr.com/da5226b98c1e9c6cc3089a5fb646c1c4/tumblr_muaz8kVVXq1qhtovio1_1280.jpg ./images/Down in the Valley (2005).jpg
https://64.media.tumblr.com/8e41f75e9573274c222a04acf50150cd/tumblr_mv4ji4TeYw1qhtovio1_1280.jpg ./images/The Wolf Man (1941).jpg
https://64.media.tumblr.com/462aced25ef2e6c11df00df1ec7c661a/tumblr_mtn16hxyRx1qhtovio1_r1_1280.jpg ./ima

In [25]:
#Example single request
# urllib.request.urlretrieve("https://64.media.tumblr.com/0c74cbb005c13a2eda82f3237f778589/tumblr_obl0ilfHhd1qhtovio1_1280.jpg", "Star Wars: Episode VIII - The Last Jedi (2017).jpg")