In [25]:
!pip install BeautifulSoup4



In [26]:
# I want to scrape images from a website

from bs4 import BeautifulSoup
import requests
import urllib.request
import os

# On the website I want all the images from the page
# I want to save them in a folder called "images"

website = "https://moviebarcode.tumblr.com/"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(website, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the blog.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')


In [27]:
import json
def get_imdb_info(post, timeout=30) -> tuple:
    """Fetch IMDb details for the first IMDb link found in a post.

    Args:
        post: Parsed post element. It must provide ``find_all("a")`` returning
            anchor elements that support ``.get("href")``.
        timeout: Seconds to wait for the IMDb response before giving up.

    Returns:
        Whatever ``get_movie_info`` returns for the first link whose href
        contains ``www.imdb.com``, or ``None`` when the post has no such link
        or when fetching/parsing the page raises.
    """
    for link in post.find_all("a"):
        # Anchors without an href must not abort the scan with a KeyError.
        href = link.get("href") or ""
        if "www.imdb.com" not in href:
            continue
        # Strip the href.li redirect prefix, if present, to get the IMDb URL.
        imdb_url = href.replace("https://href.li/?", "")
        try:
            movie_page = requests.get(imdb_url, timeout=timeout)
            movie_soup = BeautifulSoup(movie_page.content, 'html.parser')
            return get_movie_info(movie_soup, imdb_url)
        except Exception as e:
            # Best effort: report the problem and let the caller skip the post.
            print(f"Error: {e}")
            return None
    # No IMDb link in this post.
    return None

In [28]:
def get_movie_info(movie_soup, imdb_url) -> tuple:
    """Extract movie details from the ``__NEXT_DATA__`` JSON of a parsed page.

    Args:
        movie_soup: Parsed page; must provide ``find("script", id=...)``.
        imdb_url: URL of the page, passed through unchanged into the result.

    Returns:
        ``(title, runtime_seconds, rating, genres, imdb_url, title_type)``
        where ``genres`` is a comma-separated string, or ``None`` when the
        script tag is missing, empty, not valid JSON, or lacks an expected key.
    """
    # The page embeds its data as JSON in <script id="__NEXT_DATA__">.
    script = movie_soup.find("script", id="__NEXT_DATA__")
    if not script:
        return None
    try:
        # Parsing happens inside the try so that an empty tag or malformed
        # JSON yields None like every other extraction failure.
        movie_details = json.loads(script.contents[0])
        above_the_fold = movie_details["props"]["pageProps"]["aboveTheFoldData"]
        runtime = above_the_fold["runtime"]["seconds"]
        titleText = above_the_fold["originalTitleText"]["text"]
        rating = above_the_fold["ratingsSummary"]["aggregateRating"]
        genre_object = above_the_fold["genres"]["genres"]
        title_type = above_the_fold["titleType"]["id"]
        genre = ", ".join(entry["text"] for entry in genre_object)
        return titleText, runtime, rating, genre, imdb_url, title_type
    except Exception as e:
        print(f"Error: {e}")
        return None

In [29]:

def get_photo(post, movie_title, redownload=False, images_folder="./images") -> str:
    """Download the first image of a post and return its local path.

    The file name is derived from the image's ``alt`` text (first line only,
    with slashes, tabs and the arrow character removed and spaces replaced by
    underscores).

    Args:
        post: Parsed post element; must provide ``find("img")``.
        movie_title: Unused; kept so existing callers keep working.
        redownload: When true, download even if the file already exists.
        images_folder: Directory the image is stored in; created if missing.

    Returns:
        The path of the image file, or ``None`` when the post has no image
        with both ``src`` and ``alt`` attributes or the download fails.
    """
    photo = post.find("img")
    # A post without any <img> yields None here.
    if photo is None or not (photo.has_attr('src') and photo.has_attr('alt')):
        return None

    photo_name = photo['alt']
    # ensure the image name is safe
    photo_name = photo_name.replace('/', '').replace('\t', '').replace(' ', '_').replace('⇒', '').split('\n')[0]
    image_path = f"{images_folder}/{photo_name}.jpg"

    # Reuse the existing file unless the caller explicitly asked for a refresh.
    if os.path.isfile(image_path) and not redownload:
        return image_path

    try:
        # The target folder may not exist yet on a fresh run.
        os.makedirs(images_folder, exist_ok=True)
        print(photo['src'], image_path)
        # Download the image
        urllib.request.urlretrieve(photo['src'], image_path)
        return image_path
    except Exception as e:
        print(f"Error {e}")
        return None

In [30]:
def scrape_page_for_movies(soup, output_file, website):
    """Append one tab-separated row per movie post on a page to ``output_file``.

    Args:
        soup: Parsed page; must provide ``find_all("div", class_="post")``.
        output_file: Path of the TSV file. Its directory is created if needed
            and a header row is written when the file is missing or empty.
        website: URL of the scraped page, stored in the ``soup_url`` column.

    Posts for which ``get_imdb_info`` returns ``None`` are skipped; any other
    failure on a post is reported and the loop moves on to the next post.
    """
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0
    # Explicit UTF-8 so titles with non-ASCII characters are written the same
    # way on every platform.
    with open(output_file, 'a', encoding="utf-8") as f:
        if needs_header:
            # The header uses the same tab separator as the data rows so the
            # file can be read back with one column name per field.
            print("#title", "runtime", "rating", "genre", "image_path", "imdb_url", "soup_url", "type", sep="\t", file=f)

        # find all posts under div class="post"
        for post in soup.find_all("div", class_="post"):
            title = "Error"  # placeholder used in the error message below
            try:
                info = get_imdb_info(post)
                if info is None:
                    # No IMDb link, or the lookup failed; nothing to record.
                    print("Skipping post without IMDb info")
                    continue
                title, runtime, rating, genre, imdb_url, title_type = info
                filename = get_photo(post, title)
                print(title, runtime, rating, genre, filename, imdb_url, website, title_type, sep="\t", file=f)
            except Exception as e:
                print(f"Error {title} {e}")
                continue

In [31]:
# now I want to loop it on all the pages of the website
# max number of pages is 217

startpage = 1 # for when I get 404
endpage = 217
output_file = "./output/movies.tsv"
for pagenum in range(startpage, endpage + 1):
    website = f"https://moviebarcode.tumblr.com/page/{pagenum}"
    print(f"Scraping page {website}")
    try:
        # A timeout prevents one stalled request from hanging the whole crawl.
        page = requests.get(website, timeout=30)
    except requests.RequestException as e:
        # Report the failed page so the crawl can be resumed via startpage.
        print(f"Skipping page {website}: {e}")
        continue
    if not page.ok:
        # Do not parse an HTTP error page as if it were a blog page.
        print(f"Skipping page {website}: HTTP {page.status_code}")
        continue
    soup = BeautifulSoup(page.content, 'html.parser')
    scrape_page_for_movies(soup, output_file, website)

Scraping page https://moviebarcode.tumblr.com/page/1
https://64.media.tumblr.com/a13fd1906a71c2d724835e8026e3bdf1/tumblr_p87ez6Xymv1qhtovio1_1280.jpg ./images/Star_Wars:_Episode_VIII_-_The_Last_Jedi_(2017).jpg
https://64.media.tumblr.com/ec138a6f2741633f1243c51b9b155d28/tumblr_p87f3dpOXT1qhtovio1_1280.jpg ./images/Sequence_from:_Star_Wars:_Episode_VIII_-_The_Last_Jedi_(2017).jpg
https://64.media.tumblr.com/d235d568249d0007539a05e40b6621e2/tumblr_mt45wzGkY51qhtovio1_1280.jpg ./images/And_Then_There_Were_None_(1945).jpg
https://64.media.tumblr.com/0c74cbb005c13a2eda82f3237f778589/tumblr_obl0ilfHhd1qhtovio1_1280.jpg ./images/Die_Ehe_der_Maria_Braun__The_Marriage_of_Maria_Braun_(1979).jpg
https://64.media.tumblr.com/468f4df1c45416a451d2dd87dcf637ac/tumblr_obl0fzTctx1qhtovio1_1280.jpg ./images/A_Perfect_Day_(2015).jpg
https://64.media.tumblr.com/43bfbdb455283643d735bdf32c11bb7e/tumblr_obl0jyQ3MJ1qhtovio1_1280.jpg ./images/Close_Encounters_of_the_Third_Kind_(1977).jpg
https://64.media.tumblr

In [32]:
# Example of a single image download request (left commented out)
# urllib.request.urlretrieve("https://64.media.tumblr.com/0c74cbb005c13a2eda82f3237f778589/tumblr_obl0ilfHhd1qhtovio1_1280.jpg", "Star Wars: Episode VIII - The Last Jedi (2017).jpg")

In [39]:
# Open the TSV file and remove duplicate titles.
import pandas as pd

# The header line written by the scraper may not be tab-separated like the
# data rows, so skip it and name the eight columns explicitly. This keeps the
# title in its own column instead of relying on df.columns[0].
MOVIE_COLUMNS = ["title", "runtime", "rating", "genre", "image_path", "imdb_url", "soup_url", "type"]
movies_raw = pd.read_csv(
    "./output/movies.tsv",
    sep="\t",
    header=None,
    names=MOVIE_COLUMNS,
    skiprows=1,
)
# Keep only the first row for each title; build a new frame rather than
# mutating in place so re-running the cell gives the same result.
df = movies_raw.drop_duplicates(subset="title", keep="first")
df.to_csv("./output/movies_dedupe.tsv", sep="\t", index=False)