Let's get started! 

As always, we start by importing the libraries we'll need for this little exercise. 

In [1]:
from bs4 import BeautifulSoup as bs
from pandas.io.json import json_normalize
import json
import matplotlib as mpl
import pandas as pd
import re
import requests
import seaborn as sns
import unicodedata as uni

And we'll set up some constants that will be driving our data analytics project here. The base URL for the website won't change, and we'll be using my username as the running example. 

In [2]:
base_url = 'https://letterboxd.com/'
account_name = 'JoshLinneburg'
ratings_url = base_url + account_name + '/films/ratings/by/rating/'

Let's start by scraping the webpage of interest: films by rating on my account.

In [3]:
def get_html_soup(url):
    response = requests.get(url).text
    soup = bs(response)
    return soup

In [4]:
soup = get_html_soup(ratings_url)

First thing we're going to do: Figure out how to itemize the films that have been rated *on a single page*.

Start by splitting the page up into a list of movies.

In [5]:
movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')

Now to extract the stars and the URL from this HTML.

In [62]:
# Let's only do the first 3 in our print() 
movies_to_show = min(len(movies_on_page), 3)

for i in range(movies_to_show):
    
    # Might not use this later, but let's grab the name of the movie's image (which seems to be the movie name itself)
    movie_name = movies_on_page[i].find('img')['alt'].strip()
    
    # Grab the movie_url
    movie_url = movies_on_page[i].find('div', class_='poster film-poster really-lazy-load')['data-target-link']
    
    # Grab the raw star rating of the movie
    user_movie_star_rating = movies_on_page[i].find('p').find('span').text.strip()
    
    # Convert the star rating to a numeric representation
    # Note "\" is used to breakup a statement into multiple lines
    user_movie_nbr_rating = sum([float(user_movie_star_rating[i].replace('★', '1.0').replace('½', '0.5'))\
                                 for i in range(len(user_movie_star_rating))])
    
    # Print the results
    print('User {0} gave {1}, which can be accessed here: {2}, a rating of {3} which translates to {4} on a numeric scale'\
          .format(account_name, movie_name, movie_url, user_movie_star_rating, user_movie_nbr_rating))
    
    # If we're at the last one, just print out a statement saying there's more we didn't show. 
    # We do minus 1 because our length goes to the value of 3 but list indexing in Python starts at 0
    # So our "i" variable goes 0, 1, 2 for a total of 3 (our length)
    if i == movies_to_show - 1:
        print('\nRemaining results hidden, you get the idea. \n...')

User JoshLinneburg gave The Empire Strikes Back, which can be accessed here: /film/the-empire-strikes-back/, a rating of ★★★★★ which translates to 5.0 on a numeric scale
User JoshLinneburg gave The Dark Knight, which can be accessed here: /film/the-dark-knight/, a rating of ★★★★★ which translates to 5.0 on a numeric scale
User JoshLinneburg gave The Shawshank Redemption, which can be accessed here: /film/the-shawshank-redemption/, a rating of ★★★★★ which translates to 5.0 on a numeric scale

Remaining results hidden, you get the idea. 
...


In [7]:
def get_user_movie_rating(html_soup):
    
    '''
    
    Parses the HTML of a given film on https://letterboxd.com/YOURACCOUNTHERE/films/ratings/ and returns the rating
    the user in question gave the film.
    
    Parameters:
        html_soup (bs4 BeautifulSoup): BeautifulSoup representation of a given movie on a user's '/films/ratings/' page
        This should be a single item in the list soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
        
    Returns:
        movie_rating (float): Numeric representation of the star value assigned to a movie by a given user. 
        
    Example:
        url = 'https://letterboxd.com/joshlinneburg/films/ratings/' # URL string
        soup = get_html_soup(url) # Parsed HTML for the URL string
        movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li') # List of 'li' classes on the page
        get_user_movie_rating(movies_on_page[0]) # Single item in the list
        
        ...
        
        movie_rating = 5.0
    
    '''
    
    raw_movie_rating = html_soup.find('p').find('span').text.strip()
    movie_rating = sum([float(raw_movie_rating[i].replace('★', '1.0').replace('½', '0.5'))\
                        for i in range(len(raw_movie_rating))])
    return movie_rating

In [8]:
get_user_movie_rating(movies_on_page[0])

5.0

In [64]:
def get_movie_url(html_soup):
    
    '''
    
    Parses the HTML of a given film on https://letterboxd.com/YOURACCOUNTHERE/films/ratings/ and returns the URL
    of the movie. 
    
    Parameters:
        html_soup (bs4 BeautifulSoup): BeautifulSoup representation of a given movie on a user's '/films/ratings' page
        This should be a single item in the list soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
    
    Returns:
        movie_url (str): Endpoint URL of a movie on letterboxd.com
        Note: The URL does not contain the base_url ('https://letterboxd.com') and only begins at the /film/ endpoint. 
    
    Example:
        url = 'https://letterboxd.com/joshlinneburg/films/ratings/' # URL string
        soup = get_html_soup(url) # Parsed HTML for the URL string
        movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li') # List of 'li' classes on the page
        get_movie_url(movies_on_page[0]) # Single item in the list
        
        ...
        
        movie_url = /film/the-empire-strikes-back/
    
    '''
    
    movie_url = html_soup.find('div', class_='poster film-poster really-lazy-load')['data-target-link']
    return movie_url

In [10]:
get_movie_url(movies_on_page[0])

'/film/the-empire-strikes-back/'

Next up: Let's start scraping from a movie's page on here so we can access that detailed info about the film. Let's start with attributes that are on the Letterboxd website (title, actors, director, audience rating, genres) and then we can get crazy with items we'll need to scrape IMDb for (box office, release date, awards, etc.).

We'll use *The Fellowship of the Ring* as our working example here.

In [11]:
lotr_fotr_url = base_url + get_movie_url(movies_on_page[4])
lotr_fotr_url

'https://letterboxd.com//film/the-lord-of-the-rings-the-fellowship-of-the-ring/'

Next up: scraping some data about the crew on a film. 

In [12]:
def get_crew_data(movie_url):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns a nested dictionary containing data about the crewmembers
    that worked on the movie. 
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        Note: Must be the full path of the URL including 'https://letterboxd.com/'
        
    Returns:
        crew_dict (dict): Nested dictionary with data about the crew on a movie from letterboxd.com.
        
    Example: 
        movie_url = 'https://letterboxd.com/film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
        get_crew_data(movie_url)
        
        ...
        
        Output:
        {'total_crew': 73,
         'total_unique_crew': 70,
         'crew_list': [
                       {'crew_role': 'director',
                        'number_assigned': 1,
                        'crew_attributes': [
                                            {'name': 'Peter Jackson',
                                             'url': '/director/peter-jackson/'}
                                           ]
                       }, more entries in crew_list ... 
                      ]
        }
        
    
    '''
    
    soup = get_html_soup(movie_url)
    
    # Categories (Director, Producers, Writers, etc.) of crewmembers available
    crew_roles_avail = soup.find('div', class_='tabbed-content-block column-block').findAll('span')

    # Same information in a list without HTML tags
    crew_roles_avail_list = [tag.text.lower() for tag in crew_roles_avail]

    # Init the crew list
    crew_dict = {}
    crew_list = []
    total_crew_counter = 0
    crew_names_list = []
    unique_crew_names = set()

    for i in range(len(crew_roles_avail_list)):
        crew_role_dict = {}

        # Role is just whatever role we're on 
        crew_role_dict['crew_role'] = crew_roles_avail_list[i]

        # Find the list of names and URLs for a given role
        crew_attributes_list = soup.find('div', class_='tabbed-content-block column-block').findAll('div')[i].findAll('a')

        # Iterate on our counter
        total_crew_counter += len(crew_attributes_list)

        # How many people are assigned to this role
        crew_role_dict['number_assigned'] = len(crew_attributes_list)

        # Init an "inner" list - a list of dictionaries containing the name and URL of each crewmember
        inner_crew_list = [] 

        for i in range(len(crew_attributes_list)):
            inner_crew_dict = {}

            # Crewmember name
            inner_crew_dict['name'] = crew_attributes_list[i].text
            crew_names_list.append(crew_attributes_list[i].text)

            # Crewmember URL
            inner_crew_dict['url'] = crew_attributes_list[i]['href']

            # Append this inner list to the inner dictionary
            inner_crew_list.append(inner_crew_dict)

        # Add our completed inner list to the outer dictionary
        crew_role_dict['crew_attributes'] = inner_crew_list

        # Append to our growing list
        crew_list.append(crew_role_dict)

    # Gets the total number of crewmembers listed
    crew_dict['total_crew'] = total_crew_counter

    # Gets the total number of unique crewmembers listed
    for name in crew_names_list:
        if name not in unique_crew_names:
            unique_crew_names.add(name)

    crew_dict['total_unique_crew'] = len(unique_crew_names)

    crew_dict['crew_list'] = crew_list
    
    return crew_dict

In [13]:
crew_dict = get_crew_data(lotr_fotr_url)
crew_dict

{'total_crew': 73,
 'total_unique_crew': 70,
 'crew_list': [{'crew_role': 'director',
   'number_assigned': 1,
   'crew_attributes': [{'name': 'Peter Jackson',
     'url': '/director/peter-jackson/'}]},
  {'crew_role': 'producers',
   'number_assigned': 9,
   'crew_attributes': [{'name': 'Barrie M. Osborne',
     'url': '/producer/barrie-m-osborne/'},
    {'name': 'Peter Jackson', 'url': '/producer/peter-jackson/'},
    {'name': 'Bob Weinstein', 'url': '/producer/bob-weinstein/'},
    {'name': 'Harvey Weinstein', 'url': '/producer/harvey-weinstein/'},
    {'name': 'Mark Ordesky', 'url': '/producer/mark-ordesky/'},
    {'name': 'Michael Lynne', 'url': '/producer/michael-lynne/'},
    {'name': 'Fran Walsh', 'url': '/producer/fran-walsh/'},
    {'name': 'Robert Shaye', 'url': '/producer/robert-shaye/'},
    {'name': 'Tim Sanders', 'url': '/producer/tim-sanders/'}]},
  {'crew_role': 'writers',
   'number_assigned': 4,
   'crew_attributes': [{'name': 'Peter Jackson',
     'url': '/writer/pe

Let's grab more information about the movie... How about the cast?

In [14]:
def get_cast_data(movie_url, max_cast_to_return = 15):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns a nested dictionary containing data about the cast
    that played in the movie. 
    
    Parameters:
        movie_url (str): URL to a film on letterboxd.com.
        Note: Must be the full path of the URL including 'https://letterboxd.com/'
        
        max_cast_to_return: Max number of cast members to include in output. Default is 15. 
        
    Returns:
        cast_dict (dict): Nested dictionary which contains data about the cast of a movie on Letterboxd.com
        
    Example: 
        movie_url = 'https://letterboxd.com/film/the-lord-of-the-rings-the-fellowship-of-the-ring/'
        get_cast_data(movie_url)
        
        ...
        
        Output: 
        {'total_cast': 80,
         'cast_list': [
                       {'actor_seq_nbr': 0,
                        'actor_name': 'Elijah Wood',
                        'actor_role': 'Frodo Baggins',
                        'actor_url': '/actor/elijah-wood/'
                       },

                       {'actor_seq_nbr': 1,
                        'actor_name': 'Ian McKellen',
                        'actor_role': 'Gandalf the Grey',
                        'actor_url': '/actor/ian-mckellen/'
                       }, more entries in cast_list ...
                      ]
        }    
    
    '''
    
    soup = get_html_soup(movie_url)
    
    available_cast = soup.find('div', class_='cast-list text-sluglist').findAll('a', class_='text-slug tooltip')

    cast_dict = {}
    cast_list = []
    cast_names_list = []
    unique_cast_names = set()

    for i in range(min(len(available_cast), max_cast_to_return)):
        cast_attr_dict = {}
        cast_attr_dict['actor_seq_nbr'] = i
        cast_attr_dict['actor_name'] = available_cast[i].text
        cast_attr_dict['actor_role'] = available_cast[i]['title']
        cast_attr_dict['actor_url'] = available_cast[i]['href']
        unique_cast_names.add(available_cast[i].text)
        cast_list.append(cast_attr_dict)

    # Total number of castmembers
    cast_dict['total_cast'] = len(available_cast)

    # List of castmembers
    cast_dict['cast_list'] = cast_list
    
    return cast_dict

In [15]:
cast_dict = get_cast_data(lotr_fotr_url)
cast_dict

{'total_cast': 80,
 'cast_list': [{'actor_seq_nbr': 0,
   'actor_name': 'Elijah Wood',
   'actor_role': 'Frodo Baggins',
   'actor_url': '/actor/elijah-wood/'},
  {'actor_seq_nbr': 1,
   'actor_name': 'Ian McKellen',
   'actor_role': 'Gandalf the Grey',
   'actor_url': '/actor/ian-mckellen/'},
  {'actor_seq_nbr': 2,
   'actor_name': 'Viggo Mortensen',
   'actor_role': 'Aragorn',
   'actor_url': '/actor/viggo-mortensen/'},
  {'actor_seq_nbr': 3,
   'actor_name': 'Sean Astin',
   'actor_role': 'Samwise "Sam" Gamgee',
   'actor_url': '/actor/sean-astin/'},
  {'actor_seq_nbr': 4,
   'actor_name': 'Liv Tyler',
   'actor_role': 'Arwen Evenstar',
   'actor_url': '/actor/liv-tyler/'},
  {'actor_seq_nbr': 5,
   'actor_name': 'Orlando Bloom',
   'actor_role': 'Legolas',
   'actor_url': '/actor/orlando-bloom/'},
  {'actor_seq_nbr': 6,
   'actor_name': 'John Rhys-Davies',
   'actor_role': 'Gimli',
   'actor_url': '/actor/john-rhys-davies/'},
  {'actor_seq_nbr': 7,
   'actor_name': 'Dominic Monagha

Let's get the title next and then the average rating among all reviewers. We can also grab the movie's description while we're at it too.

In [16]:
def get_movie_title(movie_url):
    
    '''
    Given a URL to a movie on letterboxd.com, this function returns the title of the movie.
    
    
    '''
    
    soup = get_html_soup(movie_url)
    
    movie_title = soup.find('h1', class_='headline-1 js-widont prettify').text.strip()
    return movie_title

In [17]:
movie_title = get_movie_title(lotr_fotr_url)
movie_title

'The Lord of the Rings: The Fellowship of the Ring'

In [18]:
def get_movie_avg_rating(movie_url):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns the average rating of the movie among all users.
    
    
    '''
    soup = get_html_soup(movie_url)
    soup = bs(requests.get(base_url + soup.find('aside', class_='sidebar').find('div', class_='js-csi')['data-src']).text)
    
    movie_avg_rating = soup.find('span', class_='average-rating').find('a').text
    return movie_avg_rating

In [19]:
movie_avg_rating = get_movie_avg_rating(lotr_fotr_url)
movie_avg_rating

'4.3'

In [20]:
def get_movie_desc(movie_url):
    
    '''
    
    Given a URL to a movie on letterboxd.com, this function returns the description of the movie.
    
    '''
    
    soup = get_html_soup(movie_url)
    
    movie_desc = soup.findAll('meta')[3]['content'].strip()
    return movie_desc

In [21]:
movie_desc = get_movie_desc(lotr_fotr_url)
movie_desc

'Young hobbit Frodo Baggins, after inheriting a mysterious ring from his uncle Bilbo, must leave his home in order to keep it from falling into the hands of its evil creator. Along the way, a fellowship is formed to protect the ringbearer and make sure that the ring arrives at its final destination: Mt. Doom, the only place where it can be destroyed.'

Okay, now that we've got a good selection of information about *The Fellowship of the Ring* let's put it together in one large dictionary. 

Then we can get started on scraping IMDb for the last few attributes we'll want:
* The movie's budget and revenue
* The movie's release date

Additionally, we're going to want to scrape the movie's genre(s) from either Letterboxd or IMDb, I'm not sure which one just yet though. 

In [22]:
movie = {}
movie['title'] = movie_title
movie['description'] = movie_desc
movie['avg_rating'] = movie_avg_rating
movie['cast'] = cast_dict
movie['crew'] = crew_dict

In [23]:
movie

{'title': 'The Lord of the Rings: The Fellowship of the Ring',
 'description': 'Young hobbit Frodo Baggins, after inheriting a mysterious ring from his uncle Bilbo, must leave his home in order to keep it from falling into the hands of its evil creator. Along the way, a fellowship is formed to protect the ringbearer and make sure that the ring arrives at its final destination: Mt. Doom, the only place where it can be destroyed.',
 'avg_rating': '4.3',
 'cast': {'total_cast': 80,
  'cast_list': [{'actor_seq_nbr': 0,
    'actor_name': 'Elijah Wood',
    'actor_role': 'Frodo Baggins',
    'actor_url': '/actor/elijah-wood/'},
   {'actor_seq_nbr': 1,
    'actor_name': 'Ian McKellen',
    'actor_role': 'Gandalf the Grey',
    'actor_url': '/actor/ian-mckellen/'},
   {'actor_seq_nbr': 2,
    'actor_name': 'Viggo Mortensen',
    'actor_role': 'Aragorn',
    'actor_url': '/actor/viggo-mortensen/'},
   {'actor_seq_nbr': 3,
    'actor_name': 'Sean Astin',
    'actor_role': 'Samwise "Sam" Gamgee',

Now, let's put everything together into a *bigger* dictionary and add a loop to scrape every film on the page!

In [24]:
get_movie_desc(base_url + get_movie_url(movies_on_page[0]))

'The epic saga continues as Luke Skywalker, in hopes of defeating the evil Galactic Empire, learns the ways of the Jedi from aging master Yoda. But Darth Vader is more determined than ever to capture Luke. Meanwhile, rebel leader Princess Leia, cocky Han Solo, Chewbacca, and droids C-3PO and R2-D2 are thrown into various stages of capture, betrayal and despair.'

In [25]:
def scrape_page_ratings(rating_page_url):
    
    '''
    
    Given a URL to a page of user ratings on letterboxd.com, this function returns a list of nested dictionaries
    which data about each movie rated on that specific page. 
    
    
    '''

    soup = get_html_soup(rating_page_url)
    base_url = 'https://letterboxd.com/'
    
    page_ratings_list = []
    
    try:
        movies_on_page = soup.findAll('ul', class_='poster-list -p150 -grid')[0].findAll('li')
    except IndexError:
        print('No movie ratings to scrape.')
        return None

    for i in range(len(movies_on_page)):

        # New dictionary
        movie_dict = {}

        # Get the URL for the movie
        movie_url = base_url + get_movie_url(movies_on_page[i])
        
        # Get the user's rating for the movie
        movie_user_rating = get_user_movie_rating(movies_on_page[i])
        
        # Get the HTML soup for the movie
        movie_soup = get_html_soup(movie_url)

        # Attributes from the movie page or already stored in variables
        movie_dict['title'] = get_movie_title(movie_url)
        movie_dict['url'] = movie_url
        movie_dict['description'] = get_movie_desc(movie_url)
        movie_dict['avg_rating'] = get_movie_avg_rating(movie_url)
        movie_dict['user_rating'] = movie_user_rating
        movie_dict['cast'] = get_cast_data(movie_url)
        movie_dict['crew'] = get_crew_data(movie_url)
        page_ratings_list.append(movie_dict)

    return page_ratings_list

In [26]:
def get_pages_to_scrape(account_name):
    
    '''
    
    Given an account name on letterboxd.com, this function returns the number of pages of movie ratings there are to
    scrape through for that account. 
    
    
    '''
    
    base_url = 'https://letterboxd.com/'
    account_name = account_name.lower().strip()
    
    # Everyone has at least one page, even if they have no ratingns
    ratings_url = base_url + account_name.lower() + '/films/ratings/by/rating/page/1'
    soup = get_html_soup(ratings_url)
    pages_to_scrape = 1

    try:
        paginate_pages = soup.findAll('div', class_='paginate-pages')[0]

    except IndexError:
        return pages_to_scrape

    pages_to_scrape = int(paginate_pages.findAll('li')[-1].text)
    return pages_to_scrape

In [65]:
def scrape_account_ratings(account_to_scrape):

    '''
    
    Given an account name on letterboxd.com, this function returns a dictionary which contains data about all the films
    the user has rated. 
    
    The dictionary returned is nested - each movie rated is an entry in a list and each entry in the list itself contains 
    a multi-level data structure to include cast, crew, the moving's average rating, etc.
    
    
    '''
    
    
    movie_ratings = {}
    movies_by_page_list = []
    movies_rated_list = []

    account_name = account_to_scrape.lower().strip()
    base_url = 'https://letterboxd.com/'
    account_url = base_url + account_name
    account_ratings_url = account_url + '/films/ratings/by/rating/'
    
    movie_ratings['account'] = account_name
    movie_ratings['account_url'] = account_url
    movie_ratings['account_ratings_url'] = account_ratings_url
    
    pages_to_scrape = get_pages_to_scrape(account_name)
    
    
    for i in range(pages_to_scrape):
        inner_dict = {}
        
        page_number = str(i+1)
    
        ratings_url = base_url + account_name + '/films/ratings/by/rating/page/'+page_number
        page_ratings = scrape_page_ratings(ratings_url)
        
        inner_dict['page'] = page_number
        inner_dict['movies_on_page'] = page_ratings
        
        movies_by_page_list.append(inner_dict)

    for i in range(len(movies_by_page_list)):
        for j in range(len(movies_by_page_list[i]['movies_on_page'])):
            movies_rated_list.append(movies_by_page_list[i]['movies_on_page'][j])    
        
    movie_ratings['movies_rated'] = movies_rated_list
    
    return movie_ratings     

In [28]:
linneburg_ratings = scrape_account_ratings('joshlinneburg') 

In [29]:
linneburg_ratings

{'account': 'joshlinneburg',
 'account_url': 'https://letterboxd.com/joshlinneburg',
 'account_ratings_url': 'https://letterboxd.com/joshlinneburg/films/ratings/by/rating/',
 'movies_rated': [{'title': 'The Empire Strikes Back',
   'url': 'https://letterboxd.com//film/the-empire-strikes-back/',
   'description': 'The epic saga continues as Luke Skywalker, in hopes of defeating the evil Galactic Empire, learns the ways of the Jedi from aging master Yoda. But Darth Vader is more determined than ever to capture Luke. Meanwhile, rebel leader Princess Leia, cocky Han Solo, Chewbacca, and droids C-3PO and R2-D2 are thrown into various stages of capture, betrayal and despair.',
   'avg_rating': '4.4',
   'user_rating': 5.0,
   'cast': {'total_cast': 75,
    'cast_list': [{'actor_seq_nbr': 0,
      'actor_name': 'Mark Hamill',
      'actor_role': 'Luke Skywalker',
      'actor_url': '/actor/mark-hamill/'},
     {'actor_seq_nbr': 1,
      'actor_name': 'Harrison Ford',
      'actor_role': 'Han 